# Data Mining Practice Project

#### This project was written by:
- Kyrsti Fitts
- Shivani Merchant
- Kevin Reynolds
- Ryan Espejo

<br><br><br>
##### Step 1: Load the data
- For the first step of this project, we will load the data from the data files.
- The data will be loaded into a pandas data frame.
- The data frame will be the data structure that holds our data.
- Assign attribute (column) names to the data frames when loading.

In [64]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Column names for the data set; they are passed explicitly to read_csv via `names=`.
attribute_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'class',
]

# Read both splits into pandas data frames
# (replace with the actual paths to your data files).
training_data = pd.read_csv('adult/training_data.csv', names=attribute_names)
test_data = pd.read_csv('adult/test_data.csv', names=attribute_names)

# Record the number of rows and columns of each frame as loaded; the row
# counts are used later to report how many rows were removed.
training_rows_length = len(training_data.index)
training_columns_length = len(training_data.columns)
test_rows_length = len(test_data.index)
test_columns_length = len(test_data.columns)

print(f"Training Data Set: Read in {training_rows_length} rows and {training_columns_length} columns")
print(f"Test Data Set: Read in {test_rows_length} rows and {test_columns_length} columns")

Training Data Set: Read in 32561 rows and 15 columns
Test Data Set: Read in 16281 rows and 15 columns


<br><br>
##### Step 2: Remove rows containing unknown data
- Replace '?' characters with pandas 'NA' objects.
- Use the "dropna()" method to remove the rows.

In [65]:
# Unknown values appear in the data as ' ?' (with a leading space).
# Turn them into pandas NA markers and drop every row that contains one.
training_data = training_data.replace(' ?', pd.NA).dropna()
test_data = test_data.replace(' ?', pd.NA).dropna()

# Report how many rows were removed, relative to the row counts taken at load time.
print(f"Training Data Set: Removed {training_rows_length - training_data.shape[0]} rows containing unknown values")
print(f"Test Data Set: Removed {test_rows_length - test_data.shape[0]} rows containing unknown values")

Training Data Set: Removed 2399 rows containing unknown values
Test Data Set: Removed 1221 rows containing unknown values


##### Step 3: Remove all continuous attributes
- Continuous attributes: age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
- Drop these attributes from both pandas data frames

In [66]:
# Attributes with continuous domains; these are removed from both data frames.
continuous_attributes = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

print("Training dataset columns before continuous attribute deletion")
print(training_data.columns)
print("Test dataset columns before continuous attribute deletion")
print(test_data.columns)

# Drop all continuous attributes in a single in-place call per frame
# instead of one drop() call per column.
for frame in (training_data, test_data):
    frame.drop(columns=continuous_attributes, inplace=True)

print("Training dataset columns after continuous atribute deletion")
print(training_data.columns)
print("Test dataset columns after continuous atribute deletion")
print(test_data.columns)


Training dataset columns before continuous attribute deletion
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')
Test dataset columns before continuous attribute deletion
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')
Training dataset columns after continuous atribute deletion
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'class'],
      dtype='object')
Test dataset columns after continuous atribute deletion
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'nativ

##### Step 4: Use one-hot encoding to transform the data for each multi-domain categorical attribute
- Use scikit-learn to one-hot encode all of the categorical data into numerical data so that it can be used by the algorithms in the next steps.
- This step is performed on the training data.

In [67]:
# Print the head of the training data as it looks before encoding.
print(training_data.head())

# Attributes to encode, and the name of the integer-code column derived from each.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'class',
]
code_columns = [f'{column}_new' for column in categorical_columns]

# Convert each attribute to the pandas 'category' dtype and store its
# integer category code in a parallel "<attribute>_new" column.
for column, code_column in zip(categorical_columns, code_columns):
    training_data[column] = training_data[column].astype('category')
    training_data[code_column] = training_data[column].cat.codes

# Create the one-hot encoder.
enc = OneHotEncoder()

# One-hot encode the integer codes. The encoded frame must carry
# training_data's own index: rows were removed earlier with dropna(), so that
# index has gaps, and join() aligns on index labels. With a default 0..n-1
# index the one-hot rows would be attached to the wrong records and the rows
# past the end of the default index would be filled with NaN.
enc_data = pd.DataFrame(
    enc.fit_transform(training_data[code_columns]).toarray(),
    index=training_data.index,
)
training_data_new = training_data.join(enc_data)

# Remove the original text-valued columns from training_data, leaving only the
# integer-code columns. training_data_new was built before this drop and
# therefore still contains the original columns alongside the one-hot columns.
training_data.drop(columns=categorical_columns, inplace=True)

# Print the head of the data with the categorical values replaced by numeric codes.
print(training_data.head())

           workclass   education       marital-status          occupation  \
0          State-gov   Bachelors        Never-married        Adm-clerical   
1   Self-emp-not-inc   Bachelors   Married-civ-spouse     Exec-managerial   
2            Private     HS-grad             Divorced   Handlers-cleaners   
3            Private        11th   Married-civ-spouse   Handlers-cleaners   
4            Private   Bachelors   Married-civ-spouse      Prof-specialty   

     relationship    race      sex  native-country   class  
0   Not-in-family   White     Male   United-States   <=50K  
1         Husband   White     Male   United-States   <=50K  
2   Not-in-family   White     Male   United-States   <=50K  
3         Husband   Black     Male   United-States   <=50K  
4            Wife   Black   Female            Cuba   <=50K  
   workclass_new  education_new  marital-status_new  occupation_new  \
0              5              9                   4               0   
1              4             

##### Step 4, Part 2:
- Repeat Step 4 on the test data (`test_data`).

In [68]:
# Print the head of the test data as it looks before encoding.
print(test_data.head())

# Attributes to encode, and the name of the integer-code column derived from each.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'class',
]
code_columns = [f'{column}_new' for column in categorical_columns]

# Convert each attribute to the pandas 'category' dtype and store its
# integer category code in a parallel "<attribute>_new" column.
# NOTE(review): the codes are derived from the test data alone and a fresh
# encoder is fitted below, so the codes / one-hot columns only line up with
# the training encoding if both splits contain exactly the same category
# values -- confirm before comparing or feeding both to one model. The test
# 'class' labels are also printed with a trailing '.' (e.g. ' <=50K.').
for column, code_column in zip(categorical_columns, code_columns):
    test_data[column] = test_data[column].astype('category')
    test_data[code_column] = test_data[column].cat.codes

# Create the one-hot encoder.
enc = OneHotEncoder()

# One-hot encode the integer codes. The encoded frame must carry test_data's
# own index: rows were removed earlier with dropna(), so that index has gaps,
# and join() aligns on index labels. With a default 0..n-1 index the one-hot
# rows would be attached to the wrong records and the rows past the end of
# the default index would be filled with NaN.
enc_data = pd.DataFrame(
    enc.fit_transform(test_data[code_columns]).toarray(),
    index=test_data.index,
)
test_data_new = test_data.join(enc_data)

# Remove the original text-valued columns from test_data, leaving only the
# integer-code columns. test_data_new was built before this drop and
# therefore still contains the original columns alongside the one-hot columns.
test_data.drop(columns=categorical_columns, inplace=True)

# Print the head of the data with the categorical values replaced by numeric codes.
print(test_data.head())

    workclass      education       marital-status          occupation  \
0     Private           11th        Never-married   Machine-op-inspct   
1     Private        HS-grad   Married-civ-spouse     Farming-fishing   
2   Local-gov     Assoc-acdm   Married-civ-spouse     Protective-serv   
3     Private   Some-college   Married-civ-spouse   Machine-op-inspct   
5     Private           10th        Never-married       Other-service   

     relationship    race    sex  native-country    class  
0       Own-child   Black   Male   United-States   <=50K.  
1         Husband   White   Male   United-States   <=50K.  
2         Husband   White   Male   United-States    >50K.  
3         Husband   Black   Male   United-States    >50K.  
5   Not-in-family   White   Male   United-States   <=50K.  
   workclass_new  education_new  marital-status_new  occupation_new  \
0              2              1                   4               6   
1              2             11                   2        