In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [22]:
# Read the train and test splits from CSV files located in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_final.csv', 'test_final.csv')
)

In [23]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0


In [24]:
# Data cleaning: remove the 'education' column from both frames.
# NOTE(review): the frame also carries an 'education.num' column, which
# presumably encodes the same information numerically -- confirm.
#
# errors='ignore' makes this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError. The frames are
# rebound rather than mutated with inplace=True.
train = train.drop(columns='education', errors='ignore')
test = test.drop(columns='education', errors='ignore')

# Show the remaining column names to confirm the drop.
train.columns.tolist()

['age',
 'workclass',
 'fnlwgt',
 'education.num',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital.gain',
 'capital.loss',
 'hours.per.week',
 'native.country',
 'income>50K']

In [25]:
# Count missing (NaN/None) entries in each training column.
# Only true missing values are counted here; placeholder strings such as the
# '?' category that shows up in the value counts further down are not.
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income>50K        0
dtype: int64

In [26]:
# Drop fully duplicated rows (keeping the first occurrence) from each frame,
# printing the number of duplicates before and after.
print("Train data:")
print("Before removing duplicates:", train.duplicated(keep='first').sum())
train = train.drop_duplicates()
print("After removing duplicates:", train.duplicated(keep='first').sum())

# Same procedure for the test frame.
# NOTE(review): removing rows from a test split changes which rows receive
# predictions -- confirm this is intended.
print("Test data:")
print("Before removing duplicates:", test.duplicated(keep='first').sum())
test = test.drop_duplicates()
print("After removing duplicates:", test.duplicated(keep='first').sum())



Train data:
Before removing duplicates: 14
After removing duplicates: 0
Test data:
Before removing duplicates: 0
After removing duplicates: 0


In [28]:
# Trim leading/trailing whitespace from the string-valued columns listed
# below, in both the train and test frames.
columns = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]
for frame in (train, test):
    for column in columns:
        frame[column] = frame[column].str.strip()

In [32]:
# Frequency of each 'workclass' value in the training frame, most common first.
train.loc[:, 'workclass'].value_counts()

Private             16570
Self-emp-not-inc     1884
Local-gov            1519
?                    1362
State-gov             944
Self-emp-inc          850
Federal-gov           702
Without-pay             6
Never-worked            5
Name: workclass, dtype: int64

In [30]:
# Frequency of each 'marital.status' value in the training frame, most common first.
train.loc[:, 'marital.status'].value_counts()

Married-civ-spouse       11438
Never-married             8218
Divorced                  3411
Widowed                    837
Separated                  729
Married-spouse-absent      326
Married-AF-spouse           27
Name: marital.status, dtype: int64

In [31]:
# Frequency of each 'occupation' value in the training frame, most common first.
train.loc[:, 'occupation'].value_counts()

Prof-specialty       3202
Exec-managerial      3171
Craft-repair         3144
Adm-clerical         2826
Sales                2727
Other-service        2546
Machine-op-inspct    1580
?                    1441
Transport-moving     1192
Handlers-cleaners    1047
Tech-support          743
Farming-fishing       733
Protective-serv       506
Priv-house-serv       118
Armed-Forces           10
Name: occupation, dtype: int64

In [33]:
# Frequency of each 'workclass' value in the test frame, most common first.
# NOTE(review): the recorded output for this cell is identical to the train
# counts -- confirm that `test` was loaded from the intended file.
test.loc[:, 'workclass'].value_counts()

Private             16570
Self-emp-not-inc     1884
Local-gov            1519
?                    1362
State-gov             944
Self-emp-inc          850
Federal-gov           702
Without-pay             6
Never-worked            5
Name: workclass, dtype: int64

In [34]:
# Relabel the '?' placeholder as 'Unknown' in each listed column of both frames.
# The match is on the exact string '?', so values must already be free of
# surrounding whitespace for the replacement to apply.
columns = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]
for frame in (train, test):
    for column in columns:
        frame[column] = frame[column].replace(to_replace='?', value='Unknown')

In [None]:
# Feature exploration
