1. Take a dataset
2. Split it into 80:20 ratio
3. Find missing values in X_train(both numeric and categorical)
4. Remove those rows which ahs missing values from X_train
5. Drop rows from y_train for same indices of X_train
6. Perform label encoding 
7. Train the model
8. Test the model for both dirty and clean data
9. Check the accuracy and fairness

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
population = pd.read_csv(r"C:\Users\sathv\OneDrive\Desktop\RDM\adult_csv.csv")

In [3]:
population.info()
population.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capitalgain     48842 non-null  int64 
 11  capitalloss     48842 non-null  int64 
 12  hoursperweek    48842 non-null  int64 
 13  native-country  47985 non-null  object
 14  class           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


Classified the training features and predicted feature

In [4]:
sample_1 = population

In [5]:
X= sample_1[["age","workclass","education-num","marital-status","occupation","relationship","race","sex","capitalgain","capitalloss","hoursperweek","native-country"]]
y = sample_1["class"]

Split the dataset into 70:20 ratio

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [8]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

# Functions to get the columns by type,checking if there are any missing values in the row and deleting those rows.

In [9]:
def variable(data,x):
    cols = data.select_dtypes([x]).columns
    missing_values = []
    for i in range(len(cols)):
        if data[cols[i]].isnull().values.any():
            missing_values.append(cols[i])
    return(missing_values,list(cols))
    

Checking for X_train for type numeric

In [10]:
missing_values, cols = variable(X_train, np.number)

In [11]:
missing_values

[]

Checking for X_train for type category

In [12]:
missing_values, cat_cols = variable(X_train, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_cols = variable(X_train, np.object)


In [13]:
missing_values

['workclass', 'occupation', 'native-country']

In [14]:
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [15]:
X_train[missing_values].mode() 

Unnamed: 0,workclass,occupation,native-country
0,Private,Prof-specialty,United-States


In [16]:
X_train[missing_values] = X_train[missing_values].fillna(X_train[missing_values].mode().iloc[0])

In [17]:
X_train.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [18]:
len(X_train)

39073

In [19]:
y_train.isnull().sum()

0

In [20]:
len(y_train)

39073

Convert categorical variables to numerical variables 
1. One-Hot Encoding

# One-Hot encoding for X_train

In [21]:
X_train = pd.get_dummies(X_train, columns = ['workclass', 'relationship', 'marital-status', 'occupation', 'race', 'sex', 'native-country'])

In [22]:
X_train.head()

Unnamed: 0,age,education-num,capitalgain,capitalloss,hoursperweek,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
37193,1,9,0,0,2,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
31093,2,14,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
33814,1,8,0,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14500,4,9,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
23399,4,9,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# One-Hot encoding for y_train

In [23]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

In [24]:
y_train = y_train.astype('category')

In [25]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: category
Categories (2, object): ['<=50K', '>50K']

# One-Hot encoding for X_test

In [26]:
X_test = pd.get_dummies(X_test, columns = ['workclass', 'relationship', 'marital-status', 'occupation', 'race', 'sex', 'native-country'])

In [27]:
X_test

Unnamed: 0,age,education-num,capitalgain,capitalloss,hoursperweek,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
7762,0,9,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
23881,0,8,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
30507,0,9,0,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
28911,0,10,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
19484,3,9,0,0,3,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,14,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
18798,0,9,0,0,2,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
29519,2,10,0,0,3,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
550,3,10,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
X_test = X_test.drop(['native-country_Holand-Netherlands'], axis=1)

# One-Hot encoding for y_test

In [29]:
y_test = y_test.astype('category')

In [30]:
y_test

7762     <=50K
23881    <=50K
30507    <=50K
28911    <=50K
19484    <=50K
         ...  
43046    <=50K
18798    <=50K
29519    <=50K
550      <=50K
14337     >50K
Name: class, Length: 9769, dtype: category
Categories (2, object): ['<=50K', '>50K']

In [31]:
X_train.isnull().sum()

age                               0
education-num                     0
capitalgain                       0
capitalloss                       0
hoursperweek                      0
                                 ..
native-country_Thailand           0
native-country_Trinadad&Tobago    0
native-country_United-States      0
native-country_Vietnam            0
native-country_Yugoslavia         0
Length: 87, dtype: int64

# Train the model with train data

In [32]:
def logistic_regression_train(logisticRegr,x,y):
    logisticRegr.fit(x,y)
def logistic_regression_test(logisticRegr,x):
    y_pred = logisticRegr.predict_proba(x)
    return y_pred
def logistic_regression_pred_test(logisticRegr,x):
    y_pred_test = logisticRegr.predict(x)
    return y_pred_test

def average_diff(y_pred, X_test):
    column_i = y_pred.T[0]
    X_test_copy = X_test.copy()
    X_test_copy['preds'] = column_i
    Female = X_test_copy["preds"][X_test_copy["sex_Female"]== 1]
    female_average = sum(Female)/len(Female)
    male = X_test_copy["preds"][X_test_copy["sex_Male"] == 1]
    male_average = sum(male)/len(male)
    diff = (female_average - male_average)
    return diff 

def accuracy(y_test, y_pred_final):
#     y_pred_final = np.argmax(y_pred, axis=1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_final).ravel()
    true_positive  = tn
    false_positive = fn
    true_negative  = tp
    false_negative = fp
    accuracy = (true_positive + true_negative) / (true_negative + true_positive + false_positive + false_negative )
    return(accuracy)
    
    

In [33]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict_proba(X_test)

dirty_diff = average_diff(y_pred,X_test)
dirty_diff

y_pred_test = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score
dirty_test_score = accuracy_score(y_test,y_pred_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# Testing on clean test data

In [34]:
X_clean_train, X_clean_test, y_clean_train, y_clean_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [35]:
X_clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [36]:
y_clean_train.isnull().sum()

0

In [37]:
X_clean_test.isnull().sum()

age                 0
workclass         577
education-num       0
marital-status      0
occupation        578
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
native-country    180
dtype: int64

In [38]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9192 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9191 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9589 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [39]:
y_clean_test.isnull().sum()

0

In [40]:
missing_values, cat_clean_cols = variable(X_clean_train, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_clean_cols = variable(X_clean_train, np.object)


In [41]:
missing_values

['workclass', 'occupation', 'native-country']

In [42]:
X_clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [43]:
X_clean_train[missing_values] = X_clean_train[missing_values].fillna(X_clean_train[missing_values].mode().iloc[0])

In [44]:
X_clean_train

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,Private,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States
31093,2,State-gov,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States
33814,1,Local-gov,8,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States
14500,4,Private,9,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States
23399,4,Self-emp-not-inc,9,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,Private,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,2,3,United-States
44732,0,Private,9,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,2,United-States
38158,0,Private,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,2,United-States
860,0,Private,7,Never-married,Adm-clerical,Own-child,White,Female,0,0,0,United-States


In [45]:
y_clean_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

In [46]:
missing_values_test, cat_clean_cols_test = variable(X_clean_test, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values_test, cat_clean_cols_test = variable(X_clean_test, np.object)


In [47]:
missing_values_test

['workclass', 'occupation', 'native-country']

In [48]:
X_clean_test[missing_values_test] = X_clean_test[missing_values_test].fillna(X_clean_test[missing_values_test].mode().iloc[0])

In [49]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9769 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9769 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9769 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [50]:
y_clean_test

7762     <=50K
23881    <=50K
30507    <=50K
28911    <=50K
19484    <=50K
         ...  
43046    <=50K
18798    <=50K
29519    <=50K
550      <=50K
14337     >50K
Name: class, Length: 9769, dtype: object

In [51]:
X_clean_train = pd.get_dummies(X_clean_train, columns = ['workclass', 'relationship', 'marital-status', 'occupation', 'race', 'sex', 'native-country'])

In [52]:
y_clean_train = y_clean_train.astype('category')

In [53]:
X_clean_test = pd.get_dummies(X_clean_test, columns = ['workclass', 'relationship', 'marital-status', 'occupation', 'race', 'sex', 'native-country'])

In [54]:
X_clean_test = X_clean_test.drop(['native-country_Holand-Netherlands'], axis=1)

In [55]:
y_clean_test = y_clean_test.astype('category')

In [56]:
clean_model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
clean_model.fit(X_clean_train, y_clean_train)

y_clean_pred = classifier.predict_proba(X_clean_test)

clean_diff = average_diff(y_clean_pred,X_clean_test)
clean_diff

y_pred_clean_test = classifier.predict(X_test)

clean_test_score = accuracy_score(y_test,y_pred_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [57]:
from tabulate import tabulate
data = [["Impute Missing Values (Mode)","Dirty test data","One-Hot Encoding", "KNN", dirty_test_score, dirty_diff],
["Impute Missing Values (Mode) ", "Clean test data", "One-Hot Encoding", "KNN", clean_test_score, clean_diff]]
print (tabulate(data, headers=["Method", "Test Data", "Encoding Technique", "ML Model", "Accuracy", "Fairness"]))

Method                        Test Data        Encoding Technique    ML Model      Accuracy    Fairness
----------------------------  ---------------  --------------------  ----------  ----------  ----------
Impute Missing Values (Mode)  Dirty test data  One-Hot Encoding      KNN           0.835602    0.200903
Impute Missing Values (Mode)  Clean test data  One-Hot Encoding      KNN           0.835602    0.20023
