1. Take a dataset
2. Split it into 80:20 ratio
3. Find missing values in X_train(both numeric and categorical)
4. Remove those rows which ahs missing values from X_train
5. Drop rows from y_train for same indices of X_train
6. Perform label encoding 
7. Train the model
8. Test the model for both dirty and clean data
9. Check the accuracy and fairness

In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [94]:
population = pd.read_csv(r"C:\Users\sathv\OneDrive\Desktop\RDM\adult_csv.csv")

In [95]:
population.info()
population.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capitalgain     48842 non-null  int64 
 11  capitalloss     48842 non-null  int64 
 12  hoursperweek    48842 non-null  int64 
 13  native-country  47985 non-null  object
 14  class           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


Classified the training features and predicted feature

In [96]:
sample_1 = population

In [97]:
X= sample_1[["age","workclass","education-num","marital-status","occupation","relationship","race","sex","capitalgain","capitalloss","hoursperweek","native-country"]]
y = sample_1["class"]

Split the dataset into 70:20 ratio

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [99]:
X_train_dirty = X_train
X_test_dirty = X_test
y_train_dirty = y_train
y_test_dirty = y_test

In [100]:
X_test_dirty.isnull().sum()

age                 0
workclass         577
education-num       0
marital-status      0
occupation        578
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
native-country    180
dtype: int64

In [101]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [102]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

# Functions to get the columns by type,checking if there are any missing values in the row and deleting those rows.

In [103]:
def variable(data,x):
    cols = data.select_dtypes([x]).columns
    missing_values = []
    for i in range(len(cols)):
        if data[cols[i]].isnull().values.any():
            missing_values.append(cols[i])
    return(missing_values,list(cols))
    

In [104]:
def remove(data):
    new_data = data.dropna()
    only_na = data[~data.index.isin(new_data.index)]
    only_na = list(only_na.index)
    return(new_data,list(only_na))

Checking for X_train for type numeric

In [105]:
missing_values, cols = variable(X_train, np.number)

In [106]:
missing_values

[]

Checking for X_train for type category

In [107]:
missing_values, cat_cols = variable(X_train, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_cols = variable(X_train, np.object)


In [108]:
missing_values

['workclass', 'occupation', 'native-country']

In [109]:
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [110]:
X_train,only_na = remove(X_train)

In [111]:
X_train

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,Private,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States
31093,2,State-gov,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States
33814,1,Local-gov,8,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States
14500,4,Private,9,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States
23399,4,Self-emp-not-inc,9,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
6265,4,Self-emp-not-inc,4,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,1,United-States
44732,0,Private,9,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,2,United-States
38158,0,Private,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,2,United-States
860,0,Private,7,Never-married,Adm-clerical,Own-child,White,Female,0,0,0,United-States


# Drop rows from y_train to match the number of rows in both X_train and Y_train

In [112]:
def drop_rows(x,only_na):
    x = x.drop(index=only_na)
    return x  

In [113]:
y_train = drop_rows(y_train,only_na)

In [114]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
6265     <=50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 36199, dtype: object

Convert categorical variables to numerical variables 
1. Label Encoding

In [115]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [116]:
def label_encoding(data,cat_cols):
    if cat_cols is None:
        data= label_encoder.fit_transform(data)
        return data
        
    else:
        for i in range(len(cat_cols)):
            data[cat_cols[i]]= label_encoder.fit_transform(data[cat_cols[i]])
        

# Label encoding for X_train

In [117]:
label_encoding(X_train,cat_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[cat_cols[i]]= label_encoder.fit_transform(data[cat_cols[i]])


In [118]:
X_train

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,2,9,2,13,0,4,1,0,0,2,37
31093,2,5,14,2,9,0,4,1,0,0,2,37
33814,1,1,8,2,4,0,4,1,0,0,2,37
14500,4,2,9,0,2,4,4,1,0,0,0,37
23399,4,4,9,6,11,1,4,1,0,0,4,37
...,...,...,...,...,...,...,...,...,...,...,...,...
6265,4,4,4,2,4,0,4,1,0,0,1,37
44732,0,2,9,4,6,3,4,0,0,0,2,37
38158,0,2,9,0,0,1,4,0,0,0,2,37
860,0,2,7,4,0,3,4,0,0,0,0,37


In [119]:
X_train["sex"]

37193    1
31093    1
33814    1
14500    1
23399    1
        ..
6265     1
44732    0
38158    0
860      0
15795    1
Name: sex, Length: 36199, dtype: int32

# Label encoding for y_train

In [120]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
6265     <=50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 36199, dtype: object

In [121]:
# col = None
y_train = label_encoding(y_train, None)

In [122]:
y_train

array([0, 1, 0, ..., 0, 0, 0])

# Label encoding for X_test

In [123]:
label_encoding(X_test, cat_cols)

In [124]:
X_test

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
7762,0,3,9,4,7,1,4,1,0,0,0,38
23881,0,3,8,4,11,3,4,0,0,0,0,38
30507,0,1,9,4,5,2,2,1,0,0,2,38
28911,0,3,10,4,11,3,4,0,0,0,1,38
19484,3,3,9,4,6,4,0,1,0,0,3,32
...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,5,14,5,11,4,4,0,0,0,2,38
18798,0,3,9,4,7,3,4,1,0,0,2,38
29519,2,3,10,0,13,1,2,1,0,0,3,38
550,3,5,10,0,3,1,4,1,0,0,3,38


# Label encoding for y_test

In [125]:
y_test= label_encoding(y_test, None)

In [126]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

# Train the model with train data

In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
logisticRegr = LogisticRegression()

In [128]:
def logistic_regression_train(logisticRegr,x,y):
    logisticRegr.fit(x,y)
def logistic_regression_test(logisticRegr,x):
    y_pred = logisticRegr.predict_proba(x)
    return y_pred
def logistic_regression_pred_test(logisticRegr,x):
    y_pred_test = logisticRegr.predict(x)
    return y_pred_test

def average_diff(y_pred, X_test):
    column_i = y_pred.T[0]
    X_test_copy = X_test.copy()
    X_test_copy['preds'] = column_i
    Female = X_test_copy["preds"][X_test_copy["sex"] == 0]
    female_average = sum(Female)/len(Female)
    male = X_test_copy["preds"][X_test_copy["sex"] == 1]
    male_average = sum(male)/len(male)
    diff = (female_average - male_average)
    return diff 

def accuracy(y_test, y_pred_final):
#     y_pred_final = np.argmax(y_pred, axis=1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_final).ravel()
    true_positive  = tn
    false_positive = fn
    true_negative  = tp
    false_negative = fp
    accuracy = (true_positive + true_negative) / (true_negative + true_positive + false_positive + false_negative )
    return(accuracy)
    
    

# Training the model

In [129]:
model = logistic_regression_train(logisticRegr, X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Check predictions on dirty test data

In [130]:
y_pred = logistic_regression_test(logisticRegr, X_test)

In [131]:
y_pred

array([[0.97917155, 0.02082845],
       [0.99542328, 0.00457672],
       [0.95353853, 0.04646147],
       ...,
       [0.67261535, 0.32738465],
       [0.59927653, 0.40072347],
       [0.83370887, 0.16629113]])

In [132]:
dirty_diff = average_diff(y_pred, X_test)

In [133]:
dirty_diff

0.1882626655935421

# Checking accuracy of the model

In [134]:
y_pred_test = logistic_regression_pred_test(logisticRegr, X_test)

In [135]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [136]:
accur = accuracy(y_test, y_pred_test)

In [137]:
accur

0.8264919643771113

In [138]:
dirty_test_score = logisticRegr.score(X_test, y_test)

In [139]:
dirty_test_score

0.8264919643771113

# Testing on clean test data

In [140]:
X_clean_train, X_clean_test, y_clean_train, y_clean_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [141]:
X_clean_train.isnull().sum()

age                  0
workclass         2222
education-num        0
marital-status       0
occupation        2231
relationship         0
race                 0
sex                  0
capitalgain          0
capitalloss          0
hoursperweek         0
native-country     677
dtype: int64

In [142]:
y_clean_train.isnull().sum()

0

In [143]:
X_clean_test.isnull().sum()

age                 0
workclass         577
education-num       0
marital-status      0
occupation        578
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
native-country    180
dtype: int64

In [144]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9192 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9191 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9589 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [145]:
y_clean_test.isnull().sum()

0

In [146]:
missing_values, cat_clean_cols = variable(X_clean_train, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_clean_cols = variable(X_clean_train, np.object)


In [147]:
missing_values

['workclass', 'occupation', 'native-country']

In [148]:
X_clean_train,only_na = remove(X_clean_train)

In [149]:
X_clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36199 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             36199 non-null  int64 
 1   workclass       36199 non-null  object
 2   education-num   36199 non-null  int64 
 3   marital-status  36199 non-null  object
 4   occupation      36199 non-null  object
 5   relationship    36199 non-null  object
 6   race            36199 non-null  object
 7   sex             36199 non-null  object
 8   capitalgain     36199 non-null  int64 
 9   capitalloss     36199 non-null  int64 
 10  hoursperweek    36199 non-null  int64 
 11  native-country  36199 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.6+ MB


In [150]:
def drop_rows(x,only_na):
    x = x.drop(index=only_na)
    return x  

In [151]:
y_clean_train = drop_rows(y_clean_train,only_na)

In [152]:
y_clean_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
6265     <=50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 36199, dtype: object

In [153]:
missing_values_test, cat_clean_cols_test = variable(X_clean_test, np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values_test, cat_clean_cols_test = variable(X_clean_test, np.object)


In [154]:
missing_values_test

['workclass', 'occupation', 'native-country']

In [155]:
# X_clean_test,only_na_test = remove(X_clean_test)

In [156]:
X_clean_test.index

Int64Index([ 7762, 23881, 30507, 28911, 19484, 43031, 28188, 12761, 40834,
            27875,
            ...
            42704,  9137, 29925, 19791,  4178, 43046, 18798, 29519,   550,
            14337],
           dtype='int64', length=9769)

In [157]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9192 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9191 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9589 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [158]:
new_data = X_clean_test.dropna()

In [159]:
new_data.index

Int64Index([ 7762, 23881, 30507, 28911, 19484, 43031, 28188, 12761, 40834,
            27875,
            ...
            42704,  9137, 29925, 19791,  4178, 43046, 18798, 29519,   550,
            14337],
           dtype='int64', length=9023)

In [160]:
only_na_test = X_clean_test[~X_clean_test.index.isin(new_data.index)]

In [161]:
only_na_test = list(only_na_test.index)

In [162]:
only_na_test

[36230,
 2372,
 24735,
 31220,
 20757,
 8871,
 2210,
 16399,
 43069,
 8225,
 25056,
 5803,
 42296,
 38656,
 13182,
 42906,
 21812,
 25726,
 3211,
 27522,
 38296,
 25588,
 35339,
 26800,
 40602,
 4215,
 32310,
 7096,
 29436,
 32213,
 4519,
 12936,
 27019,
 23918,
 40017,
 34154,
 38399,
 27133,
 33802,
 41823,
 25458,
 13370,
 8853,
 10342,
 1175,
 29981,
 6178,
 48118,
 16810,
 22910,
 12004,
 47895,
 10126,
 31101,
 5921,
 40845,
 46491,
 14203,
 13897,
 1900,
 43036,
 8297,
 18036,
 25303,
 30170,
 35463,
 1282,
 11734,
 21531,
 37459,
 42128,
 22281,
 13747,
 12627,
 45208,
 42925,
 4745,
 13095,
 18924,
 19788,
 4982,
 11300,
 3402,
 47988,
 26197,
 31605,
 18466,
 7284,
 47217,
 48357,
 47885,
 14441,
 33599,
 19490,
 40676,
 1704,
 22983,
 17537,
 32073,
 34392,
 17644,
 33201,
 41978,
 46785,
 34653,
 46545,
 9293,
 31790,
 47006,
 30042,
 32974,
 29662,
 41698,
 44309,
 725,
 18010,
 41314,
 14363,
 35526,
 10539,
 2213,
 484,
 38533,
 4884,
 12858,
 47226,
 1570,
 8322,
 27844

In [163]:
X_clean_test = new_data

In [164]:
y_clean_test = drop_rows(y_clean_test,only_na_test)

In [165]:
y_clean_test

7762     <=50K
23881    <=50K
30507    <=50K
28911    <=50K
19484    <=50K
         ...  
43046    <=50K
18798    <=50K
29519    <=50K
550      <=50K
14337     >50K
Name: class, Length: 9023, dtype: object

In [166]:
label_encoding(X_clean_train,cat_clean_cols)

In [167]:
X_clean_train.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [168]:
y_clean_train = label_encoding(y_clean_train, None)

In [169]:
y_clean_train

array([0, 1, 0, ..., 0, 0, 0])

In [170]:
label_encoding(X_clean_test, cat_clean_cols_test)

In [171]:
X_clean_test

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
7762,0,2,9,4,7,1,4,1,0,0,0,38
23881,0,2,8,4,11,3,4,0,0,0,0,38
30507,0,1,9,4,5,2,2,1,0,0,2,38
28911,0,2,10,4,11,3,4,0,0,0,1,38
19484,3,2,9,4,6,4,0,1,0,0,3,32
...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,4,14,5,11,4,4,0,0,0,2,38
18798,0,2,9,4,7,3,4,1,0,0,2,38
29519,2,2,10,0,13,1,2,1,0,0,3,38
550,3,4,10,0,3,1,4,1,0,0,3,38


In [172]:
y_clean_test= label_encoding(y_clean_test, None)

In [173]:
y_clean_test

array([0, 0, 0, ..., 0, 0, 1])

In [174]:
clean_model = logistic_regression_train(logisticRegr, X_clean_train, y_clean_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [175]:
y_clean_pred = logistic_regression_test(logisticRegr, X_clean_test)

In [176]:
X_clean_test.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [177]:
y_clean_pred

array([[0.97633601, 0.02366399],
       [0.99478844, 0.00521156],
       [0.95353853, 0.04646147],
       ...,
       [0.64325112, 0.35674888],
       [0.56756342, 0.43243658],
       [0.81481556, 0.18518444]])

In [178]:
clean_diff = average_diff(y_clean_pred, X_clean_test)

In [179]:
clean_diff

0.2013134887208624

In [180]:
clean_test_score = logisticRegr.score(X_clean_test, y_clean_test)

In [181]:
clean_test_score

0.8204588274409842

In [182]:
from tabulate import tabulate
data = [["Removing Missing Values","Dirty test data","Label Encoding", "Logistic Regression", dirty_test_score, dirty_diff],
["Removing Missing Values", "Clean test data", "Label Encoding", "Logistic Regression", clean_test_score, clean_diff]]
print (tabulate(data, headers=["Method", "Test Data", "Encoding Technique", "ML Model", "Accuracy", "Fairness"]))

Method                   Test Data        Encoding Technique    ML Model               Accuracy    Fairness
-----------------------  ---------------  --------------------  -------------------  ----------  ----------
Removing Missing Values  Dirty test data  Label Encoding        Logistic Regression    0.826492    0.188263
Removing Missing Values  Clean test data  Label Encoding        Logistic Regression    0.820459    0.201313
