1. Take a dataset
2. Split it into 80:20 ratio
3. Find missing values in X_train(both numeric and categorical)
4. Remove those rows which have missing values from X_train
5. Drop rows from y_train for same indices of X_train
6. Perform label encoding 
7. Train the model
8. Test the model for both dirty and clean data
9. Check the accuracy and fairness

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# NOTE(review): absolute, machine-specific Windows path; the notebook cannot be re-run on another machine without editing this line.
population = pd.read_csv(r"C:\Users\sathv\OneDrive\Desktop\RDM\adult_csv.csv")

In [13]:
population.info()
population.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capitalgain     48842 non-null  int64 
 11  capitalloss     48842 non-null  int64 
 12  hoursperweek    48842 non-null  int64 
 13  native-country  47985 non-null  object
 14  class           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


Separate the training features (X) from the feature to be predicted (y)

In [14]:
sample_1 = population

In [15]:
X= sample_1[["age","workclass","education-num","marital-status","occupation","relationship","race","sex","capitalgain","capitalloss","hoursperweek","native-country"]]
y = sample_1["class"]

Split the dataset into an 80:20 ratio

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [18]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

# Function to get the columns of a given dtype and to report which of those columns contain missing values.

In [19]:
def variable(data,x):
    """Find the columns of a given dtype and flag those that contain nulls.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    x : dtype selector
        Forwarded (wrapped in a list) to ``DataFrame.select_dtypes``,
        e.g. ``np.number``.

    Returns
    -------
    tuple of (list, list)
        First the names of the selected columns holding at least one null
        value, then the names of every selected column.
    """
    selected = list(data.select_dtypes([x]).columns)
    with_nulls = [name for name in selected if data[name].isnull().values.any()]
    return with_nulls, selected
    

Check the numeric columns of X_train for missing values

In [20]:
missing_values, cols = variable(X_train, np.number)

In [21]:
missing_values

[]

Check the categorical columns of X_train for missing values

In [22]:
# Use the built-in `object`: the `np.object` alias is deprecated since NumPy 1.20 and selects the same columns.
missing_values, cat_cols = variable(X_train, object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_cols = variable(X_train, np.object)


In [23]:
missing_values

['workclass', 'occupation', 'native-country']

In [24]:
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [25]:
X_train[missing_values].mode() 

Unnamed: 0,workclass,occupation,native-country
0,Private,Prof-specialty,United-States


In [26]:
X_train[missing_values] = X_train[missing_values].fillna(X_train[missing_values].mode().iloc[0])

In [27]:
X_train.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [28]:
len(X_train)

39073

In [29]:
y_train.isnull().sum()

0

In [30]:
len(y_train)

39073

Convert categorical variables to numerical variables 
1. Label Encoding

In [31]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [32]:
def label_encoding(data,cat_cols):
    """Label-encode a target vector or selected columns of a frame.

    Uses the module-level ``label_encoder`` and calls ``fit_transform`` on
    every invocation.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Values to encode. When ``cat_cols`` is given, the named columns of
        ``data`` are overwritten in place.
    cat_cols : sequence of column names, or None
        Columns to encode. ``None`` means ``data`` itself is encoded.

    Returns
    -------
    The encoded values when ``cat_cols`` is None; otherwise None (the frame
    is modified in place).

    NOTE(review): because the encoder is refit on each call, the integer
    codes produced by separate calls (e.g. one for a training frame and one
    for a test frame) are not guaranteed to agree.
    """
    if cat_cols is not None:
        # Column mode: refit the shared encoder once per column, in place.
        for column in cat_cols:
            data[column] = label_encoder.fit_transform(data[column])
        return None

    # Vector mode: encode the whole input and hand back the result.
    return label_encoder.fit_transform(data)
        

# Label encoding for X_train

In [33]:
label_encoding(X_train,cat_cols)

In [34]:
X_train.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,3,9,2,13,0,4,1,0,0,2,37
31093,2,6,14,2,9,0,4,1,0,0,2,37
33814,1,1,8,2,4,0,4,1,0,0,2,37
14500,4,3,9,0,2,4,4,1,0,0,0,37
23399,4,5,9,6,11,1,4,1,0,0,4,37


# Label encoding for y_train

In [35]:
y_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

In [36]:
# col = None
y_train = label_encoding(y_train, None)

In [37]:
y_train

array([0, 1, 0, ..., 0, 0, 0])

# Label encoding for X_test

In [38]:
# NOTE(review): label_encoding calls fit_transform, so this refits the encoder on X_test;
# the integer codes are therefore not guaranteed to match those assigned to X_train
# (the displayed native-country codes are 37 in X_train.head() but mostly 38 here).
# TODO: confirm and reuse the training-time mapping for the test frame.
label_encoding(X_test, cat_cols)

In [39]:
X_test

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
7762,0,3,9,4,7,1,4,1,0,0,0,38
23881,0,3,8,4,11,3,4,0,0,0,0,38
30507,0,1,9,4,5,2,2,1,0,0,2,38
28911,0,3,10,4,11,3,4,0,0,0,1,38
19484,3,3,9,4,6,4,0,1,0,0,3,32
...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,5,14,5,11,4,4,0,0,0,2,38
18798,0,3,9,4,7,3,4,1,0,0,2,38
29519,2,3,10,0,13,1,2,1,0,0,3,38
550,3,5,10,0,3,1,4,1,0,0,3,38


# Label encoding for y_test

In [40]:
y_test= label_encoding(y_test, None)

In [41]:
y_test

array([0, 0, 0, ..., 0, 0, 1])

In [42]:
X_train.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

# Train the model with train data

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
logisticRegr = LogisticRegression()

In [44]:
def logistic_regression_train(logisticRegr,x,y):
    """Fit the given estimator on the training data and return it.

    Parameters
    ----------
    logisticRegr : estimator exposing ``fit(x, y)``
        Model to train; it is fitted in place.
    x : array-like or DataFrame
        Training features.
    y : array-like
        Training labels.

    Returns
    -------
    The same ``logisticRegr`` object, now fitted, so that callers writing
    ``model = logistic_regression_train(...)`` get a usable model rather
    than ``None``.
    """
    logisticRegr.fit(x,y)
    return logisticRegr
def logistic_regression_test(logisticRegr,x):
    """Return ``logisticRegr.predict_proba(x)`` for the given features."""
    return logisticRegr.predict_proba(x)
def logistic_regression_pred_test(logisticRegr,x):
    """Return ``logisticRegr.predict(x)``, i.e. the predicted labels for ``x``."""
    return logisticRegr.predict(x)

def average_diff(y_pred, X_test):
    """Difference in mean first-column score between the two ``sex`` groups.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Two-dimensional score array; only ``y_pred.T[0]`` (its first column)
        is used.
    X_test : pandas.DataFrame
        Frame with a ``sex`` column, row-aligned with ``y_pred``. It is
        copied, so the caller's frame is left untouched.

    Returns
    -------
    Mean score of rows with ``sex == 0`` minus mean score of rows with
    ``sex == 1``.

    NOTE(review): treating code 0 as female and 1 as male presumably relies
    on the label encoding applied earlier in the notebook; confirm.
    """
    scored = X_test.copy()
    scored['preds'] = y_pred.T[0]

    def _group_average(sex_code):
        # Plain sum()/len() over the rows belonging to one sex code.
        group = scored["preds"][scored["sex"] == sex_code]
        return sum(group)/len(group)

    return _group_average(0) - _group_average(1)

def accuracy(y_test, y_pred_final):
    """Return the fraction of predictions that equal the true labels.

    The value is the same ``(correct) / (total)`` ratio the confusion-matrix
    formula yields for binary labels, but it is computed directly from the
    label arrays. That avoids hand-labelling confusion-matrix cells and also
    works when only one class, or more than two classes, are present.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred_final : array-like
        Predicted labels, one per entry of ``y_test``.

    Returns
    -------
    numpy.float64
        Share of positions where the two inputs agree, in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred_final).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred_final must have the same number of samples: "
            f"{actual.size} != {predicted.size}"
        )
    if actual.size == 0:
        # A mean over zero samples would silently produce NaN.
        raise ValueError("cannot compute accuracy of an empty set of predictions")

    return np.mean(actual == predicted)
    
    

# Training the model

In [45]:
model = logistic_regression_train(logisticRegr, X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Check predictions on dirty test data

In [46]:
y_pred = logistic_regression_test(logisticRegr, X_test)

In [47]:
y_pred

array([[0.97632446, 0.02367554],
       [0.99607419, 0.00392581],
       [0.93682327, 0.06317673],
       ...,
       [0.6065507 , 0.3934493 ],
       [0.53420756, 0.46579244],
       [0.81041552, 0.18958448]])

In [48]:
dirty_diff = average_diff(y_pred, X_test)

In [49]:
dirty_diff

0.21581889991129055

# Checking accuracy of the model

In [50]:
y_pred_test = logistic_regression_pred_test(logisticRegr, X_test)

In [51]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [52]:
accur = accuracy(y_test, y_pred_test)

In [53]:
accur

0.8267990582454704

In [54]:
dirty_test_score = logisticRegr.score(X_test, y_test)

In [55]:
dirty_test_score

0.8267990582454704

# Testing on clean test data

In [56]:
X_clean_train, X_clean_test, y_clean_train, y_clean_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [57]:
X_clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [58]:
y_clean_train.isnull().sum()

0

In [59]:
X_clean_test.isnull().sum()

age                 0
workclass         577
education-num       0
marital-status      0
occupation        578
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
native-country    180
dtype: int64

In [60]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9192 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9191 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9589 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [61]:
y_clean_test.isnull().sum()

0

In [62]:
# Use the built-in `object`: the `np.object` alias is deprecated since NumPy 1.20 and selects the same columns.
missing_values, cat_clean_cols = variable(X_clean_train, object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values, cat_clean_cols = variable(X_clean_train, np.object)


In [63]:
missing_values

['workclass', 'occupation', 'native-country']

In [64]:
X_clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36851 non-null  object
 2   education-num   39073 non-null  int64 
 3   marital-status  39073 non-null  object
 4   occupation      36842 non-null  object
 5   relationship    39073 non-null  object
 6   race            39073 non-null  object
 7   sex             39073 non-null  object
 8   capitalgain     39073 non-null  int64 
 9   capitalloss     39073 non-null  int64 
 10  hoursperweek    39073 non-null  int64 
 11  native-country  38396 non-null  object
dtypes: int64(5), object(7)
memory usage: 3.9+ MB


In [65]:
X_clean_train[missing_values] = X_clean_train[missing_values].fillna(X_clean_train[missing_values].mode().iloc[0])

In [66]:
X_clean_train

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,Private,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States
31093,2,State-gov,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States
33814,1,Local-gov,8,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States
14500,4,Private,9,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States
23399,4,Self-emp-not-inc,9,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,Private,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,2,3,United-States
44732,0,Private,9,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,2,United-States
38158,0,Private,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,2,United-States
860,0,Private,7,Never-married,Adm-clerical,Own-child,White,Female,0,0,0,United-States


In [67]:
y_clean_train

37193    <=50K
31093     >50K
33814    <=50K
14500    <=50K
23399    <=50K
         ...  
11284     >50K
44732    <=50K
38158    <=50K
860      <=50K
15795    <=50K
Name: class, Length: 39073, dtype: object

In [68]:
# Use the built-in `object`: the `np.object` alias is deprecated since NumPy 1.20 and selects the same columns.
missing_values_test, cat_clean_cols_test = variable(X_clean_test, object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  missing_values_test, cat_clean_cols_test = variable(X_clean_test, np.object)


In [69]:
missing_values_test

['workclass', 'occupation', 'native-country']

In [70]:
# Fill the test-set gaps with the modes of the training split rather than of the test split itself,
# so that the preprocessing applied to the test data uses statistics learned from training data only.
X_clean_test[missing_values_test] = X_clean_test[missing_values_test].fillna(X_clean_train[missing_values_test].mode().iloc[0])

In [71]:
X_clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 7762 to 14337
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9769 non-null   object
 2   education-num   9769 non-null   int64 
 3   marital-status  9769 non-null   object
 4   occupation      9769 non-null   object
 5   relationship    9769 non-null   object
 6   race            9769 non-null   object
 7   sex             9769 non-null   object
 8   capitalgain     9769 non-null   int64 
 9   capitalloss     9769 non-null   int64 
 10  hoursperweek    9769 non-null   int64 
 11  native-country  9769 non-null   object
dtypes: int64(5), object(7)
memory usage: 992.2+ KB


In [72]:
y_clean_test

7762     <=50K
23881    <=50K
30507    <=50K
28911    <=50K
19484    <=50K
         ...  
43046    <=50K
18798    <=50K
29519    <=50K
550      <=50K
14337     >50K
Name: class, Length: 9769, dtype: object

In [73]:
label_encoding(X_clean_train,cat_clean_cols)

In [74]:
X_clean_train.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [75]:
y_clean_train = label_encoding(y_clean_train, None)

In [76]:
y_clean_train

array([0, 1, 0, ..., 0, 0, 0])

In [77]:
label_encoding(X_clean_test, cat_clean_cols_test)

In [78]:
X_clean_test

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
7762,0,3,9,4,7,1,4,1,0,0,0,38
23881,0,3,8,4,11,3,4,0,0,0,0,38
30507,0,1,9,4,5,2,2,1,0,0,2,38
28911,0,3,10,4,11,3,4,0,0,0,1,38
19484,3,3,9,4,6,4,0,1,0,0,3,32
...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,5,14,5,11,4,4,0,0,0,2,38
18798,0,3,9,4,7,3,4,1,0,0,2,38
29519,2,3,10,0,13,1,2,1,0,0,3,38
550,3,5,10,0,3,1,4,1,0,0,3,38


In [79]:
y_clean_test= label_encoding(y_clean_test, None)

In [80]:
y_clean_test

array([0, 0, 0, ..., 0, 0, 1])

In [81]:
clean_model = logistic_regression_train(logisticRegr, X_clean_train, y_clean_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [82]:
y_clean_pred = logistic_regression_test(logisticRegr, X_clean_test)

In [83]:
X_clean_test.isnull().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capitalgain       0
capitalloss       0
hoursperweek      0
native-country    0
dtype: int64

In [84]:
y_clean_pred

array([[0.97632446, 0.02367554],
       [0.99607419, 0.00392581],
       [0.93682327, 0.06317673],
       ...,
       [0.6065507 , 0.3934493 ],
       [0.53420756, 0.46579244],
       [0.81041552, 0.18958448]])

In [85]:
clean_diff = average_diff(y_clean_pred, X_clean_test)

In [86]:
clean_diff

0.21782262696800592

In [87]:
clean_test_score = logisticRegr.score(X_clean_test, y_clean_test)

In [88]:
clean_test_score

0.8263895997543249

In [89]:
from tabulate import tabulate
data = [["Impute Missing Values (Mode)","Dirty test data","Label Encoding", "Logistic Regression", dirty_test_score, dirty_diff],
["Impute Missing Values (Mode) ", "Clean test data", "Label Encoding", "Logistic Regression", clean_test_score, clean_diff]]
print (tabulate(data, headers=["Method", "Test Data", "Encoding Technique", "ML Model", "Accuracy", "Fairness"]))

Method                        Test Data        Encoding Technique    ML Model               Accuracy    Fairness
----------------------------  ---------------  --------------------  -------------------  ----------  ----------
Impute Missing Values (Mode)  Dirty test data  Label Encoding        Logistic Regression    0.826799    0.215819
Impute Missing Values (Mode)  Clean test data  Label Encoding        Logistic Regression    0.82639     0.217823
