In [287]:
# Dependencies
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from pathlib import Path

In [288]:
# Copy the file path to reading 

file = Path(r'Source\adult.csv')

In [289]:
# Reading the CSV file in to the Pandas DataFrame 
df = pd.read_csv(file)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


Data Exploration and data cleaning

In [290]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [291]:
# explore the data
df.shape

(32561, 15)

In [292]:
# # Drop extraneous column
column_to_drop = ["fnlwgt","capital.gain","capital.loss","relationship","workclass","race"]
df.drop(columns=column_to_drop,inplace=True)


In [293]:
#  Drop all rows with missing information
df.replace("?", pd.NA, inplace=True)

In [294]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,age,education,education.num,marital.status,occupation,sex,hours.per.week,native.country,income
1,82,HS-grad,9,Widowed,Exec-managerial,Female,18,United-States,<=50K
3,54,7th-8th,4,Divorced,Machine-op-inspct,Female,40,United-States,<=50K
4,41,Some-college,10,Separated,Prof-specialty,Female,40,United-States,<=50K
5,34,HS-grad,9,Divorced,Other-service,Female,45,United-States,<=50K
6,38,10th,6,Separated,Adm-clerical,Male,40,United-States,<=50K


In [295]:
# Verify dropped rows
df.shape

(30162, 9)

In [296]:
# Find the levels of education
df["education"].unique()

array(['HS-grad', '7th-8th', 'Some-college', '10th', 'Doctorate',
       'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-voc',
       '1st-4th', '5th-6th', 'Assoc-acdm', '12th', '9th', 'Preschool'],
      dtype=object)

In [297]:
# Drop education column(education and education.num has same measure)
df = df.drop(columns=["education"])
df.head()

Unnamed: 0,age,education.num,marital.status,occupation,sex,hours.per.week,native.country,income
1,82,9,Widowed,Exec-managerial,Female,18,United-States,<=50K
3,54,4,Divorced,Machine-op-inspct,Female,40,United-States,<=50K
4,41,10,Separated,Prof-specialty,Female,40,United-States,<=50K
5,34,9,Divorced,Other-service,Female,45,United-States,<=50K
6,38,6,Separated,Adm-clerical,Male,40,United-States,<=50K


In [298]:
# find types of occupation
df["occupation"].unique()

array(['Exec-managerial', 'Machine-op-inspct', 'Prof-specialty',
       'Other-service', 'Adm-clerical', 'Transport-moving', 'Sales',
       'Craft-repair', 'Farming-fishing', 'Tech-support',
       'Protective-serv', 'Handlers-cleaners', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [299]:
#  convert to numerical data
df["occupation"]=df["occupation"].replace({'Exec-managerial':1, 'Machine-op-inspct':2, 'Prof-specialty':3,
       'Other-service':4, 'Adm-clerical':5, 'Transport-moving':6, 'Sales':7,
       'Craft-repair':8, 'Farming-fishing':8, 'Tech-support':8,
       'Protective-serv':9, 'Handlers-cleaners':10, 'Armed-Forces':11,
       'Priv-house-serv':12})

In [300]:
# marritial status unique values
df["marital.status"].unique()

array(['Widowed', 'Divorced', 'Separated', 'Never-married',
       'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
      dtype=object)

In [301]:
#  Convert catergorical data to numerical value
df["marital.status"] = df["marital.status"].replace({"Widowed":1,"Divorced":2,"Separated":3,"Never-married":4,"Married-civ-spouse":5,"Married-spouse-absent":6,"Married-AF-spouse":7})

In [302]:
#  Find out the different countries
df["native.country"].unique()

array(['United-States', 'Mexico', 'Greece', 'Vietnam', 'China', 'Taiwan',
       'India', 'Philippines', 'Trinadad&Tobago', 'Canada', 'South',
       'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran', 'England',
       'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba', 'Ireland',
       'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic', 'Haiti',
       'Hungary', 'Columbia', 'Guatemala', 'El-Salvador', 'Jamaica',
       'Ecuador', 'France', 'Yugoslavia', 'Portugal', 'Laos', 'Thailand',
       'Outlying-US(Guam-USVI-etc)', 'Scotland'], dtype=object)

In [303]:
# Replace value with labelencoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["native.country"] = label_encoder.fit_transform(df["native.country"])


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [304]:
df["sex"].unique()

array(['Female', 'Male'], dtype=object)

In [305]:
df["sex"] = df["sex"].replace({"Female":0,"Male":1})

In [306]:
df.tail ()

Unnamed: 0,age,education.num,marital.status,occupation,sex,hours.per.week,native.country,income
32556,22,10,4,9,1,40,38,<=50K
32557,27,12,5,8,0,38,38,<=50K
32558,40,9,5,2,1,40,38,>50K
32559,58,9,1,5,0,40,38,<=50K
32560,22,9,4,5,1,20,38,<=50K


In [307]:
# find the unique values of education.num
df["education.num"].unique()

array([ 9,  4, 10,  6, 16, 15, 13, 14,  7, 11,  2,  3, 12,  8,  5,  1],
      dtype=int64)

In [308]:
# convert income data into binary values
df_encoded = pd.get_dummies(df["income"])
df_final = pd.concat([df,df_encoded],axis = 1)
df_final.head()

Unnamed: 0,age,education.num,marital.status,occupation,sex,hours.per.week,native.country,income,<=50K,>50K
1,82,9,1,1,0,18,38,<=50K,True,False
3,54,4,2,2,0,40,38,<=50K,True,False
4,41,10,3,3,0,40,38,<=50K,True,False
5,34,9,2,4,0,45,38,<=50K,True,False
6,38,6,3,5,1,40,38,<=50K,True,False


ML Model - Logistic Regression


Split the data into X and Y and then into testing and training set

In [309]:
# Split the data into X(features) and y (income)

#  the y variable focus on the income
y = df["income"]

#  The x variable include all features except income
X = df.drop(columns=["income"])

In [310]:
#  Split into testing and training sets using train_test_split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

Fit a logistic regression classifier

In [311]:
# Declare a logistic regression model
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression(random_state=42)

# Fit and save the logistic regression model using the training data
model = logistic_regression_model.fit(X_train,y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Create the predicted value for the testing and training data

In [312]:
#  Generate training predictions
training_predictions = model.predict(X_train)

#  Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Print the confusion matrix for the training data

In [313]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train,training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[[15655  1285]
 [ 3035  2646]]


Print a confusion matrix for the testing data

In [314]:
#  Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test,testing_predictions)

#  Print the confusion matrix for testing data
print(test_matrix)

[[5321  393]
 [1010  817]]


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Print the training classification report

In [315]:
# Print the training classification report
from sklearn.metrics import classification_report
training_report = classification_report(y_train,training_predictions)
print(training_report)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


              precision    recall  f1-score   support

       <=50K       0.84      0.92      0.88     16940
        >50K       0.67      0.47      0.55      5681

    accuracy                           0.81     22621
   macro avg       0.76      0.69      0.71     22621
weighted avg       0.80      0.81      0.80     22621



Print the testing classification report

In [316]:
# Print the testing classification report
testing_report = classification_report(y_test,testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

       <=50K       0.84      0.93      0.88      5714
        >50K       0.68      0.45      0.54      1827

    accuracy                           0.81      7541
   macro avg       0.76      0.69      0.71      7541
weighted avg       0.80      0.81      0.80      7541



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
