<a href="https://colab.research.google.com/github/MatthewK84/Python-Code/blob/main/Catboost_Classification_Census_Income_Adult.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Census Income Adult Data - Catboost Classification

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Train Data

In [2]:
column_names = ["age", "work_class", "final_weight", "education", "education_num",
                "marital_status", "occupation", "relationship", "race","sex","capital_gain",
               "capital_loss", "hours_per_week", "native_country","income_over_50k"]

In [4]:
train = pd.read_csv('/content/adult.data', names=column_names)

In [5]:
train.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   work_class       32561 non-null  object
 2   final_weight     32561 non-null  int64 
 3   education        32561 non-null  object
 4   education_num    32561 non-null  int64 
 5   marital_status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital_gain     32561 non-null  int64 
 11  capital_loss     32561 non-null  int64 
 12  hours_per_week   32561 non-null  int64 
 13  native_country   32561 non-null  object
 14  income_over_50k  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Data Cleaning - Train data

In [7]:
# Check the unique values
train['work_class'].unique()
# it seems there is a space before each element

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [8]:
# Let's remove the space at the beginning of the work_class column elements
train['work_class'] = train['work_class'].str.lstrip()
# Let's drop the question marks
train = train[train['work_class'] != '?']

In [9]:
# Let's remove the space at the beginning fpr education column
train['education'] = train['education'].str.lstrip()

In [10]:
# Let's remove the space at the beginning of the marital_status column
train['marital_status'] = train['marital_status'].str.lstrip()

In [11]:
# Let's remove the space at the beginning of the occupation column
train['occupation'] = train['occupation'].str.lstrip()
# Let's drop the columns with question marks
train = train[train['occupation'] != '?']

In [12]:
# Let's remove additional space of the relationship column
train['relationship'] = train['relationship'].str.lstrip()

In [13]:
# Let's remove additional space of the race column
train['race'] = train['race'].str.lstrip()

In [14]:
# Let's remove additional space of the sex column
train['sex'] = train['sex'].str.lstrip()

In [15]:
# Let's remove additional space and drop the columns with question marks from Native_country column
train['native_country'] = train['native_country'].str.lstrip()
train = train[train['native_country'] != '?']

In [16]:
# Let's remove additional space from income_over_50k column
train['income_over_50k'] = train['income_over_50k'].str.lstrip()
train['income_over_50k'].unique()

array(['<=50K', '>50K'], dtype=object)

In [17]:
# Convert income_over_50k values into 0s and 1s
train['income_over_50k'] = train['income_over_50k'].replace({'<=50K': 0, '>50K': 1})

In [18]:
train.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0


## Load Test Data

In [23]:
test = pd.read_csv('/content/adult.test',names=column_names)

In [24]:
test.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.


In [25]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16282 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              16282 non-null  object 
 1   work_class       16281 non-null  object 
 2   final_weight     16281 non-null  float64
 3   education        16281 non-null  object 
 4   education_num    16281 non-null  float64
 5   marital_status   16281 non-null  object 
 6   occupation       16281 non-null  object 
 7   relationship     16281 non-null  object 
 8   race             16281 non-null  object 
 9   sex              16281 non-null  object 
 10  capital_gain     16281 non-null  float64
 11  capital_loss     16281 non-null  float64
 12  hours_per_week   16281 non-null  float64
 13  native_country   16281 non-null  object 
 14  income_over_50k  16281 non-null  object 
dtypes: float64(5), object(10)
memory usage: 1.9+ MB


In [26]:
# Remove the first row which is wrongly created
test = test.drop([0])

In [27]:
test.head(2)

Unnamed: 0,age,work_class,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_over_50k
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.


In [28]:
# Remove additional space at the beginning of the string
test['work_class'] = test['work_class'].str.lstrip()
test['education'] = test['education'].str.lstrip()
test['marital_status'] = test['marital_status'].str.lstrip()
test['occupation'] = test['occupation'].str.lstrip()
test['relationship'] = test['relationship'].str.lstrip()
test['race'] = test['race'].str.lstrip()
test['sex'] = test['sex'].str.lstrip()
test['native_country'] = test['native_country'].str.lstrip()
test['income_over_50k'] = test['income_over_50k'].str.lstrip()

In [29]:
# Let's drop the rows with question marks
test = test[test['work_class'] != '?']
test = test[test['occupation'] != '?']
test = test[test['native_country'] != '?']

In [30]:
# convert the target column values to 0s and 1s
test['income_over_50k'] = test['income_over_50k'].replace({'<=50K.': 0, '>50K.': 1})

In [31]:
test['income_over_50k'].unique()

array([0, 1])

### Dummy Variables

### Training Data

In [32]:
tn_work_class = pd.get_dummies(train['work_class'], prefix='wc')
tn_education = pd.get_dummies(train['education'], prefix='edu')
tn_marital_status = pd.get_dummies(train['marital_status'], prefix='mari')
tn_occupation = pd.get_dummies(train['occupation'], prefix='occup')
tn_relationship = pd.get_dummies(train['relationship'], prefix='rela')
tn_race = pd.get_dummies(train['race'], prefix='race')
tn_native_country = pd.get_dummies(train['native_country'], prefix='nat')

In [33]:
# Rename the sex variable to fit the new variable as Male or not
train.rename(columns={'sex':'male'}, inplace=True)

In [34]:
# Converting the male into 1 and Female into 0
train['male'] = train['male'].replace({'Male': 1, 'Female': 0})

In [35]:
# Let's drop the columns that we have already created Dummies of
col_drop = ['work_class','education','marital_status','occupation','relationship','race','native_country']
train = train.drop(col_drop, axis=1)

In [36]:
# join the Dummy variables with the DF
join_col = [tn_work_class,tn_education,tn_marital_status,tn_occupation,tn_relationship,tn_race,tn_native_country]
train = train.join(join_col)

In [37]:
train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Test Data

In [38]:
# Repeat the same and create Dummy Variables for for Test Data
ts_work_class = pd.get_dummies(test['work_class'], prefix='wc')
ts_education = pd.get_dummies(test['education'], prefix='edu')
ts_marital_status = pd.get_dummies(test['marital_status'], prefix='mari')
ts_occupation = pd.get_dummies(test['occupation'], prefix='occup')
ts_relationship = pd.get_dummies(test['relationship'], prefix='rela')
ts_race = pd.get_dummies(test['race'], prefix='race')
ts_native_country = pd.get_dummies(test['native_country'], prefix='nat')

In [39]:
test.rename(columns={'sex':'male'}, inplace=True)

In [40]:
test['male'] = test['male'].replace({'Male': 1, 'Female': 0})

In [41]:
col_drop = ['work_class','education','marital_status','occupation','relationship','race','native_country']
test = test.drop(col_drop, axis=1)

In [42]:
join_col1 = [ts_work_class,ts_education,ts_marital_status,ts_occupation,ts_relationship,ts_race,ts_native_country]
test = test.join(join_col1)

In [43]:
test.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
1,25,226802.0,7.0,1,0.0,0.0,40.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,89814.0,9.0,1,0.0,0.0,50.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


- We can notice that the training data has 104 columns whereas testing data has only 103 columns.
- Let's check which column

In [44]:
# Checking the column as not matching
test_cols = test.columns
train_cols = train.columns
different = []
for i in train_cols:
    if i not in test_cols:
        different.append(i)

different

['nat_Holand-Netherlands']

In [45]:
# Since nat_Holand-Netherlands is only available in training data,
# Lets, drop the rows that contains nat_Holand-Netherlands as 1
# And drop the nat_Holand-Netherlands column all together
train = train[train['nat_Holand-Netherlands'] != 1]
train = train.drop('nat_Holand-Netherlands', axis=1)

In [46]:
train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,income_over_50k,wc_Federal-gov,wc_Local-gov,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Split the Data

In [47]:
X_train = train.drop('income_over_50k', axis=1)
y_train = train['income_over_50k']
X_test = test.drop('income_over_50k', axis=1)
y_test = test['income_over_50k']

In [48]:
X_train.head(2)

Unnamed: 0,age,final_weight,education_num,male,capital_gain,capital_loss,hours_per_week,wc_Federal-gov,wc_Local-gov,wc_Private,...,nat_Portugal,nat_Puerto-Rico,nat_Scotland,nat_South,nat_Taiwan,nat_Thailand,nat_Trinadad&Tobago,nat_United-States,nat_Vietnam,nat_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [49]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(30161, 102)
(30161,)
(15060, 102)
(15060,)


## Create and Train the Model

In [None]:
pip install catboost

In [52]:
from catboost import CatBoostClassifier

In [53]:
cb_model = CatBoostClassifier()

In [54]:
cb_model.fit(X_train, y_train)

Learning rate set to 0.044122
0:	learn: 0.6528702	total: 67.9ms	remaining: 1m 7s
1:	learn: 0.6109392	total: 96.7ms	remaining: 48.3s
2:	learn: 0.5771867	total: 114ms	remaining: 37.7s
3:	learn: 0.5496175	total: 120ms	remaining: 30s
4:	learn: 0.5241046	total: 126ms	remaining: 25.1s
5:	learn: 0.5029332	total: 133ms	remaining: 22.1s
6:	learn: 0.4815514	total: 152ms	remaining: 21.6s
7:	learn: 0.4660043	total: 167ms	remaining: 20.7s
8:	learn: 0.4521362	total: 198ms	remaining: 21.8s
9:	learn: 0.4389135	total: 214ms	remaining: 21.2s
10:	learn: 0.4266623	total: 248ms	remaining: 22.3s
11:	learn: 0.4147645	total: 275ms	remaining: 22.6s
12:	learn: 0.4067218	total: 316ms	remaining: 24s
13:	learn: 0.3989361	total: 331ms	remaining: 23.3s
14:	learn: 0.3939790	total: 344ms	remaining: 22.6s
15:	learn: 0.3872617	total: 376ms	remaining: 23.1s
16:	learn: 0.3809178	total: 402ms	remaining: 23.3s
17:	learn: 0.3751015	total: 426ms	remaining: 23.3s
18:	learn: 0.3708988	total: 441ms	remaining: 22.8s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7c7365538730>

## Predict and Evaluate the Model

In [55]:
predict = cb_model.predict(X_test)

In [56]:
train_pred = cb_model.predict(X_train)

In [57]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [58]:
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, predict))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, predict))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, predict))

Train Accuracy score is:
0.8949305394383475
---------------------------------
Test Accuracy score is:
0.8701859229747676
---------------------------------
Confusion matrix:
[[10669   691]
 [ 1264  2436]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.92     11360
           1       0.78      0.66      0.71      3700

    accuracy                           0.87     15060
   macro avg       0.84      0.80      0.81     15060
weighted avg       0.87      0.87      0.87     15060

