
#### Applying a classification algorithm to real-world data with the purpose of identifying whether someone makes over 50K US dollars a year or not.
Dataset acquired from: https://archive.ics.uci.edu/ml/datasets/Adult

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
##
from sklearn.model_selection import train_test_split
from sklearn import tree
##
from sklearn.neighbors import KNeighborsClassifier
##
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
##
from sklearn.ensemble import RandomForestClassifier

## Insertion of the dataset and data exploration.

### Making two copies of the same dataset.

In [2]:
# Load the dataset into the first working copy; the file is ';'-delimited.
# `sep` is given by keyword because pandas 2.x accepts only the path positionally.
ab = pd.read_csv('data/adult.csv', sep=';')

In [3]:
# Load the same ';'-delimited file a second time into an independent copy.
# `sep` is given by keyword because pandas 2.x accepts only the path positionally.
ba = pd.read_csv('data/adult.csv', sep=';')

##### Checking for null values.

In [4]:
# One boolean per column: True if that column holds at least one null value.
ab.isnull().any()

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
money             False
dtype: bool

##### Showing dataset's shape.

In [5]:
# (number of rows, number of columns) of the first copy.
ab.shape

(32561, 15)

In [6]:
# (number of rows, number of columns) of the second copy; expected to match ab.
ba.shape

(32561, 15)

##### Showing the first five entries of the dataset.

In [7]:
# Preview the first five rows of ab.
ab.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,money
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


##### Column names.

In [8]:
# Column labels exactly as read from the file (several still contain hyphens).
ab.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'money'],
      dtype='object')

##### Mapping each distinct value of the object-typed (string) columns to a unique integer code and storing the codes in a new `_c` column, to assist later in the data analysis of the dataset. This procedure is done on both tables.

In [9]:
# Give every distinct workclass label an integer code, numbered in order of
# first appearance, and store the codes in a new "workclass_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    workclass_dict = {label: code for code, label in enumerate(frame["workclass"].unique())}
    frame["workclass_c"] = frame["workclass"].map(workclass_dict).astype(int)

In [10]:
# Give every distinct education label an integer code, numbered in order of
# first appearance, and store the codes in a new "education_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    education_dict = {label: code for code, label in enumerate(frame["education"].unique())}
    frame["education_c"] = frame["education"].map(education_dict).astype(int)

In [None]:
# Give every distinct marital-status label an integer code, numbered in order
# of first appearance, and store the codes in a new "marital_status_c" column.
# At this point the source column is still named "marital-status" (it is only
# renamed to "marital_status" in a later cell), so it must be read with
# bracket access; attribute access on the underscore name fails on a fresh run.
for frame in (ab, ba):
    marital_status_dict = {
        label: code for code, label in enumerate(frame["marital-status"].unique())
    }
    frame["marital_status_c"] = frame["marital-status"].map(marital_status_dict).astype(int)

In [None]:
# Give every distinct occupation label an integer code, numbered in order of
# first appearance, and store the codes in a new "occupation_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    occupation_dict = {label: code for code, label in enumerate(frame["occupation"].unique())}
    frame["occupation_c"] = frame["occupation"].map(occupation_dict).astype(int)

In [None]:
# Give every distinct relationship label an integer code, numbered in order of
# first appearance, and store the codes in a new "relationship_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    relationship_dict = {label: code for code, label in enumerate(frame["relationship"].unique())}
    frame["relationship_c"] = frame["relationship"].map(relationship_dict).astype(int)

In [None]:
# Give every distinct race label an integer code, numbered in order of first
# appearance, and store the codes in a new "race_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    race_dict = {label: code for code, label in enumerate(frame["race"].unique())}
    frame["race_c"] = frame["race"].map(race_dict).astype(int)

In [None]:
# Give every distinct sex label an integer code, numbered in order of first
# appearance, and store the codes in a new "sex_c" column.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    sex_dict = {label: code for code, label in enumerate(frame["sex"].unique())}
    frame["sex_c"] = frame["sex"].map(sex_dict).astype(int)

In [None]:
# Encode the target label as an integer code in a new "money_c" column.
# Codes follow the order of first appearance; the first row shown by ab.head()
# is "<=50K", so that label receives code 0 and the other label the next code.
# The identical encoding step is applied to both copies of the dataset.
for frame in (ab, ba):
    money_dict = {label: code for code, label in enumerate(frame["money"].unique())}
    frame["money_c"] = frame["money"].map(money_dict).astype(int)

##### Making column names easier to handle. 

In [None]:
# Replace the hyphens in column names with underscores so the columns can be
# used with attribute access; both copies are renamed in place.
hyphen_to_underscore = {
    'marital-status': 'marital_status',
    'education-num': 'education_num',
    'capital-gain': 'capital_gain',
    'capital-loss': 'capital_loss',
    'hours-per-week': 'hours_per_week',
    'native-country': 'native_country',
}
for frame in (ab, ba):
    frame.rename(columns=hyphen_to_underscore, inplace=True)

##### Showing the reformed tables.

In [None]:
# Preview ab again, now with the renamed columns and the added "_c" code columns.
ab.head()

In [None]:
# Column labels of ab after the renaming step.
ab.columns

In [None]:
# Data type of each column in ab.
ab.dtypes

##### Deleting unnecessary columns.

In [None]:
# Remove the "fnlwgt" column from ab in place.
ab.drop(columns=['fnlwgt'], inplace=True)

In [None]:
# Preview ab after dropping "fnlwgt".
ab.head()

In [None]:
# Remove "fnlwgt" and the original text-valued columns from ba in place;
# the integer "_c" code columns created earlier are not in this list and stay.
ba.drop(
    columns=[
        'fnlwgt',
        'workclass',
        'education',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native_country',
        'money',
    ],
    inplace=True,
)

In [None]:
# Preview ba after the column removal.
ba.head()

In [None]:
# Column labels remaining in ba.
ba.columns

##### Data visualization through a histogram.

In [None]:
# One histogram per column of ba, 25 bins each, laid out on a 15x15 inch figure.
ba.hist(bins=25, figsize=(15, 15))

### Data splitting.

##### A correlation matrix to assist me in picking the right inputs (features) for my purposes.

In [None]:
# Pairwise correlations between the columns of ba, rounded to two decimals.
correlation_matrix = ba.corr().round(2)
# Create a 10x10 inch figure to draw on.
f, ax = plt.subplots(figsize=(10,10))
#### Render the matrix as a heatmap using the seaborn library
sns.heatmap(data=correlation_matrix, annot=True,cmap='YlGnBu')

##### My y_vector is "money_c". My x_vector contains all those columns with a correlation coefficient >= 0.22.

In [None]:
# Feature matrix: the four selected predictor columns; target: the encoded income label.
selected_features = ['education_num', 'capital_gain', 'hours_per_week', 'age']
X = ba[selected_features]
y = ba['money_c']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Report the shape of every piece produced by the train/test split.
split_shapes = [
    ("X_train dimensions:", X_train.shape),
    ("y_train dimensions:", y_train.shape),
    ("X_test dimensions:", X_test.shape),
    ("y_test dimensions:", y_test.shape),
]
for caption, shape in split_shapes:
    print(caption, shape)

### Choosing the classification algorithm: Decision Tree Classifier.

In [None]:
# Decision tree that splits on Gini impurity.
# `criterion` is passed by keyword because scikit-learn estimator parameters
# are keyword-only; a fixed random_state makes the fitted tree reproducible.
dtree = tree.DecisionTreeClassifier(criterion="gini", random_state=17)

#### Training the model.

In [None]:
# Fit the decision tree on the training split.
dtree.fit(X_train, y_train)

In [None]:
# Predict the label of every sample in the held-out test split.
y_pred = dtree.predict(X_test)

In [None]:
# Inspect the Python type of the object holding the predictions.
type(y_pred)

In [None]:
# Accuracy of the decision tree: share of test samples whose prediction matches the true label.
print("Classification's percentage of success:")
print(np.mean(y_pred == y_test))

In [None]:
# Per-class precision, recall and F1 for the decision tree predictions.
# `classification_report` comes from sklearn.metrics (imported in the first cell).
# `labels` is pinned to the two encoded classes so that `target_names` always
# lines up with them, even if a class is missing from the predictions.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

##### Even though the percentage was good enough it is still worth checking other algorithms.
##### (Changing the method from gini to entropy did not change the percentage)

### KNN Classifier

In [None]:
# k-nearest-neighbours classifier that votes among the 7 closest training samples.
# NOTE(review): the features are used unscaled, so columns with a large numeric
# range may dominate the distance — consider scaling before fitting.
knnClass = KNeighborsClassifier(n_neighbors=7)

In [None]:
# Fit the KNN classifier on the training split.
knnClass.fit(X_train, y_train)

In [None]:
# Predict the test split with KNN; this overwrites the decision tree predictions in y_pred.
y_pred = knnClass.predict(X_test)

In [None]:
# Accuracy of KNN: share of test samples whose prediction matches the true label.
print("Classification's percentage of success:")
print(np.mean(y_pred == y_test))

##### We get a slightly better classification accuracy (0.82 instead of 0.81) with 8 neighbours instead of 7, but, keeping in mind that we don't want to overfit our model, we are keeping it as is.

In [None]:
# Confusion matrix of the KNN predictions (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

##### Precision takes values from 0 to 1 (with 1 being the best we can get).
##### Ours is 0.66, so that's quite good.

In [None]:
# Precision of the KNN predictions for the positive class (label 1).
precision_score(y_test, y_pred)

##### The same goes for our recall value, but here we can see that our classifier is not that good at identifying the true positive samples, with a value of 0.48.

In [None]:
# Recall of the KNN predictions for the positive class (label 1).
recall_score(y_test, y_pred) 

In [None]:
# F1 score (harmonic mean of precision and recall) of the KNN predictions.
f1_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 for the KNN predictions.
# `classification_report` comes from sklearn.metrics (imported in the first cell).
# `labels` is pinned to the two encoded classes so that `target_names` always
# lines up with them, even if a class is missing from the predictions.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

### Applying Random Forest Classifier

In [None]:
# Random forest built from 250 trees.
# A fixed random_state makes the bootstrap sampling and feature selection, and
# therefore the reported scores and feature importances, reproducible across runs.
rfc = RandomForestClassifier(n_estimators=250, random_state=17)

In [None]:
# Fit the random forest on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Predict the test split with the random forest; this overwrites the KNN predictions in y_pred.
y_pred = rfc.predict(X_test)

In [None]:
# Confusion matrix of the random forest predictions (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

##### Slightly worse percentage of success and so far KNN looks better.

In [None]:
# Accuracy of the random forest: share of test samples whose prediction matches the true label.
print("Classification's percentage of success:")
print(np.mean(y_pred == y_test))

In [None]:
# F1 score (harmonic mean of precision and recall) of the random forest predictions.
f1_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
# `classification_report` comes from sklearn.metrics (imported in the first cell).
# `labels` is pinned to the two encoded classes so that `target_names` always
# lines up with them, even if a class is missing from the predictions.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

In [None]:
# Importance score of each input feature as estimated by the fitted forest.
rfc.feature_importances_

In [None]:
# Feature names listed in the same order as the columns selected for X,
# which is the order in which the forest reports its importances.
features = ['education_num', 'capital_gain', 'hours_per_week', 'age']
importances = rfc.feature_importances_
# Sort ascending so the most important feature is drawn last.
indices = np.argsort(importances)
bar_positions = range(len(indices))

# Horizontal bar chart of the sorted importances, labelled with the feature names.
fig, ax = plt.subplots()
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_title('Feature Importances')
ax.set_xlabel('Relative Importance')
plt.show()

### Having tried three different classification algorithms, we can confidently say that KNN performed best.