# Classification in Python

## Outline / Steps to do the analysis

* Load the necessary packages into memory
* Use the "official classification" and create the labels for personality traits for every person in the dataset
* Train the models to be able to identify personalities of new individuals using:
    * Knn
    * Simple Tree
    * Random Forest
    * Gradient Boosting (scikit-learn's `GradientBoostingClassifier`)
    * SVM
    
* Run classification for Alex's personality using the above-mentioned models

Let's start by loading in modules and data

In [9]:
# Widen the notebook container so wide dataframes are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Libraries used throughout the notebook
import pandas as pd  # dataframes
import numpy as np  # arrays (pandas is built on top of numpy)
from pprint import pprint  # pretty-printing of nested Python objects
import matplotlib.pyplot as plt  # plotting
import seaborn as sns

# Load the data.
# `.iloc[4:]` keeps everything from positional row 4 onwards (i.e. it skips the
# first four rows of the class export; positions start at 0).
# NOTE(review): the downstream cells select columns by the names 'Q1'..'Q125'.
# Confirm that the header row of personality_tests.csv provides those names.
class_df = pd.read_csv('personality_tests.csv').iloc[4:]
alex_df = pd.read_csv('alexahpenev_personality.csv')

# Preview the first five rows of each dataset
display(class_df.head())
display(alex_df.head())

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,X10,...,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134
4,2/20/20 12:35,2/20/20 12:35,1,100,36,1,2/20/20 12:35,preview,EN,,...,,,,,,,2,2,3.0,
5,2/20/20 15:13,2/20/20 15:24,0,100,648,1,2/20/20 15:24,anonymous,EN,3.0,...,2.0,1.0,1.0,2.0,2.0,32.0,2,1,,
6,2/20/20 15:16,2/20/20 15:24,0,100,495,1,2/20/20 15:24,qr,EN,4.0,...,1.0,2.0,1.0,1.0,1.0,22.0,2,1,,
7,2/20/20 15:13,2/20/20 15:25,0,100,718,1,2/20/20 15:25,anonymous,EN,4.0,...,2.0,2.0,2.0,2.0,2.0,23.0,1,1,,
8,2/20/20 15:16,2/20/20 15:25,0,100,580,1,2/20/20 15:25,anonymous,EN,2.0,...,2.0,2.0,2.0,1.0,2.0,30.0,2,1,,


Unnamed: 0,Q1,Q4,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,...,Q116,Q117,Q118,Q119,Q120,Q121,Q122,Q123,Q124,Q125
0,2,1,2,5,4,2,4,5,1,1,...,1,1,1,2,2,2,1,1,1,1


Let's separate our class data into two datasets. One for Big Five questions and one for MBTI. We'll convert all of the columns to numeric fields.

In [11]:
# Split the class responses into the two questionnaires:
# one frame for the Big Five items (Q1..Q54) and one for the MBTI items (Q55..Q125).

# Fail early with a readable message when the question columns are absent,
# instead of an opaque KeyError raised from inside the `.loc` label slice.
required_columns = ['Q1', 'Q54', 'Q55', 'Q59', 'Q125']
missing_columns = [col for col in required_columns if col not in class_df.columns]
if missing_columns:
    raise KeyError(
        'class_df is missing the expected question columns {}. '
        'Check the header row of personality_tests.csv; the first columns found were {}.'.format(
            missing_columns, list(class_df.columns[:5])
        )
    )

# `.loc` selects rows/columns by label (`.iloc` selects by position).
# `:, 'Q1':'Q54'` means "all rows, columns Q1 through Q54 inclusive".
# `pd.to_numeric` is applied column by column and returns new frames, so
# `class_df` itself is never modified.
bigfive_df = class_df.loc[:, 'Q1':'Q54'].apply(pd.to_numeric)

# Q59 is dropped because it is the same question as Q60.
mbti_df = class_df.loc[:, 'Q55':'Q125'].drop(columns='Q59').apply(pd.to_numeric)

# Remove every respondent with at least one missing answer, reporting the
# frame dimensions before and after.
print('Big Five Dimensions Before Dropping {}'.format(bigfive_df.shape))
bigfive_df = bigfive_df.dropna(how='any')
print('Big Five Dimensions After Dropping {}'.format(bigfive_df.shape))

print('MBTI Dimensions Before Dropping {}'.format(mbti_df.shape))
mbti_df = mbti_df.dropna(how='any')
print('MBTI Dimensions After Dropping {}'.format(mbti_df.shape))

display(bigfive_df)
display(mbti_df)

KeyError: 'Q1'

In [None]:
# Compute the Big Five trait scores with the provided scoring formulas and
# derive the binary extroversion label used as the classification target.
# The labels come from a known formula rather than from letting the data speak
# for itself, which is why the models trained on them are called supervised.

# Scoring key: trait -> (constant, positively keyed items, reverse-keyed items)
#   score = constant + sum(positive items) - sum(reverse-keyed items)
BIG5_SCORING_KEY = {
    'E': (20, ['Q1', 'Q14', 'Q24', 'Q34', 'Q45'], ['Q9', 'Q19', 'Q29', 'Q39', 'Q50']),
    'A': (14, ['Q10', 'Q20', 'Q30', 'Q40', 'Q46', 'Q51'], ['Q4', 'Q15', 'Q25', 'Q35']),
    # Q52 is positively keyed only. It used to be listed among the reverse-keyed
    # items as well, which cancelled it out of the conscientiousness score.
    'C': (14, ['Q6', 'Q16', 'Q26', 'Q36', 'Q47', 'Q52'], ['Q11', 'Q21', 'Q31', 'Q41']),
    'N': (38, ['Q12', 'Q22'], ['Q7', 'Q17', 'Q27', 'Q32', 'Q37', 'Q42', 'Q48', 'Q53']),
    'O': (8, ['Q8', 'Q18', 'Q28', 'Q38', 'Q43', 'Q49', 'Q54'], ['Q13', 'Q23', 'Q33']),
}

# A respondent is labelled extroverted when their E score is at least this value.
# NOTE(review): 24 was described as "the 50% cutoff" of the class E scores;
# re-check it against the summary statistics displayed below if the data changes.
EXTROVERSION_CUTOFF = 24


def score_big5(responses, cutoff=EXTROVERSION_CUTOFF):
    """Return a copy of `responses` with trait scores and the extroversion label.

    Parameters
    ----------
    responses : pandas.DataFrame
        One row per respondent; must contain every question column named in
        `BIG5_SCORING_KEY`.
    cutoff : int or float, optional
        Minimum extroversion score (inclusive) for `is_extroverted` to be 1.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by 'E', 'A', 'C', 'N', 'O' and
        'is_extroverted' (1 if E >= cutoff, otherwise 0).
    """
    scored = responses.copy()
    for trait, (constant, positive_items, reverse_items) in BIG5_SCORING_KEY.items():
        scored[trait] = (
            constant
            + scored[positive_items].sum(axis=1)
            - scored[reverse_items].sum(axis=1)
        )
    scored['is_extroverted'] = (scored['E'] >= cutoff).astype(int)
    return scored


# Score the class data
bigfive_df = score_big5(bigfive_df)

# Summary statistics for Extroversion
display(bigfive_df['E'].describe())
display(bigfive_df.head())

# Do the same for Alex
alex_df = score_big5(alex_df)

display(alex_df)

In [None]:
# Split the scored Big Five data into a training set and a test set.

from sklearn.model_selection import train_test_split

# Dependent variable: whether the respondent is extroverted (1) or not (0)
bf_target = bigfive_df['is_extroverted']
# Predictors: the answers to every Big Five question
bf_features = bigfive_df.loc[:, 'Q1':'Q54']

# `train_test_split` returns four objects at once: the train and test portions
# of the features, followed by the train and test portions of the target.
# 20% of the rows are held out for testing; `random_state` fixes the shuffle.
split_parts = train_test_split(
    bf_features,
    bf_target,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts


In [None]:
# Fit a k-nearest-neighbours classifier for every k from 1 to `max_neighbors`
# and record its accuracy on the test split, so we can see which k works best
# for our data.
# NOTE(review): k is compared on the same test split that is later used to
# report accuracy, so the reported score for the chosen k may be optimistic.

from sklearn.neighbors import KNeighborsClassifier as knn

max_neighbors = 10

# One record per k; the formatted column is kept alongside the raw value so the
# table is readable while the plot can still use the numeric score.
k_records = []
for num_neighbors in range(1, max_neighbors + 1):
    # x_train holds the question answers, y_train the extroversion labels
    knn_model = knn(n_neighbors=num_neighbors).fit(x_train, y_train)
    test_accuracy = knn_model.score(x_test, y_test)
    k_records.append({
        'k': num_neighbors,
        'accuracy_score_raw': test_accuracy,
        'accuracy_score': "{:.2%}".format(test_accuracy),
    })

k_selection_df = pd.DataFrame(k_records, columns=['k', 'accuracy_score_raw', 'accuracy_score'])

display(k_selection_df)
k_selection_df.plot(x='k', y='accuracy_score_raw', xlabel='Neighbor Count (k)', ylabel='Accuracy Score');

We can see from the above that the best number to use for K is simply 1. Let's look at a few more helpful evaluation metrics for our single neighbor model:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Build a KNN classifier that looks at a single neighbour and fit it on the
# training split; `.fit` runs the fitting procedure of whichever estimator it
# is called on.
knn_model = knn(n_neighbors=1)
knn_model.fit(x_train, y_train)

# Predicted labels for the held-out respondents
test_predictions = knn_model.predict(x_test)

# Draw the confusion matrix twice: first with raw observation counts, then
# normalized over all test observations (cells sum to 1).
matrix_variants = [
    (None, 'Confusion Matrix of Observation Counts'),
    ('all', 'Normalized Confusion Matrix'),
]
for normalize_mode, plot_title in matrix_variants:
    # Arguments: true labels, predicted labels, and the class order to use
    cm = confusion_matrix(
        y_test,
        test_predictions,
        labels=knn_model.classes_,
        normalize=normalize_mode,
    )
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_model.classes_)
    disp.plot(cmap='cividis')
    plt.title(plot_title)
    plt.show()

Let's look at the closest neighbors for our test dataset!

In [None]:
# `kneighbors` returns a (distances, positions) pair; the positions refer to
# rows of the training data the model was fitted on.
_, neighbor_positions = knn_model.kneighbors(x_test, n_neighbors=1)

# Translate the row positions into the index labels of the training rows
closest_labels = x_train.iloc[neighbor_positions.ravel()].index

# Test-set answers with the index label of each row's nearest training row
eval_df = x_test.assign(closest_neighbor_idx=closest_labels)
display(eval_df)

In [None]:
# Reads the current column-display limit; the value is not stored or shown
# because this is not the last expression of the cell.
pd.get_option("display.max_columns")

# Questions that enter the extroversion score, plus the resulting label
extroversion_columns = [
    'Q1', 'Q14', 'Q24', 'Q34', 'Q45',
    'Q9', 'Q19', 'Q29', 'Q39', 'Q50',
    'is_extroverted',
]

# Show two pairs of respondents side by side.
# NOTE(review): presumably each pair is a test respondent and its closest
# training neighbour taken from the table above; the index labels are
# hard-coded, so confirm them if the data changes.
for respondent_pair in ([98, 10], [48, 87]):
    display(bigfive_df.loc[respondent_pair, extroversion_columns])

Next we'll try to evaluate some other classification models, but since it will be the same process for the other models, we will also write a function!

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC #i.e. Support Vector Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

In [None]:
# Decision trees
clf = DecisionTreeClassifier(max_depth = 3, random_state = 42) #max_depth --> the max number of tree is allowed to have. random_state--> you specify a random state
clf.fit(x_train, y_train)

print(clf.predict(x_test))

tree.export_graphviz(clf, out_file="tree.dot",feature_names = x_train.columns,filled = True, proportion = True, class_names = ['Introverted','Extroverted'], leaves_parallel = True)

display(pd.DataFrame(y_train).value_counts())

with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)



In [None]:
def evaluate_big5model(data, model, test_size = 0.2, normalize_cf = False, prediction_data = None):
    """Fit one classifier on the Big Five answers and report how well it predicts extroversion.

    The function splits `data` into train and test sets, fits the requested
    model, plots the test-set confusion matrix, prints the test accuracy and,
    if `prediction_data` is given, prints the prediction for its first row.
    It lets every model be evaluated with a single line.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the question columns 'Q1' through 'Q54' and the
        'is_extroverted' label column.
    model : str
        One of 'knn', 'randomforest', 'boost', 'svm', 'decisiontree'.
    test_size : float, optional
        Share of rows held out for testing (passed to `train_test_split`).
    normalize_cf : bool, optional
        If True, the confusion matrix shows proportions of all test
        observations instead of raw counts.
    prediction_data : pandas.DataFrame or None, optional
        Extra observations with columns 'Q1' through 'Q54'; the prediction for
        the first row is printed.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    # Each entry builds a fresh, unfitted estimator. The randomized models are
    # seeded so that repeated runs of the notebook give the same results.
    model_factories = {
        'knn': lambda: knn(n_neighbors = 1),
        'randomforest': lambda: RandomForestClassifier(random_state = 42),
        'boost': lambda: GradientBoostingClassifier(random_state = 42),
        'svm': lambda: SVC(),
        'decisiontree': lambda: DecisionTreeClassifier(random_state = 42),
    }
    supported_models = list(model_factories)
    if model not in supported_models:
        raise ValueError('Model {} not recognized. Model must be one of the following: {}'.format(model, supported_models))

    features = data.loc[:,'Q1':'Q54']
    target = data['is_extroverted']

    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = test_size, random_state = 42)

    fitted_model = model_factories[model]().fit(x_train, y_train)
    test_predictions = fitted_model.predict(x_test)
    test_accuracy = fitted_model.score(x_test, y_test)

    # Raw counts by default; proportions of all test observations on request
    cm = confusion_matrix(
        y_test,
        test_predictions,
        labels=fitted_model.classes_,
        normalize='all' if normalize_cf else None,
    )
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=fitted_model.classes_)
    disp.plot(cmap='cividis')
    if normalize_cf:
        plt.title('Normalized Confusion Matrix for {}'.format(model))
    else:
        plt.title('Confusion Matrix of Observation Counts for {}'.format(model))
    plt.show()

    print("Accuracy Score for {}: {:.2%}".format(model, test_accuracy))

    if prediction_data is not None:
        new_features = prediction_data.loc[:,'Q1':'Q54']
        # Label 0 means not extroverted; any other label is reported as extroverted
        predicted_label = fitted_model.predict(new_features)[0]
        print('Alex Extroversion Prediction: {}'.format('Not Extroverted' if predicted_label == 0 else 'Extroverted'))
    

In [None]:
# Evaluate each model on the class data and print its prediction for Alex
models_to_compare = ['knn', 'decisiontree', 'randomforest', 'boost', 'svm']
for model_name in models_to_compare:
    evaluate_big5model(bigfive_df, model=model_name, prediction_data=alex_df)

# Decision tree and random forest happen to generate the same prediction here.
# Based on the information we have, we choose the decision tree over the random
# forest because it is simpler.