age
sex
body_type (should combine some values)
height (drop too high and too low)
job
location (transform into a boolean lives_in_san_francisco)
sign (add "prefer not to say" for missing data)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OrdinalEncoder

# Preprocessing the data and training the model

In [None]:
# Load the raw OkCupid export and keep a working copy without the
# "Unnamed: 0" column.
# NOTE(review): presumably that column is an index written by an earlier export - confirm.
raw_dataset = pd.read_csv('../data/okcupid.csv')
okcupid_profiles = raw_dataset.drop(columns=["Unnamed: 0"])

As we can see from the output below, almost every column has the `object` dtype, which we cannot use to fit the Random Forest.
We need to convert the objects into numbers, and we can do that using OrdinalEncoder from sklearn.
We need to handle the missing data first, though.

In [None]:
# Display the dtype of every column of the working frame.
okcupid_profiles.dtypes

## Reworking the dataset
### Dropping columns

We noticed that for many columns the value distribution is too skewed to be usable: a single value can have around 50000 occurrences, while the other values have far too few.
We decided to solve this by introducing some changes to the dataset.

We are going to drop the following columns:

In [None]:
# Print every column name of the working frame, one per line.
for column_name in okcupid_profiles.columns.tolist():
    print(column_name)

In [None]:
# Columns removed from the working frame before any further processing.
# NOTE(review): 'education' and 'income' are removed here, yet the later
# "Education prediction model" section refers to both - confirm this is intended.
columns_to_drop = [
    'status',
    'orientation',
    'diet',
    'drinks',
    'drugs',
    'education',
    'ethnicity',
    'income',
    'last_online',
    'offspring',
    'pets',
    'smokes',
    'speaks',
]
okcupid_profiles = okcupid_profiles.drop(columns=columns_to_drop)

Now we are left with:

In [None]:
# Print the names of the columns that are still present, one per line.
for column_name in okcupid_profiles.columns.tolist():
    print(column_name)

### Handling 'religion'

In [None]:
# Number of profiles whose 'religion' value is missing.
okcupid_profiles["religion"].isna().sum()

In [None]:
# Display how often each distinct 'religion' value occurs.
okcupid_profiles['religion'].value_counts()

As we can see from the value counts, the data is too sparse: each religion is split across several "seriousness" variants. We are going to merge all the variants of each religion into a single label.

In [None]:
# Collapse every "christianity ..." variant of the religion column into the
# single label "christian".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('christianity' + qualifier, 'christian', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "agnosticism ..." variant of the religion column into the
# single label "agnostic".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('agnosticism' + qualifier, 'agnostic', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "atheism ..." variant of the religion column into the
# single label "atheist".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('atheism' + qualifier, 'atheist', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "catholicism ..." variant of the religion column into the
# single label "catholic".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('catholicism' + qualifier, 'catholic', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "judaism ..." variant of the religion column into the
# single label "jewish".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('judaism' + qualifier, 'jewish', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "buddhism ..." variant of the religion column into the
# single label "buddhist".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('buddhism' + qualifier, 'buddhist', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "hinduism ..." variant of the religion column into the
# single label "hindu".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('hinduism' + qualifier, 'hindu', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Collapse every "islam ..." variant of the religion column into the
# single label "muslim".
# The qualified phrases are replaced before the bare word so the qualifier
# text is removed together with it; regex=False forces literal matching
# regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
    '',
):
    religion = religion.str.replace('islam' + qualifier, 'muslim', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Strip the seriousness qualifier from every "other ..." variant of the
# religion column so that only the plain label "other" remains.
# The bare word already is the target label, so it needs no replacement;
# regex=False forces literal matching regardless of the pandas version's default.
religion = okcupid_profiles['religion']
for qualifier in (
    ' but not too serious about it',
    ' and very serious about it',
    ' and somewhat serious about it',
    ' and laughing about it',
):
    religion = religion.str.replace('other' + qualifier, 'other', regex=False)
okcupid_profiles['religion'] = religion

In [None]:
# Summarise the religion column: frequency of each label, then the
# number of missing entries.
religion_column = okcupid_profiles['religion']
print(religion_column.value_counts())
print("Missing values: ", religion_column.isna().sum())

At this point, we noticed that we have too many missing values, since we are also considering the 'other' as basically missing. So we are just going to have to convert the religion attribute to a boolean.

In [None]:
# Turn the religion column into a boolean: a missing value or the label
# 'other' becomes False, every remaining label becomes True.
# A direct comparison yields a genuine bool column in one step; the previous
# fillna/replace chain mixed booleans and strings in an object column and
# left any label it did not list explicitly as a string.
religion = okcupid_profiles['religion']
okcupid_profiles['religion'] = religion.notna() & (religion != 'other')


In [None]:
# The column now holds a flag rather than a label, so name it accordingly.
okcupid_profiles = okcupid_profiles.rename(columns={'religion': 'religious'})

In [None]:
# Display how many profiles fall under each value of the 'religious' column.
okcupid_profiles['religious'].value_counts()

### Handling 'sign'

### Handling 'location'  

In [None]:
# Display how often each distinct 'location' string occurs.
okcupid_profiles['location'].value_counts()

In [None]:
# Flag the profiles located in San Francisco. A missing location compares
# unequal to the city string, so it becomes False without a separate fillna.
okcupid_profiles['location'] = okcupid_profiles['location'] == 'san francisco, california'
# Rename the column that was just converted. The key must be 'location':
# renaming 'religion' matches nothing, so 'lives_in_san_francisco' would
# never be created.
okcupid_profiles.rename(columns={'location': 'lives_in_san_francisco'}, inplace=True)

In [None]:
# Display the distribution of the 'lives_in_san_francisco' column.
okcupid_profiles['lives_in_san_francisco'].value_counts()

Now the location information is stored as a boolean, just like the religious column.

## Filling the missing data

The columns containing missing data are the following:

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = okcupid_profiles.isna().sum()
print(missing_per_column)

Comparing the missing data output with the dtype output, we can easily see how, except for height, all the missing data are categorical strings.

Since there are only three rows with missing values for height, instead of replacing the NaN with something like 0 or -1, or the average height, we think it's better to just drop them, since it is such a small number

In [None]:
# Discard the rows that have no height value.
okcupid_profiles = okcupid_profiles.dropna(subset=['height'])

For all the other attributes, we will simply replace the missing values with the string 'missing'.

In [None]:
# Replace every remaining missing entry with the placeholder string 'missing'.
okcupid_profiles = okcupid_profiles.fillna('missing')

And now all the columns contain something

In [None]:
# Re-count the missing entries per column after filling and print the totals.
missing_per_column = okcupid_profiles.isna().sum()
print(missing_per_column)

## Encoding the data

In [None]:
# Fit an OrdinalEncoder on the whole working frame so that the values of
# each column can be converted to numbers.
enc = OrdinalEncoder()
enc.fit(okcupid_profiles)

In [None]:
# Apply the fitted encoder to the same frame it was fitted on.
encoded_data = enc.transform(okcupid_profiles)

Now we have a NumPy array with the encoded data, so there are no more objects, only numbers.

In [None]:
# Display the dtype of the encoded array.
encoded_data.dtype

# Random Forest

## Education prediction model

In [None]:
# remember that now we have a Numpy array

# Target: column 6 of the encoded array.
# NOTE(review): 'education' is dropped from okcupid_profiles earlier in this
# notebook, so index 6 cannot be the education column - confirm which column
# this really selects and whether it is the intended target.
y = encoded_data[:,6]

# Features: columns 1-5, 7, 8, 15 and 16 of the encoded array.
# NOTE(review): 'income' is also dropped earlier, so these indices cannot be
# "income and job" as originally commented - confirm the intended feature set.
X = encoded_data[:, [1,2,3,4,5,7,8,15,16]]
# X = np.delete(encoded_data, 11, axis = 1) # This should remove the income colum

# test_size = 0.3   means 70% training set | 30% test set
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [None]:
# n_estimators is the number of trees in the forest.
# random_state is pinned to the same seed used for train_test_split so that
# repeated runs build the same forest and report the same metrics.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
rfc.fit(X_train, y_train)
rfc_prediction = rfc.predict(X_test)

In [None]:
# Build both evaluation summaries for the test-set predictions, then print
# each one under its heading.
report_text = classification_report(y_test, rfc_prediction)
confusion_counts = confusion_matrix(y_test, rfc_prediction)

print("Random Forest Classification report")
print(report_text)
print("Random Forest Confusion Matrix")
print(confusion_counts)

Let's now plot a proper confusion matrix

In [None]:
# Classes that occur in the true or the predicted labels, in sorted order.
# Passing them explicitly keeps the matrix rows/columns and the tick labels
# aligned with each other.
class_names = np.unique(np.concatenate((y_test, rfc_prediction)))
counts = confusion_matrix(y_test, rfc_prediction, labels=class_names).astype('float')

# Turn each row into fractions of that true class. A class that was only ever
# predicted has a row total of 0; it is left at 0 instead of dividing by zero
# (which would fill the row with NaN).
row_totals = counts.sum(axis=1, keepdims=True)
matrix = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

# Build the plot
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.4)
sns.heatmap(matrix,
            annot=True,
            annot_kws={'size': 5},
            cmap=plt.cm.Blues,
            linewidths=0.2)

# Label both axes with the encoded class values. Heatmap cell i spans
# [i, i + 1], so the ticks go at i + 0.5 to sit in the middle of each cell.
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5

plt.xticks(tick_marks2, class_names, rotation=90)
plt.yticks(tick_marks2, class_names, rotation=0)

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.title('Confusion Matrix for Education')
plt.show()

In [None]:
# NOTE(review): `df` is not defined in any cell visible here and no 'code'
# column appears elsewhere in the visible notebook; this looks like a leftover
# scratch cell - confirm whether it can be removed.
# Replaces the string '1.0' in df['code'] with 'red'.
df['code'] = df['code'].replace(['1.0'],'red')