In [200]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OrdinalEncoder

# Preprocessing the data and training the model

In [201]:
raw_dataset = pd.read_csv('../data/okcupid.csv') 
okcupid_profiles = raw_dataset.drop(columns="Unnamed: 0") 

As we can see from the output below, almost every column contains object types, which we can not use to fit the Random Forest. 
We need to convert the objects into numbers, and we can do that using OrdinalEncoder from sklearn.
We need to manage the missing data first tho

In [202]:
okcupid_profiles.dtypes

age              int64
status          object
sex             object
orientation     object
body_type       object
diet            object
drinks          object
drugs           object
education       object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
offspring       object
pets            object
religion        object
sign            object
smokes          object
speaks          object
dtype: object

## Filling the missing data

The columns containing missing data are the following:

In [203]:
print(okcupid_profiles.isna().sum())

age                0
status             0
sex                0
orientation        0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
pets           19921
religion       20226
sign           11056
smokes          5512
speaks            50
dtype: int64


Comparing the missing data output with the dtype output, we can easily see how, except for height, all the missing data are categorical strings. Let's handle this.

In [204]:
okcupid_profiles['height'] = okcupid_profiles['height'].fillna(value = 0) 

In [205]:
okcupid_profiles = okcupid_profiles.fillna(value = 'missing')

And now all the columns contain something

In [206]:
print(okcupid_profiles.isna().sum())

age            0
status         0
sex            0
orientation    0
body_type      0
diet           0
drinks         0
drugs          0
education      0
ethnicity      0
height         0
income         0
job            0
last_online    0
location       0
offspring      0
pets           0
religion       0
sign           0
smokes         0
speaks         0
dtype: int64


## Encoding the data

In [207]:
enc = OrdinalEncoder()
enc.fit(okcupid_profiles)

In [208]:
transformed_data = enc.transform(okcupid_profiles)

Now we have a Numpy array with the encoded data, so no more objects, but only numbers.

In [209]:
transformed_data.dtype

dtype('float64')

Let's convert the Numpy array to a Pandas dataframe

In [210]:
df_transformed = pd.DataFrame(transformed_data, 
                              columns = okcupid_profiles.columns)

In [211]:
df_transformed.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks
0,4.0,3.0,1.0,2.0,0.0,11.0,5.0,1.0,25.0,98.0,...,0.0,20.0,26756.0,175.0,2.0,14.0,3.0,16.0,2.0,1.0
1,17.0,3.0,1.0,2.0,2.0,7.0,3.0,3.0,31.0,216.0,...,7.0,8.0,28136.0,118.0,2.0,14.0,4.0,8.0,1.0,4596.0
2,20.0,0.0,1.0,2.0,11.0,0.0,5.0,0.0,12.0,189.0,...,0.0,12.0,24988.0,155.0,14.0,5.0,40.0,32.0,1.0,6334.0
3,5.0,3.0,1.0,2.0,11.0,18.0,5.0,0.0,25.0,216.0,...,1.0,19.0,26402.0,16.0,4.0,10.0,40.0,29.0,1.0,6481.0
4,11.0,3.0,1.0,2.0,1.0,3.0,5.0,1.0,9.0,22.0,...,0.0,0.0,25602.0,155.0,14.0,14.0,40.0,0.0,1.0,1.0


# Random Forest

In [212]:
y = df_transformed["income"]
X = df_transformed

# test_size = 0.3   means 70% training set | 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [213]:
# n_estimators is the number of trees in the forest
rfc = RandomForestClassifier(n_estimators = 50)
rfc.fit(X_train, y_train)
rfc_prediction = rfc.predict(X_test)

In [214]:
print("Random Forest Classification report")
print(classification_report(y_test, rfc_prediction))
print("Random Forest Confusion Matrix")
print(confusion_matrix(y_test, rfc_prediction))

Random Forest Classification report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     14535
         1.0       0.97      1.00      0.99       907
         2.0       0.90      0.98      0.94       309
         3.0       0.86      0.88      0.87       323
         4.0       0.80      0.89      0.85       283
         5.0       0.82      0.72      0.77       202
         6.0       0.96      0.78      0.86       224
         7.0       0.97      0.96      0.96       319
         8.0       0.92      0.99      0.96       454
         9.0       0.89      0.84      0.87       211
        10.0       0.74      0.47      0.57        43
        11.0       0.00      0.00      0.00        15
        12.0       0.88      0.81      0.84       159

    accuracy                           0.98     17984
   macro avg       0.82      0.79      0.81     17984
weighted avg       0.98      0.98      0.98     17984

Random Forest Confusion Matrix
[[14535     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
