In [26]:
import pandas as pd
from classifiers.random_forest import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset from disk
df = pd.read_csv('dataframe.csv', encoding='utf-8')

# Separate the target column (y) from the remaining columns (x)
y = df['common_answer']
x = df.drop(labels='common_answer', axis='columns')

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Instantiate the classifier and create the model from the training split
classifier = RandomForest()
classifier.create_model(x_train, y_train)

In [27]:
# Evaluate the model on the held-out test set

# Predict the labels for the test rows
pred = classifier.predict(x_test)

# Print the per-class metrics first, then the confusion matrix
# (in scikit-learn's confusion matrix, rows are true labels and columns
# are predicted labels)
for report_fn in (classification_report, confusion_matrix):
    print(report_fn(y_test, pred))

             precision    recall  f1-score   support

          0       0.75      0.57      0.65        87
          1       0.87      0.93      0.90       255

avg / total       0.84      0.84      0.83       342

[[ 50  37]
 [ 17 238]]


In [3]:
# Save the model to disk. The path is passed WITHOUT a file extension;
# presumably save_model_to_file appends it itself -- confirm in
# classifiers.random_forest.
classifier.save_model_to_file('models/random_forest')

In [4]:
# Compare the predictions with the ground truth, row by row, to understand
# where the model goes wrong.
#
# Predictions are added for every row, then the held-out labels are attached.
# Assigning the y_test Series aligns on the index, so every row that is not
# in the test set (i.e. the training rows) gets NaN in the 'y_test' column.
# Note: both assignments add a column to df in place.
df['pred'] = classifier.predict(x)
df['y_test'] = y_test

# Show only the test-set rows. Restricting dropna to 'y_test' guarantees a
# row is discarded only because it has no held-out label, never because some
# other column happens to contain a NaN. The result is displayed, not stored.
df.dropna(subset=['y_test'])

Unnamed: 0,common_answer,first_name_equality,organization_similarity,email_equality,year_difference,last_name_length,initials_equality,pred,y_test
1,1,1.000000,0.000000,0,3,5,1,1,1.0
2,1,1.000000,0.000000,0,12,7,1,1,1.0
5,1,1.000000,0.250000,0,8,8,1,1,1.0
8,0,0.100000,0.666667,0,12,5,0,0,0.0
14,1,1.000000,0.000000,0,2,6,1,1,1.0
15,1,1.000000,0.000000,0,11,15,1,1,1.0
18,1,1.000000,0.000000,0,9,4,1,1,1.0
27,1,1.000000,0.000000,0,7,7,1,1,1.0
30,1,1.000000,0.692308,0,5,9,1,1,1.0
31,0,0.142857,0.000000,0,23,8,1,1,0.0


In [5]:

# List every email_equality value that is greater than or equal to 1
df.loc[df['email_equality'] >= 1, 'email_equality'].tolist()

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]