In [25]:
import joblib
import pandas as pd
import sklearn
from sklearn import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from utils import get_features_from_df, cast_list_as_strings

In [22]:
# Training split: the data used to fit AND validate the solution.
train_df = pd.read_csv("./data/quora_train_data.csv")

# Test split: reserved for reporting the expected generalization results.
test_df = pd.read_csv("./data/quora_test_data.csv")

# Extract both question columns from each split and pass them through
# cast_list_as_strings (project helper from utils).
q1_train, q2_train = (
    cast_list_as_strings(list(train_df[column]))
    for column in ("question1", "question2")
)
q1_test, q2_test = (
    cast_list_as_strings(list(test_df[column]))
    for column in ("question1", "question2")
)

# Only the training questions (both columns, combined with +) feed the
# vocabulary; the test questions are not part of the fit.
all_questions = q1_train + q2_train

# Unigram-only count vectorizer, fitted on the combined training questions.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 1)
)
count_vectorizer.fit(all_questions)

In [None]:
# Build the feature representation of each split with the project helper
# get_features_from_df, reusing the vectorizer fitted on the training questions.
X_tr_q1q2 = get_features_from_df(train_df, count_vectorizer)
X_te_q1q2 = get_features_from_df(test_df, count_vectorizer)

# Ground-truth labels of the training split as a NumPy array.
y_train = train_df["is_duplicate"].to_numpy()

In [None]:
# Restore the previously saved perceptron model from disk.
# NOTE(review): joblib.load relies on pickle, so only load model files
# that come from a trusted source.
perceptron = joblib.load("perceptron_model.joblib", mmap_mode=None)

# Hard class predictions on the training features.
y_train_pred = perceptron.predict(X_tr_q1q2)

# Threshold-based metrics compare the labels against the hard predictions.
accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
precision = precision_score(y_true=y_train, y_pred=y_train_pred)
recall = recall_score(y_true=y_train, y_pred=y_train_pred)
f1 = f1_score(y_true=y_train, y_pred=y_train_pred)

# ROC AUC is computed from the continuous decision_function scores
# rather than from the hard predictions.
roc_auc = roc_auc_score(
    y_true=y_train,
    y_score=perceptron.decision_function(X_tr_q1q2),
)

# One line per metric, each rounded to four decimals.
print(
    "Perceptron Metrics",
    "Train Accuracy: {:.4f}".format(accuracy),
    "Train Precision: {:.4f}".format(precision),
    "Train Recall: {:.4f}".format(recall),
    "Train F1-score: {:.4f}".format(f1),
    "Train ROC AUC: {:.4f}".format(roc_auc),
    sep="\n",
)

In [None]:
# Restore the previously saved logistic regression model from disk.
# NOTE(review): joblib.load relies on pickle, so only load model files
# that come from a trusted source.
logistic = joblib.load("logistic_model.joblib", mmap_mode=None)

# Hard class predictions on the training features.
y_train_pred = logistic.predict(X_tr_q1q2)

# Threshold-based metrics compare the labels against the hard predictions.
accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
precision = precision_score(y_true=y_train, y_pred=y_train_pred)
recall = recall_score(y_true=y_train, y_pred=y_train_pred)
f1 = f1_score(y_true=y_train, y_pred=y_train_pred)

# ROC AUC is scored on column 1 of predict_proba.
# NOTE(review): presumably the probability of the duplicate class, which
# assumes logistic.classes_ is ordered [0, 1] - confirm.
roc_auc = roc_auc_score(
    y_true=y_train,
    y_score=logistic.predict_proba(X_tr_q1q2)[:, 1],
)

# One line per metric, each rounded to four decimals.
print(
    "Logistic Regression Metrics",
    "Train Accuracy: {:.4f}".format(accuracy),
    "Train Precision: {:.4f}".format(precision),
    "Train Recall: {:.4f}".format(recall),
    "Train F1-score: {:.4f}".format(f1),
    "Train ROC AUC: {:.4f}".format(roc_auc),
    sep="\n",
)