In [14]:
import pandas as pd

# Load the human-classified examples and derive a binary sentiment label.
labelled = pd.read_csv("datasets/human_classified_sn.csv")

# Both vote columns are shifted down by one before use.
# NOTE(review): presumably the stored counts carry a +1 offset — confirm
# against the dataset description.
pos_votes = labelled["pos"] - 1
neg_votes = labelled["neg"] - 1

# Share of positive votes. Rows where both shifted counts are 0 give NaN,
# which compares False below and therefore ends up labelled 0.
pos_share = pos_votes / (pos_votes + neg_votes)

# Build the modelling frame in one step instead of assigning into a column
# slice (avoids SettingWithCopyWarning), and binarize with a plain boolean
# cast so the cell does not rely on `np`, which this notebook never imports.
df = pd.DataFrame({
    "text": labelled["text"],
    "target": (pos_share > 0.5).astype(int),
})

In [15]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation (default split size).
# A fixed random_state makes the partition, and every score computed from it
# further down, identical on each Restart & Run All.
train, validation = train_test_split(df, random_state=42)

# Raw text is the model input; the binary `target` column is the label.
X_train, Y_train = train.text, train.target
X_validation, Y_validation = validation.text, validation.target

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Candidate text featurizers and linear classifiers to compare.
vectorizers = {"count": CountVectorizer(), "tfidf": TfidfVectorizer()}

# The stochastic solvers get a fixed random_state so the cross-validated
# scores printed below are reproducible from run to run.
models = {
    "logistic": LogisticRegression(max_iter=1000),
    "svm": LinearSVC(random_state=42),
    "sgd": SGDClassifier(random_state=42),
}

# Score every (vectorizer, model) pair with cross-validated ROC AUC on the
# training split only; the validation split is not touched here.
for vect_name, vect in vectorizers.items():
    for model_name, model in models.items():
        # The vectorizer lives inside the pipeline, so it is re-fit on the
        # training portion of each fold.
        pipe = Pipeline([('vectorizer', vect), ('model', model)])
        cv_score = cross_val_score(pipe, X_train, Y_train, scoring='roc_auc')
        print("({} + {}) score is {}".format(vect_name, model_name, cv_score.mean()))

(count + logistic) score is 0.8167823892449784
(count + svm) score is 0.7903339318123661
(count + sgd) score is 0.7949736301753678
(tfidf + logistic) score is 0.8299357598910738
(tfidf + svm) score is 0.8204873723870802
(tfidf + sgd) score is 0.8209426740584981


In [21]:
from sklearn.linear_model import LogisticRegressionCV

# Tune the regularization parameter C of the TF-IDF + logistic regression
# pipeline, using ROC AUC as the selection metric inside LogisticRegressionCV.
tuning_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegressionCV(max_iter=1000, scoring='roc_auc')),
]
cv_selection = Pipeline(tuning_steps)
cv_selection.fit(X_train, Y_train)

# Pull the fitted classifier back out of the pipeline and read the first
# entry of its selected-C array.
model = cv_selection.named_steps['model']
best_C = model.C_[0]
best_C

2.782559402207126

In [29]:
from sklearn.metrics import roc_auc_score

# Refit TF-IDF + logistic regression on the training split with the
# selected C, then report ROC AUC on the held-out validation split.
final_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression(max_iter=10000, C=best_C)),
]
pipe = Pipeline(final_steps)
pipe.fit(X_train, Y_train)

# Column 1 of predict_proba is the probability of the second class
# (label 1 for the 0/1 target).
validation_scores = pipe.predict_proba(X_validation)[:, 1]
print(roc_auc_score(Y_validation, validation_scores))

0.8307653339568233


In [32]:
from pickle import dump
# Serialize the fitted pipeline (vectorizer + classifier) to model.pkl in the
# current working directory.
with open("model.pkl", mode="wb") as file:
    dump(pipe, file)

In [34]:
# Smoke-test the fitted pipeline on one hand-written text; the displayed row
# holds the predicted probabilities for class 0 and class 1.
sample_texts = ["Hi awesome product"]
pipe.predict_proba(sample_texts)

array([[0.07768561, 0.92231439]])