# Modeling

## Baseline Models

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC

In [6]:
def get_model_metrics(X_train, y_train, X_test, y_test, model, model_name, data_name):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for the test scores.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    model_name
        Label stored under the ``'Model'`` key of the result.
    data_name
        Label stored under the ``'Processing'`` key of the result.

    Returns
    -------
    dict
        ``'Model'`` and ``'Processing'`` followed by accuracy, precision and
        recall for the test split and then for the train split. The scores
        are computed with the scikit-learn metric functions' default settings.
    """
    def _scores(split, y_true, y_pred):
        # Accuracy / precision / recall for one split, keyed '<split> <metric>'.
        return {f'{split} Accuracy': accuracy_score(y_true, y_pred),
                f'{split} Precision': precision_score(y_true, y_pred),
                f'{split} Recall': recall_score(y_true, y_pred)}

    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_scores = _scores('Train', y_train, train_predictions)
    test_scores = _scores('Test', y_test, test_predictions)

    # Test columns are listed before train columns in the resulting record.
    return {'Model': model_name,
            'Processing': data_name,
            **test_scores,
            **train_scores}

In [None]:
# (display name, file suffix) for each processed feature set; the suffix is
# interpolated into 'data/processed/X_{train,test}_<suffix>.feather'.
datasets = [('TF-IDF', 'tf'),
            ('TF-IDF with Bigrams', 'bigram'),
            ('Document Embeddings', 'embed')]

# (display name, unfitted estimator) pairs evaluated on every feature set.
# The stochastic estimators get a fixed random_state so the baseline table is
# reproducible across kernel restarts.
# NOTE(review): MultinomialNB rejects negative feature values; if the 'embed'
# features contain negatives that combination will raise — confirm.
models = [('Logistic Regression', LogisticRegression(solver='saga', random_state=42)),
          ('Multinomial Naive Bayes', MultinomialNB()),
          ('Random Forest', RandomForestClassifier(random_state=42))]

# Filled with one metrics dict per evaluated (model, feature set) pair.
metrics = []

In [None]:
# Labels are the same for every feature set, so they are loaded once.
y_train = pd.read_feather('data/processed/y_train.feather')['voted_up'].to_numpy()
y_test = pd.read_feather('data/processed/y_test.feather')['voted_up'].to_numpy()

# Evaluate every baseline model on every processed feature set.
for data_name, file in datasets:
    X_train = pd.read_feather(f'data/processed/X_train_{file}.feather').to_numpy()
    X_test = pd.read_feather(f'data/processed/X_test_{file}.feather').to_numpy()
    for model_name, model in models:
        print(model_name, data_name)
        metrics.append(get_model_metrics(X_train, y_train, X_test, y_test, model, model_name, data_name))

# Reference row for a classifier that ignores the features entirely.
# get_model_metrics requires both a model name and a processing label, so they
# are passed explicitly. DummyClassifier does not look at the feature values,
# which is why reusing the last feature set loaded by the loop is fine.
metrics.append(get_model_metrics(X_train, y_train, X_test, y_test,
                                 DummyClassifier(), 'Dummy Classifier', 'N/A'))

In [None]:
# One row per collected metrics dict, best test accuracy first.
metrics_df = pd.DataFrame(data=metrics)
metrics_df.sort_values('Test Accuracy', ascending=False)

## Grid Search

In [2]:
from sklearn.model_selection import GridSearchCV

In [3]:
# Only the training split of the bigram TF-IDF features is loaded for the
# grid searches below.
train_labels = pd.read_feather('data/processed/y_train.feather')
y_train = train_labels['voted_up'].to_numpy()
X_train = pd.read_feather('data/processed/X_train_bigram.feather').to_numpy()

In [4]:
# Search space for logistic regression: regularisation strength and class
# weighting, with the solver held at saga.
param_grid_lr = {'C': [0.1, 1, 10],
                 'class_weight': ['balanced', None],
                 'solver': ['saga']}
# saga shuffles the data internally, so the estimator is seeded to make the
# search repeatable. No cv argument is given, so GridSearchCV uses its default.
gs_lr = GridSearchCV(estimator=LogisticRegression(random_state=42),
                     param_grid=param_grid_lr,
                     scoring='accuracy',
                     verbose=1)
gs_lr.fit(X_train, y_train)
gs_lr.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'C': 10, 'class_weight': None, 'solver': 'saga'}

In [5]:
# Search space for the random forest.
# 'sqrt' is used instead of 'auto': for classifiers 'auto' was an alias of
# 'sqrt', and the alias was deprecated in scikit-learn 1.1 and later removed.
param_grid_rf = {'n_estimators': [100, 250],
                 'max_features': ['sqrt', 150],
                 'class_weight': ['balanced', None]}
# The forest is seeded so repeated searches rank the candidates identically.
# cv=3 (rather than the default) keeps the number of slow fits down.
gs_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                     param_grid=param_grid_rf,
                     scoring='accuracy',
                     cv=3,
                     verbose=5)
gs_rf.fit(X_train, y_train)
gs_rf.best_params_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END class_weight=balanced, max_features=auto, n_estimators=100; total time= 5.8min
[CV 2/3] END class_weight=balanced, max_features=auto, n_estimators=100; total time= 5.8min
[CV 3/3] END class_weight=balanced, max_features=auto, n_estimators=100; total time= 5.6min
[CV 1/3] END class_weight=balanced, max_features=auto, n_estimators=250; total time=13.7min
[CV 2/3] END class_weight=balanced, max_features=auto, n_estimators=250; total time=13.8min
[CV 3/3] END class_weight=balanced, max_features=auto, n_estimators=250; total time=13.5min
[CV 1/3] END class_weight=balanced, max_features=150, n_estimators=100; total time= 8.2min
[CV 2/3] END class_weight=balanced, max_features=150, n_estimators=100; total time= 8.3min
[CV 3/3] END class_weight=balanced, max_features=150, n_estimators=100; total time= 8.2min
[CV 1/3] END class_weight=balanced, max_features=150, n_estimators=250; total time=20.3min
[CV 2/3] END class_weigh

{'class_weight': None, 'max_features': 150, 'n_estimators': 100}

## Final Models

In [3]:
# Final models use the bigram TF-IDF features; load both splits and labels.
train_labels = pd.read_feather('data/processed/y_train.feather')
test_labels = pd.read_feather('data/processed/y_test.feather')
y_train = train_labels['voted_up'].to_numpy()
y_test = test_labels['voted_up'].to_numpy()
X_train = pd.read_feather('data/processed/X_train_bigram.feather').to_numpy()
X_test = pd.read_feather('data/processed/X_test_bigram.feather').to_numpy()

In [7]:
# Hyper-parameters follow the grid-search results on the bigram TF-IDF
# features: C=10 for logistic regression; max_features=150 and
# n_estimators=100 for the random forest (class_weight=None is the default).
# Stochastic estimators are seeded so the final table is reproducible.
lr_final = LogisticRegression(C=10, solver='saga', random_state=42)
nb_final = MultinomialNB()
rf_final = RandomForestClassifier(n_estimators=100, max_features=150, random_state=42)

# (progress message, estimator, display name) for each final model.
final_runs = [('starting Logistic Regression model', lr_final, 'Logistic Regression'),
              ('starting Naive Bayes model', nb_final, 'Multinomial Naive Bayes'),
              ('starting Random Forest model', rf_final, 'Random Forest')]

final_metrics = []
for banner, final_model, final_name in final_runs:
    print(banner)
    final_metrics.append(get_model_metrics(X_train, y_train, X_test, y_test,
                                           final_model, final_name, 'TF-IDF with Bigrams'))
print('completed models')

# One row per final model, best test accuracy first.
final_metrics_df = pd.DataFrame(final_metrics)
final_metrics_df.sort_values(by='Test Accuracy', ascending=False)

KeyboardInterrupt: 