# Modelling the Data

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=100
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, scorer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [18]:
np.random.seed(42)

In [19]:
df = pd.read_csv("./data/clean_eda_model_data"])

---

# Basic Logistic Regression Model

In [46]:
# Define features and TTS
X = df['post_title']
y = df['subred_binom']

# Train, Test, Split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=42)

# Set up a piple line for our models
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

- Remember our baseline rate is 56.51% for the model (1 = The_Donald)

In [47]:
# Gauging an un-tuned Logarithmic Regression model
print(f'cross_val_score = {cross_val_score(pipe, X_train, y_train, cv=3).mean()}\n')

pipe.fit(X_train, y_train)

print(f'''Training score = {pipe.score(X_train, y_train)}'

Test score = {pipe.score(X_test, y_test)}

Log Model overfit by {(pipe.score(X_test, y_test) - pipe.score(X_train, y_train))}''')



cross_val_score = 0.7870370370370371

Training score = 0.9915123456790124'

Test score = 0.8106235565819861

Log Model overfit by -0.18088878909702621


- A un-tuned Logistic Regression model is overfit by 0.1819 (train (0.9915123456790124) - test (0.8106235565819861))

---

## Add Gridsearch to a LogisticRegression Model

In [72]:
# Define features and TTS
X = df['post_title']
y = df['subred_binom']

# Train, Test, Split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=42)

# Set up a piple line for our models
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(random_state= 42))
])

pipe_params = {
    'cvec__stop_words': [None],
    'cvec__max_features': [1729],
    'cvec__min_df': [1, 2],
    'cvec__max_df': [.9, .8, .95, .99],
    'cvec__ngram_range': [(1,1)],
    'lr__solver': ['sag'],
    'lr__max_iter': [500]
                   }

gscv_mod = GridSearchCV(pipe, param_grid = pipe_params, cv=3)
gscv_mod.fit(X_train, y_train)
print(gscv_mod.best_score_)
gscv_mod.best_params_

0.7839506172839507


{'cvec__max_df': 0.9,
 'cvec__max_features': 1729,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None,
 'lr__max_iter': 500,
 'lr__solver': 'sag'}

In [73]:
print(f'''training score = {gscv_mod.score(X_train, y_train)}

test score = {gscv_mod.score(X_test, y_test)} 

Log Model overfit by {(gscv_mod.score(X_test, y_test) - gscv_mod.score(X_train, y_train))}''')

training score = 0.970679012345679

test score = 0.8175519630484989 

Log Model overfit by -0.15312704929718013


Through Pipline & GridSearchCV tuning, the best training score that I was able to obtain was 0.78395. I found the best parameters for "cvec" and "lr" to be:
- 'cvec__max_df': 0.9
- 'cvec__max_features': 1729
- 'cvec__min_df': 1
- 'cvec__ngram_range': (1, 1
- 'cvec__stop_words': None
- 'lr__max_iter': 500
- 'lr__solver': 'sag'

In [74]:
predictions = gscv_mod.predict(X_test)

In [75]:
confusion_matrix(y_test, predictions)

array([[148,  52],
       [ 27, 206]], dtype=int64)

### Create a function to calculate and return our classification metrics for confusion matrices

In [76]:
def class_metrics(tp, tn, fn, fp):
    accuracy = (tp + tn) / (tp + tn + fn +fp)
    missclass = (fn + fp) / (tp + tn + fn + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    
    print(f'Accuracy = {accuracy}\nMissclassification = {missclass}\nSensitivity = {sensitivity}\nSpecificity = {specificity}\nPrecision = {precision}')
    
    return accuracy, missclass, sensitivity, specificity, precision

In [77]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [79]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 148
False Positives: 52
False Negatives: 27
True Positives: 206


In [80]:
class_metrics(tp, tn, fn, fp)

Accuracy = 0.8175519630484989
Missclassification = 0.18244803695150116
Sensitivity = 0.8841201716738197
Specificity = 0.74
Precision = 0.7984496124031008


(0.8175519630484989,
 0.18244803695150116,
 0.8841201716738197,
 0.74,
 0.7984496124031008)

-------

# Naive Bayes Model

In [55]:
# Define features and TTS
X = df['post_title']
y = df['subred_binom']

# Train, Test, Split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify=y) 

# Set up a piple line for our models
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

pipe_params = {
    'cvec__stop_words': ['english'],
    'cvec__max_features': [1729],
    'cvec__min_df': [1],
    'cvec__max_df': [.25],
    'cvec__ngram_range': [(1,1)]
                   }

gscv_mod = GridSearchCV(pipe, param_grid = pipe_params, cv=3)
gscv_mod.fit(X_train, y_train)
print(gscv_mod.best_score_)
gscv_mod.best_params_

0.7878086419753086


{'cvec__max_df': 0.25,
 'cvec__max_features': 1729,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [56]:
gscv_mod.score(X_train, y_train)

0.9243827160493827

In [57]:
gscv_mod.score(X_test, y_test)

0.792147806004619

In [58]:
print(f'''training score = {gscv_mod.score(X_train, y_train)}

test score = {gscv_mod.score(X_test, y_test)} 

Log Model overfit by {(gscv_mod.score(X_test, y_test) - gscv_mod.score(X_train, y_train))}''')

training score = 0.9243827160493827

test score = 0.792147806004619 

Log Model overfit by -0.13223491004476373


In [59]:
predictions = gscv_mod.predict(X_test)

In [60]:
confusion_matrix(y_test, predictions)

array([[148,  40],
       [ 50, 195]], dtype=int64)

In [61]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [62]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 148
False Positives: 40
False Negatives: 50
True Positives: 195


In [71]:
class_metrics(tp, tn, fn, fp)

Accuracy = 0.792147806004619
Missclassification = 0.20785219399538107
Sensitivity = 0.7959183673469388
Specificity = 0.7872340425531915
Precision = 0.8297872340425532


(0.792147806004619,
 0.20785219399538107,
 0.7959183673469388,
 0.7872340425531915,
 0.8297872340425532)