In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import math

In [None]:
# Read ./data/all_clean.csv into a dataframe.
df_all = pd.read_csv('./data/all_clean.csv')

In [None]:
# Feature text (X) and subreddit label (y) taken from the loaded dataframe.
X, y = df_all['full_text'], df_all['subreddit']



# Calculated the count and percentage breakdown of subreddits (y).
# 0 = jokes
# 1 = dad jokes

In [None]:
# Fraction of rows carrying each subreddit label.
y.value_counts().div(len(y))

In [None]:
# Raw row count for each subreddit label.
y.value_counts()

# The dataset is unbalanced, so there will be a bias toward predicting 0 (jokes).


# Logistic Regression Model

In [None]:
# Start from NLTK's English stop words and take the contractions below OUT of
# the stop-word set, so the vectorizer keeps them as features.
# NOTE(review): "has'nt" looks like a misspelling of "hasn't" (already listed);
# presumably it matches nothing in the NLTK list - confirm.
kept_contractions = {
    "can't", "don't", "i'm", "let's", "you're", "i'll",
    "we'll", "wouldn't", "who's", "should've", "could've", "isn't", "hasn't",
    "aren't", "haven't", "has'nt", "o'clock", "ma'am", "mustn't",
    "how'd", "how's", "didn't", "you'll", "she'll", "he'll", "it'll", "they'll",
}
new_stopwords = set(stopwords.words('english')).difference(kept_contractions)

In [None]:
def my_tokenizer(s):
    """Split a document into tokens on runs of whitespace.

    Punctuation stays attached to the neighbouring word, so contractions such
    as "don't" survive as single tokens.
    """
    tokens = s.split()
    return tokens

In [None]:
# Bag-of-words vectorizer: whitespace tokenizer, the custom stop-word list, and
# a vocabulary capped at 2000 terms.
cvec = CountVectorizer(analyzer="word",
                       tokenizer=my_tokenizer,
                       # token_pattern is ignored when a tokenizer is supplied;
                       # None avoids scikit-learn's "will not be used" warning.
                       token_pattern=None,
                       preprocessor=None,
                       # Recent scikit-learn validates stop_words as a list
                       # (a set is rejected), so convert explicitly.
                       stop_words=sorted(new_stopwords),
                       max_features=2000)

In [None]:
# Hold out a test set with a fixed seed; 0.25 is scikit-learn's default test
# fraction, spelled out here for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Learn the vocabulary on the training text only, then encode both splits.
X_train_features = cvec.fit_transform(X_train)
X_train_array = X_train_features.toarray()

X_test_features = cvec.transform(X_test)
X_test_array = X_test_features.toarray()  # dense copy of the test matrix

In [None]:
# Fraction of test rows carrying each subreddit label.
y_test.value_counts().div(len(y_test))

In [None]:
# Raw test-row count for each subreddit label.
y_test.value_counts()

In [None]:
# Instantiate the logistic regression model with a fixed random_state.
lr = LogisticRegression(random_state=42)

In [None]:
# Fit the model to the vectorized training data.
lr.fit(X=X_train_features, y=y_train)

In [None]:
# Mean accuracy on the training split.
lr.score(X=X_train_features, y=y_train)

In [None]:
# Mean accuracy on the held-out test split.
lr.score(X=X_test_features, y=y_test)

In [None]:
# Predicted labels for the test split, reused by the evaluation cells below.
pred = lr.predict(X=X_test_features)

In [None]:
# Accuracy of the stored predictions against the test labels.
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("Accuracy:", test_accuracy)

# Confusion Matrix

In [None]:
# Based partially on code idea via Google
# Build the confusion matrix once and unpack its four cells from that result
# instead of computing it a second time.
# For binary labels, ravel() yields the cells in the order tn, fp, fn, tp.
cm = confusion_matrix(y_test, pred)
tn, fp, fn, tp = cm.ravel()

In [None]:
# Report each confusion-matrix cell next to its label.
for cell_label, cell_count in (('True Negatives: ', tn),
                               ('False Positives: ', fp),
                               ('False Negatives: ', fn),
                               ('True Positives: ', tp)):
    print(cell_label, cell_count)

In [None]:
# Classification metrics as percentages, rounded to two decimals.
total = tp + fn + tn + fp
accuracy = round(((tp + tn) / total) * 100, 2)
misclassification = round((1 - (accuracy / 100)) * 100, 2)  # derived from the rounded accuracy
sensitivy = round((tp / (tp + fn)) * 100, 2)    # share of actual positives predicted positive
specificity = round((tn / (tn + fp)) * 100, 2)  # share of actual negatives predicted negative
precision = round((tp / (tp + fp)) * 100, 2)    # share of positive predictions that are correct

for rate_label, rate_value in (('Accuracy Rate:', accuracy),
                               ('Misclassification Rate:', misclassification),
                               ('Sensitivy Rate:', sensitivy),
                               ('Specificity Rate:', specificity),
                               ('Precision Rate:', precision)):
    print(rate_label, rate_value, '%')

In [None]:
# Label the confusion matrix: rows are actual classes, columns are predictions.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual negative', 'actual positive'],
    columns=['pred negative', 'pred positive'],
)
cm_df

# The model is biased toward predicting negatives (0), which are Jokes.

# Randomized the predictions.  The original data is split 60/40 so I reversed it.

In [None]:
# Random baseline: draw a 0/1 label for every test prediction with
# probabilities 0.4 / 0.6.
# A seeded generator keeps the baseline identical across kernel restarts, and
# one vectorized draw replaces a separate np.random.choice call per row.
baseline_rng = np.random.RandomState(42)
simulated_preds = baseline_rng.choice([0, 1], size=len(pred), p=[0.4, 0.6])

In [None]:
# Accuracy of the random baseline against the test labels.
baseline_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=simulated_preds)
print("Accuracy:", baseline_accuracy)

# The random model is less accurate than the model I created.

In [None]:
# Confusion matrix for the random baseline.
# Build it once and unpack the four cells from that result instead of
# computing it a second time; ravel() yields tn, fp, fn, tp for binary labels.
cm = confusion_matrix(y_test, simulated_preds)
tn, fp, fn, tp = cm.ravel()

In [None]:
# Report each confusion-matrix cell next to its label.
for cell_label, cell_count in (('True Negatives: ', tn),
                               ('False Positives: ', fp),
                               ('False Negatives: ', fn),
                               ('True Positives: ', tp)):
    print(cell_label, cell_count)

In [None]:
# Classification metrics as percentages, rounded to two decimals.
total = tp + fn + tn + fp
accuracy = round(((tp + tn) / total) * 100, 2)
misclassification = round((1 - (accuracy / 100)) * 100, 2)  # derived from the rounded accuracy
sensitivy = round((tp / (tp + fn)) * 100, 2)    # share of actual positives predicted positive
specificity = round((tn / (tn + fp)) * 100, 2)  # share of actual negatives predicted negative
precision = round((tp / (tp + fp)) * 100, 2)    # share of positive predictions that are correct

for rate_label, rate_value in (('Accuracy Rate:', accuracy),
                               ('Misclassification Rate:', misclassification),
                               ('Sensitivy Rate:', sensitivy),
                               ('Specificity Rate:', specificity),
                               ('Precision Rate:', precision)):
    print(rate_label, rate_value, '%')

In [None]:
# Label the confusion matrix: rows are actual classes, columns are predictions.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual negative', 'actual positive'],
    columns=['pred negative', 'pred positive'],
)
cm_df

# Grid Search Model

In [None]:
# Pipeline: a default CountVectorizer feeding a default LogisticRegression.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression()),
])

In [None]:
# Grid over the vectorizer settings: 6 * 5 * 3 * 3 = 270 combinations,
# each cross-validated on 5 folds.
pipe_params = {
    'cvec__max_features': list(range(3000, 9000, 1000)),  # 3000 .. 8000
    'cvec__min_df': list(range(1, 6)),                    # 1 .. 5
    'cvec__max_df': [0.5, 0.6, 0.7],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
}
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5, n_jobs=5, verbose=2)
gs.fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that achieved it.
print(gs.best_score_)
gs.best_params_

In [None]:
# Score of the best grid-search estimator on the training split.
gs.score(X=X_train, y=y_train)

In [None]:
# Score of the best grid-search estimator on the held-out test split.
gs.score(X=X_test, y=y_test)

In [None]:
# Test-split predictions from the grid search (it takes the raw text directly).
pred = gs.predict(X=X_test)

In [None]:
# Accuracy of the stored predictions against the test labels.
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("Accuracy:", test_accuracy)

In [None]:
# Confusion matrix for the current predictions.
# Build it once and unpack the four cells from that result instead of
# computing it a second time; ravel() yields tn, fp, fn, tp for binary labels.
cm = confusion_matrix(y_test, pred)
tn, fp, fn, tp = cm.ravel()

In [None]:
# Report each confusion-matrix cell next to its label.
for cell_label, cell_count in (('True Negatives: ', tn),
                               ('False Positives: ', fp),
                               ('False Negatives: ', fn),
                               ('True Positives: ', tp)):
    print(cell_label, cell_count)

In [None]:
# Classification metrics as percentages, rounded to two decimals.
total = tp + fn + tn + fp
accuracy = round(((tp + tn) / total) * 100, 2)
misclassification = round((1 - (accuracy / 100)) * 100, 2)  # derived from the rounded accuracy
sensitivy = round((tp / (tp + fn)) * 100, 2)    # share of actual positives predicted positive
specificity = round((tn / (tn + fp)) * 100, 2)  # share of actual negatives predicted negative
precision = round((tp / (tp + fp)) * 100, 2)    # share of positive predictions that are correct

for rate_label, rate_value in (('Accuracy Rate:', accuracy),
                               ('Misclassification Rate:', misclassification),
                               ('Sensitivy Rate:', sensitivy),
                               ('Specificity Rate:', specificity),
                               ('Precision Rate:', precision)):
    print(rate_label, rate_value, '%')

In [None]:
# Label the confusion matrix: rows are actual classes, columns are predictions.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual negative', 'actual positive'],
    columns=['pred negative', 'pred positive'],
)
cm_df

# Naive Bayes model

In [None]:
#Based on code idea I googled.

In [None]:
# Naive Bayes pipeline; it reuses the existing `cvec` vectorizer instance as
# its first step and rebinds the name `pipe`.
mnb = MultinomialNB()
nb_steps = [('cvec', cvec), ('mnb', mnb)]
pipe = Pipeline(steps=nb_steps)

In [None]:
# Fit the pipeline on the raw training text, then report accuracy per split.
pipe.fit(X_train, y_train)
for split_name, split_X, split_y in (('train', X_train, y_train),
                                     ('test', X_test, y_test)):
    print(split_name + ' score:', pipe.score(split_X, split_y))

In [None]:
# Predict through the fitted pipeline so the test text is vectorized by the
# same CountVectorizer the classifier was trained with, rather than depending
# on a dense matrix built in an earlier cell staying in sync with it.
pred = pipe.predict(X_test)

In [None]:
# Accuracy of the stored predictions against the test labels.
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("Accuracy:", test_accuracy)

In [None]:
# Confusion matrix for the current predictions.
# Build it once and unpack the four cells from that result instead of
# computing it a second time; ravel() yields tn, fp, fn, tp for binary labels.
cm = confusion_matrix(y_test, pred)
tn, fp, fn, tp = cm.ravel()

In [None]:
# Report each confusion-matrix cell next to its label.
for cell_label, cell_count in (('True Negatives: ', tn),
                               ('False Positives: ', fp),
                               ('False Negatives: ', fn),
                               ('True Positives: ', tp)):
    print(cell_label, cell_count)

In [None]:
# Classification metrics as percentages, rounded to two decimals.
total = tp + fn + tn + fp
accuracy = round(((tp + tn) / total) * 100, 2)
misclassification = round((1 - (accuracy / 100)) * 100, 2)  # derived from the rounded accuracy
sensitivy = round((tp / (tp + fn)) * 100, 2)    # share of actual positives predicted positive
specificity = round((tn / (tn + fp)) * 100, 2)  # share of actual negatives predicted negative
precision = round((tp / (tp + fp)) * 100, 2)    # share of positive predictions that are correct

for rate_label, rate_value in (('Accuracy Rate:', accuracy),
                               ('Misclassification Rate:', misclassification),
                               ('Sensitivy Rate:', sensitivy),
                               ('Specificity Rate:', specificity),
                               ('Precision Rate:', precision)):
    print(rate_label, rate_value, '%')

In [None]:
# Label the confusion matrix: rows are actual classes, columns are predictions.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual negative', 'actual positive'],
    columns=['pred negative', 'pred positive'],
)
cm_df

# Chart of the top ten words in each subreddit.

In [None]:
# Based on code idea from Google
# Word-count frame for the training posts, indexed like y_train so rows can be
# selected by label. Without index=..., the frame gets a 0..n-1 RangeIndex and
# reindexing it with y_train's original dataframe labels picks unrelated rows
# (or all-NaN rows for labels beyond the training size).
# get_feature_names() was removed in newer scikit-learn; prefer the
# replacement when it exists and fall back otherwise.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())
train_word_counts = pd.DataFrame(X_train_features.toarray(),
                                 index=y_train.index,
                                 columns=feature_names)
dad_words = train_word_counts.loc[y_train == 1]  # label 1 rows only
# Ten words with the highest mean count per post.
top_dad_word = dad_words.mean().sort_values(ascending=False).iloc[:10]

In [None]:
# rc settings only apply to axes created after they are set, so the tick label
# sizes are configured before the figure is built (set afterwards, they would
# only affect later figures).
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)

# Sort ascending so the most frequent word is drawn at the top of the chart.
top_dad_word = top_dad_word.sort_values(ascending=True)
plt.figure(figsize=(15,10))
plt.barh(top_dad_word.index, top_dad_word.values, height=.5,
         color=['red', 'green', 'navy', 'orange'])
plt.xlabel('Mean count per post', size=18)
plt.title('Dad Word Frequency', size=18);

In [None]:
# Word-count frame for the training posts, indexed like y_train so rows can be
# selected by label. Without index=..., the frame gets a 0..n-1 RangeIndex and
# reindexing it with y_train's original dataframe labels picks unrelated rows
# (or all-NaN rows for labels beyond the training size).
# get_feature_names() was removed in newer scikit-learn; prefer the
# replacement when it exists and fall back otherwise.
feature_names = (cvec.get_feature_names_out()
                 if hasattr(cvec, 'get_feature_names_out')
                 else cvec.get_feature_names())
train_word_counts = pd.DataFrame(X_train_features.toarray(),
                                 index=y_train.index,
                                 columns=feature_names)
joke_words = train_word_counts.loc[y_train == 0]  # label 0 rows only
# Ten words with the highest mean count per post.
top_joke_word = joke_words.mean().sort_values(ascending=False).iloc[:10]

In [None]:
# rc settings only apply to axes created after they are set, so the tick label
# sizes are configured before the figure is built (set afterwards, they would
# only affect later figures).
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)

# Sort ascending so the most frequent word is drawn at the top of the chart.
top_joke_word = top_joke_word.sort_values(ascending=True)
plt.figure(figsize=(15,10))
plt.barh(top_joke_word.index, top_joke_word.values, height=.5,
         color=['red', 'green', 'navy', 'orange'])
plt.xlabel('Mean count per post', size=18)
plt.title('Joke Word Frequency', size=18);

# There is significant overlap of the top ten words between the subreddits.