# Cleaning the Data and Fitting Models

First, bring in all the needed libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import regex as re
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

%matplotlib inline

A lot of future warnings would otherwise pop up in this notebook, so I am essentially turning them off

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Read in the populated science and comedy data sets

In [4]:
science = pd.read_csv('./data/sciencedata.csv')
comedy = pd.read_csv('./data/comedydata.csv')

Join the two dataframes together

In [5]:
data = pd.concat([science, comedy], axis=0, ignore_index=True)

In [6]:
display(data.shape)
display(data.head())

(2868, 2)

Unnamed: 0,posts,science
0,Fentanyl Surpasses Heroin As Drug Most Often I...,1
1,"In Seattle, Washington, delaying the start tim...",1
2,College textbooks aimed at introductory biolog...,1
3,Scary warming at poles showing up at weird tim...,1
4,An invasive species of tick - the first of its...,1


# Write a function to clean up all the posts at once

In [7]:
from bs4 import BeautifulSoup

In [9]:
#function to clean the data in posts column
def post_to_words(post):
    #grab the text from each post
    post_text = BeautifulSoup(post).get_text()
    
    #use regex to keep the letters and get rid of punctuation
    letters_only = re.sub("[^a-zA-Z]", " ", post_text)
    
    #make all the words lowercase
    words = letters_only.lower().split()
    
    #gather all the stop words (most common words like "a", "and", etc
    stops = set(stopwords.words('english'))
    
    #keep only the non stop words
    meaningful_words = [w for w in words if not w in stops]
    
    return(" ".join(meaningful_words))

In [10]:
#run the above function on all posts in the df
data.posts = data.posts.apply(post_to_words)
display(data.head())

Unnamed: 0,posts,science
0,fentanyl surpasses heroin drug often involved ...,1
1,seattle washington delaying start time two hig...,1
2,college textbooks aimed introductory biology c...,1
3,scary warming poles showing weird times places...,1
4,invasive species tick first kind emerge us yea...,1


# Establish a baseline accuracy

If the model just guessed nothing but non science posts, it would be just a little over 53% accurate.

In [11]:
data.science.value_counts(normalize=True)

0    0.531381
1    0.468619
Name: science, dtype: float64

# Assign feature and target variables

In [12]:
X = data.posts
y = data.science

In [13]:
#split the data into training and testing sets
#stratify to make sure both sets have a similar breakdown of 1 and 0 labels
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                   random_state=42)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
cv = CountVectorizer()        #instantiate count vectorizer to turn our posts into features for the model
log = LogisticRegression()    #instantiate logsitic regression model 

# Create a pipeline for more efficient hyperparameter tuning

In [19]:
from sklearn.pipeline import Pipeline

In [20]:
#pipeline to help gridsearch parameters for
#count vectorizer and logistic regression
pipe = Pipeline([  
    ('cv', cv),
    ('log', log)
])

In [136]:
params = {
    'cv__max_features': [4000, 5000, 6000],
    'cv__max_df': [.1, .2, .3],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'log__penalty': ['l1', 'l2'],
    'log__C': [2, 3, 4],
}

gs = GridSearchCV(pipe, param_grid=params, cv = 3)
gs.fit(X_train, y_train);



In [52]:
#check out the best parameters
gs.best_params_

{'cv__max_df': 0.1,
 'cv__max_features': 5000,
 'cv__ngram_range': (1, 1),
 'log__C': 3,
 'log__penalty': 'l2'}

# Set optimal parameters for the vectorizer and the model

In [21]:
vect = CountVectorizer(max_df=.1, max_features=5000)
log = LogisticRegression(C=3, random_state=42)

Take the array of word features and put them back into a dataframe

In [22]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                            columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                           columns=vect.get_feature_names())

Fit the model and get a column of predicted classes

In [23]:
model = log.fit(X_train_vect, y_train)
predictions = model.predict(X_test_vect)

Score the model on the test data.  This was my highest scoring model.  97.6% accuracy and the other metrics from the confusion matrix are below.

In [24]:
model.score(X_test_vect, y_test)

0.9762900976290098

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [30]:
report = classification_report(y_test, predictions,
                               target_names=['science', 'not_science'])
print(report)

              precision    recall  f1-score   support

     science       0.96      0.99      0.98       381
 not_science       0.99      0.96      0.97       336

   micro avg       0.98      0.98      0.98       717
   macro avg       0.98      0.98      0.98       717
weighted avg       0.98      0.98      0.98       717



In [195]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 378
False Positives: 3
False Negatives: 14
True Positives: 322


# Use the Tf-Idf vectorizer with Random Forest

In [196]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [197]:
tfi = TfidfVectorizer()
rf = RandomForestClassifier()

In [198]:
pipe = Pipeline([
    ('tfi', tfi),
    ('rf', rf)
])

In [66]:
params = {
    'tfi__max_features': [4000, 5000, 6000],
    'tfi__max_df': [.1, .2, .3],
    'rf__max_depth': [170, 180, 190],
    'rf__min_samples_split': [4, 5, 6],
    'rf__n_estimators': [80, 100]
}

gs = GridSearchCV(pipe, param_grid=params, cv=3)
gs.fit(X_train, y_train);

In [67]:
gs.best_params_

{'rf__max_depth': 170,
 'rf__min_samples_split': 4,
 'rf__n_estimators': 100,
 'tfi__max_df': 0.2,
 'tfi__max_features': 6000}

In [199]:
vect = TfidfVectorizer(max_features=6000, max_df=.2)
rf = RandomForestClassifier(max_depth=180, min_samples_split=3,
                            n_estimators=100, random_state=42)

In [200]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                            columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                          columns=vect.get_feature_names())

Fitting and making predictions with the model.  This scored just slightly behind LogReg at 96.9%

In [201]:
model = rf.fit(X_train_vect, y_train)
predictions = model.predict(X_test_vect)

In [202]:
model.score(X_test_vect, y_test)

0.9693165969316597

In [203]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 378
False Positives: 3
False Negatives: 19
True Positives: 317


In [204]:
tfi = TfidfVectorizer()
mnb = MultinomialNB()

In [205]:
pipe = Pipeline([
    ('tfi', tfi),
    ('mnb', mnb)
])

In [85]:
params = {
    'tfi__max_features': [7000, 8000, 9000],
    'tfi__ngram_range':[(1, 1), (1, 2), (1, 3)],
    'tfi__max_df': [.05, .1, .2],
    'tfi__norm': ['l1', 'l2', None],
    'tfi__binary': [False, True],
    'mnb__alpha': [0.1, 0.2, 0.3]
}
gs = GridSearchCV(pipe, param_grid=params, cv=5)
gs.fit(X_train, y_train);


In [86]:
gs.best_params_

{'mnb__alpha': 0.3,
 'tfi__binary': True,
 'tfi__max_df': 0.1,
 'tfi__max_features': 8000,
 'tfi__ngram_range': (1, 2),
 'tfi__norm': 'l2'}

# Trying out MultiNomial Naive Bayes, Bagging Classifier, and Ada Boost as well
- Repeat the hyperparameter tuning, fitting, and scoring processes below

In [206]:
vect = TfidfVectorizer(max_features=8000, binary=True, max_df=.1,
                      ngram_range=(1,2), norm='l2')

mnb = MultinomialNB(alpha=.4)

In [207]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                           columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                          columns=vect.get_feature_names())

In [208]:
model = mnb.fit(X_train_vect, y_train)
predict = model.predict(X_test_vect)

In [209]:
model.score(X_test_vect, y_test)

0.9651324965132496

In [210]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 378
False Positives: 3
False Negatives: 19
True Positives: 317


In [211]:
from sklearn.neighbors import KNeighborsClassifier

In [212]:
cv = CountVectorizer()
bag = BaggingClassifier()

In [213]:
pipe = Pipeline([
    ('cv', cv),
    ('bag', bag)
])

In [67]:
params = {
    'cv__max_features': [750, 1000],
    'cv__max_df': [.4, .5, .6],
    'cv__ngram_range': [(1,1), (1, 2), (1, 3)],
    'bag__n_estimators': [125, 150]
}

gs = GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train);

In [68]:
gs.best_params_

{'bag__n_estimators': 125,
 'cv__max_df': 0.5,
 'cv__max_features': 1000,
 'cv__ngram_range': (1, 3)}

In [214]:
vect = CountVectorizer(max_df=.4, max_features=1000)
bag = BaggingClassifier(n_estimators=125, base_estimator=DecisionTreeClassifier())

In [215]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                           columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                          columns=vect.get_feature_names())

In [216]:
model = bag.fit(X_train_vect, y_train)
predictions = model.predict(X_test_vect)

In [217]:
model.score(X_test_vect, y_test)

0.9595536959553695

In [218]:
from sklearn.ensemble import AdaBoostClassifier

In [219]:
cv = CountVectorizer()
ada = AdaBoostClassifier()

In [220]:
pipe = Pipeline([
    ('cv', cv),
    ('ada', ada)
])

In [80]:
params = {
    'cv__max_features': [1500, 2000, 3000, 4000],
    'cv__max_df': [.4, .5, .6],
    'ada__n_estimators': [150, 200, 250]
}

gs = GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train);

In [81]:
gs.best_params_

{'ada__n_estimators': 250, 'cv__max_df': 0.6, 'cv__max_features': 2000}

In [221]:
vect = CountVectorizer(max_features=2000, max_df=.6)
ada = AdaBoostClassifier(n_estimators=250, random_state=42)

In [222]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                            columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                           columns=vect.get_feature_names())

In [223]:
model = ada.fit(X_train_vect, y_train)
predictions = model.predict(X_test_vect)

Ada Boost gave me my second highest accuracy score, but I ended up including Logistic Regression and Random Forest as my two models to display.  I chose Random Forest because it takes slightly less time to run and Logistic Regression because it score highest and is a little easier to interpret.

In [225]:
model.score(X_test_vect, y_test)

0.9735006973500697

In [226]:
tfi = TfidfVectorizer()
ada = AdaBoostClassifier()

In [107]:
pipe = Pipeline([
    ('tfi', tfi),
    ('ada', ada)
])

In [111]:
params = {
    'tfi__max_features': [3000, 4000, 5000],
    'tfi__max_df': [.3, .4, .5],
    'ada__n_estimators': [250, 300]
}

gs = GridSearchCV(pipe, param_grid=params)
gs.fit(X_train, y_train);

In [112]:
gs.best_params_

{'ada__n_estimators': 250, 'tfi__max_df': 0.3, 'tfi__max_features': 4000}

In [227]:
vect = TfidfVectorizer(max_features=4000, max_df=.3)
ada = AdaBoostClassifier(n_estimators=250)

In [228]:
X_train_vect = pd.DataFrame(vect.fit_transform(X_train).todense(),
                           columns=vect.get_feature_names())

X_test_vect = pd.DataFrame(vect.transform(X_test).todense(),
                          columns=vect.get_feature_names())

In [229]:
model = ada.fit(X_train_vect, y_train)
predictions = model.predict(X_test_vect)

In [230]:
model.score(X_test_vect, y_test)

0.9456066945606695

In [231]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 364
False Positives: 17
False Negatives: 22
True Positives: 314
