##  Tweets From Senators, Obama, Trump

#### Importing Libraries 

In [2]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

  from numpy.core.umath_tests import inner1d


#### Importing Cleaned Data

In [25]:
data_all = pd.read_csv('../capstone/ALLDATA_2.csv')

df_all = pd.DataFrame(data_all)

df_all.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)

In [26]:
print(df_all.shape)
df_all.head(2)

(288799, 9)


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,party,state
0,10/16/17 22:59,I'm grateful to @SenJohnMcCain for his lifetim...,https://twitter.com/BarackObama/status/9200615...,17064,89916,641842,BarackObama,1,US
1,10/2/17 12:41,Michelle & I are praying for the victims in La...,https://twitter.com/BarackObama/status/9148326...,21588,405895,1715753,BarackObama,1,US


### Question: Was this tweet written by a Republican or Democrat? 

In [27]:
# Setting up X and y
X = df_all['text']
y = df_all['party']

# Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   random_state = 18, 
                                                   test_size = 0.20, 
                                                   stratify = y
                                                   )

X_train.shape, X_test.shape

((231039,), (57760,))

##### Count Vectorizer & Multinomial Naiive Bayes 

In [6]:
# Instantiating models 

mnb = MultinomialNB()
cv = CountVectorizer()

pipe = Pipeline([
    ('cv', cv), 
    ('mnb', mnb)
])

params = { 
    "cv__stop_words": [None, "english"], 
    "cv__min_df": [1,2,3], 
    "cv__ngram_range": [(1,1), (1,2),(2,2)]
}

gs_mnb = GridSearchCV(pipe, 
                     param_grid = params)

gs_mnb.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv__stop_words': [None, 'english'], 'cv__min_df': [1, 2, 3], 'cv__ngram_range': [(1, 1), (1, 2), (2, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
gs_mnb.best_params_

{'cv__min_df': 1, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}

In [8]:
print("3-fold Cross Validation Score on train data for Multinomial NB:", gs_mnb.best_score_)
print("Score on test data for Multinomial NB:", gs_mnb.score(X_test, y_test))

3-fold Cross Validation Score on train data for Multinomial NB: 0.8404019606597103
Score on test data for Multinomial NB: 0.8479774957211368


In [None]:
##### Count Vectorizer & Random Forest Classifier 

In [4]:
cv2 = CountVectorizer()
rf = RandomForestClassifier()

pipe_rf = Pipeline([ 
    ("cv2", cv2), 
    ("rf", rf)
])

params_rf = { 
    "cv2__stop_words": [None, "english"], 
    "cv2__ngram_range": [(1,1), (1,2), (2,2)] 
    #"rf__criterion": ["gini", "entropy"] 
    #"rf__n_estimators": [20, 27, 35]
}

gs_rf = GridSearchCV(pipe_rf,
                     param_grid = params_rf)
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv2', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv2__stop_words': [None, 'english'], 'cv2__ngram_range': [(1, 1), (1, 2), (2, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [5]:
gs_rf.best_params_

{'cv2__ngram_range': (1, 1), 'cv2__stop_words': 'english'}

In [6]:
gs_rf.best_score_

0.7840859504412332

In [7]:
gs_rf.score(X_test, y_test)

0.7936994797580113

#### Logistic Regression 

In [10]:
from sklearn.linear_model import LogisticRegression

In [28]:
cv3 = CountVectorizer()
logreg = LogisticRegression()

pipe_logreg = Pipeline([
    ('cv3', cv3), 
    ('logreg', logreg)     
])

params_logreg = {
    "cv3__stop_words": ["english"], 
    "cv3__ngram_range": [(1,2), (2,2)], 
    'logreg__penalty': ['l1', 'l2'], 
    "logreg__C": [0.5, 0.7, 1.0]
}

logreg_gs = GridSearchCV(pipe_logreg, 
                        param_grid = params_logreg)

logreg_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv3', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv3__stop_words': ['english'], 'cv3__ngram_range': [(1, 2), (2, 2)], 'logreg__penalty': ['l1', 'l2'], 'logreg__C': [0.5, 0.7, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
logreg_gs.best_params_

{'cv3__ngram_range': (1, 2),
 'cv3__stop_words': 'english',
 'logreg__C': 1.0,
 'logreg__penalty': 'l2'}

In [30]:
logreg_gs.best_score_

0.8612095793350906

In [31]:
logreg_gs.score(X_test, y_test)

0.873303324099723

#### Confusion Matrix 

In [32]:
test_prediction = logreg_gs.predict(X_test)
test_prediction

array([0, 0, 0, ..., 1, 0, 0])

In [35]:
tn, fp, fn, tp = confusion_matrix(y_test, test_prediction).ravel()
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Negatives: %s" % tn)
print("True Positives: %s" % tp)

False Positives: 3183
False Negatives: 4135
True Negatives: 26719
True Positives: 23723


In [36]:
from sklearn.metrics import classification_report

print(classification_report(y_test, test_prediction))

             precision    recall  f1-score   support

          0       0.87      0.89      0.88     29902
          1       0.88      0.85      0.87     27858

avg / total       0.87      0.87      0.87     57760

