In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Tweet classification - Trudeau vs Trump

**Goal:** build a model that predicts whether a tweet was written by Donald Trump or Justin Trudeau.

In [2]:
# Load the data.
# The CSV location can be overridden with the TWITTER_CSV environment variable so
# the notebook also runs on machines other than the author's; when the variable is
# unset, the original local path is used unchanged.
path = os.environ.get(
    'TWITTER_CSV',
    '/Users/liamhettinger/Documents/Portfolio_work/Data/twitter.csv',
)
data = pd.read_csv(path)
data.head()

Unnamed: 0,timestamp,text,user
0,2020-03-02 23:06:03,"WOW! Thank you, just landed, see everyone soon...",realDonaldTrump
1,2020-03-02 21:47:49,Departing for the Great State of North Carolin...,realDonaldTrump
2,2020-03-02 21:32:54,They are staging a coup against Bernie!,realDonaldTrump
3,2020-03-02 19:55:40,THANK YOU!https://www.breitbart.com/tech/2020/...,realDonaldTrump
4,2020-03-02 19:55:07,Michelle @FischbachMN7 is running for Congress...,realDonaldTrump


In [3]:
# Cleaning: lowercase all tweet text.
# The previous loop only rebound its loop variable and never wrote the result back,
# so `data.text` was left untouched. The vectorised string method below returns a
# new Series that is assigned to the column, which also makes the cell idempotent.
# (CountVectorizer lowercases by default as well, so this makes the step explicit.)
data['text'] = data['text'].str.lower()

In [4]:
# Model input and target:
# the tweet text is the feature (X), the tweeting account is the label (y).
X = data['text']
y = data['user']

In [5]:
# Train/test split.
# A fixed random_state makes the shuffle deterministic, so the fitted model and
# every metric below are reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

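**CountVectorizer** turns each tweet into a row of token counts (a document-term matrix).
- `ngram_range=(1, 2)` counts single words (unigrams) as well as pairs of adjacent words (bigrams).
- `max_features` caps the vocabulary, keeping only the most frequent terms across the corpus.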

In [6]:
# Text-classification pipeline: n-gram counts feed a multinomial Naive Bayes model.
pipe = Pipeline(
    steps=[
        # unigram + bigram counts, vocabulary limited to 5000 terms
        ('vect', CountVectorizer(ngram_range=(1, 2), max_features=5000)),
        # Naive Bayes classifier over the count features
        ('clf', MultinomialNB()),
    ]
)

In [7]:
# Fit the whole pipeline on the training split: the 'vect' step is fitted on
# X_train and the 'clf' step is then trained on its output with labels y_train.
pipe.fit(X_train,y_train)

In [8]:
# Predict the author label for every held-out tweet in X_test
y_test_pred = pipe.predict(X_test)

In [9]:
# Model accuracy on the test split: share of tweets whose predicted author
# matches the true author
accuracy_score(y_test,y_test_pred)

0.9660377358490566

In [10]:
# Model's confusion matrix on the test split.
# scikit-learn convention: rows are true labels, columns are predicted labels,
# so the diagonal counts correctly classified tweets.
confusion_matrix(y_test,y_test_pred)

array([[ 99,   4],
       [  5, 157]])

In [11]:
# Vocabulary terms produced by the fitted count vectorizer (the model's X columns)
features = pipe.named_steps['vect'].get_feature_names_out()

# Target labels as stored on the fitted classifier
pipe.named_steps['clf'].classes_

array(['JustinTrudeau', 'realDonaldTrump'], dtype='<U15')

In [12]:
# Influential features/words in the model.
# feature_count_ holds one row of per-feature counts per class; row 0 is taken as
# JustinTrudeau and row 1 as realDonaldTrump, matching the classes_ order displayed
# by the previous cell.
class_counts = pipe['clf'].feature_count_
JustinTrudeau = class_counts[0]
realDonaldTrump = class_counts[1]

# One row per vocabulary term, one column of counts per author.
tweets = pd.DataFrame(
    {
        'feature': features,
        'JustinTrudeau': JustinTrudeau,
        'DonaldTrump': realDonaldTrump,
    }
).set_index('feature')
tweets.head()

Unnamed: 0_level_0,JustinTrudeau,DonaldTrump
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,13.0
00 eastern,0.0,8.0
00 the,0.0,2.0
000,12.0,11.0
000 americans,0.0,2.0


In [13]:
# Words that indicate a Trump or a Trudeau tweet.
# Start from the raw per-class count arrays rather than from the columns of
# `tweets`: the columns are overwritten below, so reading them back would smooth
# and normalise already-normalised values if this cell were run a second time.
trudeau_smoothed = JustinTrudeau + 1   # add-one smoothing keeps the ratios finite
trump_smoothed = realDonaldTrump + 1

# Each author's smoothed counts as a share of that author's total.
tweets['JustinTrudeau'] = trudeau_smoothed / trudeau_smoothed.sum()
tweets['DonaldTrump'] = trump_smoothed / trump_smoothed.sum()

# Ratio > 1 means the feature is relatively more frequent for the first-named author.
tweets['TrumpVsTrudeau'] = tweets['DonaldTrump'] / tweets['JustinTrudeau']
tweets['TrudeauVsTrump'] = tweets['JustinTrudeau'] / tweets['DonaldTrump']

In [14]:
# Features most indicative of Trudeau: the 10 rows with the highest
# TrudeauVsTrump ratio (Trudeau's share of the feature divided by Trump's)
tweets.sort_values(by='TrudeauVsTrump', ascending=False).head(10)

Unnamed: 0_level_0,JustinTrudeau,DonaldTrump,TrumpVsTrudeau,TrudeauVsTrump
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ca,0.003534,4.2e-05,0.01177,84.96443
en,0.00334,4.2e-05,0.012454,80.296055
pm,0.00334,4.2e-05,0.012454,80.296055
ca en,0.00334,4.2e-05,0.012454,80.296055
ll,0.002213,4.2e-05,0.01879,53.219478
gc ca,0.002213,4.2e-05,0.01879,53.219478
gc,0.002213,4.2e-05,0.01879,53.219478
pm gc,0.002175,4.2e-05,0.019126,52.285803
en news,0.002175,4.2e-05,0.019126,52.285803
https pm,0.002097,4.2e-05,0.019834,50.418453


In [15]:
# Features most indicative of Trump: the 10 rows with the highest
# TrumpVsTrudeau ratio (Trump's share of the feature divided by Trudeau's)
tweets.sort_values(by='TrumpVsTrudeau', ascending=False).head(10)

Unnamed: 0_level_0,JustinTrudeau,DonaldTrump,TrumpVsTrudeau,TrudeauVsTrump
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fake news,3.9e-05,0.002038,52.480785,0.019055
white house,3.9e-05,0.001248,32.131093,0.031123
white,3.9e-05,0.001248,32.131093,0.031123
fake,7.8e-05,0.002371,30.524538,0.032761
media,3.9e-05,0.001123,28.917984,0.034581
conference,3.9e-05,0.001123,28.917984,0.034581
it is,3.9e-05,0.00104,26.775911,0.037347
the fake,3.9e-05,0.000957,24.633838,0.040595
democrats,3.9e-05,0.000957,24.633838,0.040595
joe,3.9e-05,0.000915,23.562802,0.04244
