### Import libraries

In [1]:
import tweepy
import csv
import pandas as pd
import re
import emoji
import nltk
import string
import numpy as np

In [2]:
# Original
df0 = pd.read_csv("train_basic.csv")
# Train
df = pd.read_csv("train_basic.csv")
# Test
df2 = pd.read_csv("test_basic.csv")

# List of unique labels
tags = list(df['label'].unique())

In [3]:
df.head()

Unnamed: 0,tweet,label
0,silencing blm : priti patel\xe2\x80\x99s anti-...,BLM
1,"trillian42_ johnbok5 nadiawhittomemp ""\'silly ...",BLM
2,"rt errolwebber: tell me, would this be conside...",BLM
3,apple won't let parler have an app but still k...,BLM
4,malika_andrews wojespn can we get jlm trending...,BLM


In [4]:
# Drop NaN
df.dropna(inplace=True)

#Removes Punctuation
df['tweet'] = df['tweet'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', str(x)))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', str(x)))
#Removes hyperlinks
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
#Remove newlines
df['tweet'] = df['tweet'].apply(lambda x: x.strip().replace('\\n', ''))
#Remove encoded characters
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'(\\x(.){2})', '', str(x)))
#Removes ' , ", and blank space
df['tweet'] = df['tweet'].apply(lambda x: x.replace("'", ""))
df['tweet'] = df['tweet'].apply(lambda x: x.replace('"', ""))
df['tweet'] = df['tweet'].apply(lambda x: x.strip())
#Remove escape charcters but keep emoji codes
df['tweet'] = df['tweet'].apply(lambda x: re.sub("\\\\", '', str(x)))
# Remove 'rt', retweet string. Remove newline character
df['tweet'] = df['tweet'].apply(lambda x: x.strip("rt ").replace('\n', ''))

# Remove punctuation again, remove hyphens
exclude = set(string.punctuation)
df['tweet'] = df['tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

### Tokenize words

In [5]:
from nltk import word_tokenize

In [6]:
df['token'] = df['tweet'].apply(word_tokenize)

In [7]:
df['token'][0]

['silencing',
 'blm',
 'priti',
 'patels',
 'antiprotest',
 'law',
 'iandunt',
 'on',
 'how',
 'a',
 'government',
 'keen',
 'to',
 'tackle',
 'cancel',
 'culture',
 'when']

### Remove Stop Words

In [8]:
from nltk.corpus import stopwords

In [9]:
sw = stopwords.words('english')
df['token'] = df['token'].apply(lambda x:[word for word in x if word not in sw])

In [10]:
df['token']

0        [silencing, blm, priti, patels, antiprotest, l...
1        [illian42, johnbok5, nadiawhittomemp, silly, l...
2        [errolwebber, tell, would, considered, racistd...
3        [apple, wont, let, parler, app, still, keeps, ...
4        [malikaandrews, wojespn, get, jlm, trending, m...
                               ...                        
26395    [hey, brainwashed, base, generations, im, stil...
26396    [56blackcat, neanderthals, blm, blacklivesmatt...
26397    [atrupar, 2, dear, jaketapper, instead, airing...
26398    [bombshellmarjorie, greene, may, genius, behin...
26399    [omthunkitsmind, lets, talk, white, supremacyc...
Name: token, Length: 24329, dtype: object

### Create Stem Words from Token 

In [11]:
from nltk.stem.snowball import SnowballStemmer

In [12]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
df['stem'] =  df['token'].apply(lambda x: [stemmer.stem(i) for i in x])

In [13]:
df['stem'][0]

['silenc',
 'blm',
 'priti',
 'patel',
 'antiprotest',
 'law',
 'iandunt',
 'govern',
 'keen',
 'tackl',
 'cancel',
 'cultur']

### Pipelines

In [14]:
df['stem_full'] = df['stem'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,tweet,label,token,stem,stem_full
0,silencing blm priti patels antiprotest law i...,BLM,"[silencing, blm, priti, patels, antiprotest, l...","[silenc, blm, priti, patel, antiprotest, law, ...",silenc blm priti patel antiprotest law iandunt...
1,illian42 johnbok5 nadiawhittomemp silly little...,BLM,"[illian42, johnbok5, nadiawhittomemp, silly, l...","[illian42, johnbok5, nadiawhittomemp, silli, l...",illian42 johnbok5 nadiawhittomemp silli littl ...
2,errolwebber tell me would this be considered r...,BLM,"[errolwebber, tell, would, considered, racistd...","[errolwebb, tell, would, consid, racistdo, peo...",errolwebb tell would consid racistdo peopl sup...
3,apple wont let parler have an app but still ke...,BLM,"[apple, wont, let, parler, app, still, keeps, ...","[appl, wont, let, parler, app, still, keep, tw...",appl wont let parler app still keep twitter al...
4,malikaandrews wojespn can we get jlm trending ...,BLM,"[malikaandrews, wojespn, get, jlm, trending, m...","[malikaandrew, wojespn, get, jlm, trend, mayb,...",malikaandrew wojespn get jlm trend mayb nba pu...
...,...,...,...,...,...
26395,hey have brainwashed their base for generation...,Riots,"[hey, brainwashed, base, generations, im, stil...","[hey, brainwash, base, generat, im, still, pis...",hey brainwash base generat im still piss trick...
26396,56blackcat these neanderthals were there for b...,Riots,"[56blackcat, neanderthals, blm, blacklivesmatt...","[56blackcat, neanderth, blm, blacklivesmatt, c...",56blackcat neanderth blm blacklivesmatt capito...
26397,atrupar 2 dear jaketapper instead of airing ta...,Riots,"[atrupar, 2, dear, jaketapper, instead, airing...","[atrupar, 2, dear, jaketapp, instead, air, tat...",atrupar 2 dear jaketapp instead air tatereev a...
26398,his is a bombshellmarjorie greene may have bee...,Riots,"[bombshellmarjorie, greene, may, genius, behin...","[bombshellmarjori, green, may, genius, behind,...",bombshellmarjori green may genius behind storm...


Create a new column, concatenating all stemmed tweets

In [15]:
df['stem_full'] = df['stem'].apply(lambda x: ' '.join(x))

In [16]:
y = [i for i in df['label']]

In [17]:
corpus = [i for i in df['stem_full']]
corpus

['silenc blm priti patel antiprotest law iandunt govern keen tackl cancel cultur',
 'illian42 johnbok5 nadiawhittomemp silli littl woke lefti most harmless rebel scum gamer intersectio',
 'errolwebb tell would consid racistdo peopl support blm think ok',
 'appl wont let parler app still keep twitter allow manner provok violenc vario',
 'malikaandrew wojespn get jlm trend mayb nba put bball court jer',
 'alexberesfordtv werent order pleas dont silenc voic hate bigot',
 'mom problem help mam bbc bbcslut nude sext cum cumshot creampi blm horni gangbang',
 'spyoutub amaz chat cincinnati health report ann saker spsaker annesak chat covid vaccin amidtheflow repo',
 'chrisshipitv recollect headlin clear racist racist citizen vocal given voic',
 'nagivatorjan widehop nickpy blm enter uk biggest racist go',
 'joedaddyhala thank repd stacey plaskett stand blm gqp never understand care learn blac',
 'ini pandem poems2021 90how react bump everywher turndur time',
 'ptnewsnetwork third day georg fl

In [18]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)
X_train_counts.shape

(24329, 39220)

In [19]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(24329, 39220)

In [20]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y)

In [21]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(corpus, y)

### Check performance

Use `train_test_split` with training labels

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.33, random_state=42)

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_pred = text_clf.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=tags))

accuracy 0.882799850541786
              precision    recall  f1-score   support

         BLM       0.96      0.84      0.90      1631
       Trump       0.87      0.83      0.85      1080
       Biden       0.82      1.00      0.90      2851
       Covid       0.98      0.88      0.93      1376
       Riots       0.89      0.70      0.79      1091

    accuracy                           0.88      8029
   macro avg       0.90      0.85      0.87      8029
weighted avg       0.89      0.88      0.88      8029



In [24]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.882799850541786