In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("pro_trump2_400.csv",index_col=0)

In [9]:
data.columns = ["tweets", "label"]

In [10]:
# 0 for anti-trump, 1 for pro-trump, 2 for unknown
data = data[data.iloc[:,1]!=2]
data

Unnamed: 0,tweets,label
0,"""So after Biden wins, what do we do with the c...",1
1,"""@GOPLeader @GoJackFlynn Please. \nJust #Remov...",1
2,"""@Jeromep81422970 @stillgray And I thought #Tr...",0
3,"""Seeing a lot of #TrumpSupporters taking their...",0
4,"""The bigger question is, why are hundreds of p...",0
...,...,...
388,"""#TrumpSupporters you suck\n#Republicans you s...",0
389,"""BIDEN’S WISCONSIN WELCOME PARTY: Just look at...",1
390,"""Name me one single way @JoeBiden can win over...",1
391,"""Donald can't say there'll be a peaceful trans...",1


In [25]:
# number of pro-trump tweets
sum(data['label'])

153

In [8]:
# ## add a fake label
# fakeLabel = np.random.randint(2,size=462)
# data["label"] = fakeLabel
# data.head()

In [11]:
# !pip install nltk



In [21]:
# basic modules
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import time
import re

# text preprocessing modules
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
nltk.download('stopwords')
from tokenize import tokenize


# models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to /home/xiao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#dowload stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
#download punkt for word_tokensize
nltk.download('punkt')
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /home/xiao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/xiao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def remove_stopwords(input_data):
    '''
    input_data: a vector of messages
    output: stopwords removed
    '''
    # required:
    # import nltk
    # dowload stopwords
    # nltk.download('stopwords')
    stop_words = set(stopwords.words('english')) 
    output = []
    for i, sentence in enumerate(input_data):
        #words = word_tokenize(sentence)  #will not remove punctuations and numbers
        sms = re.sub("[^a-zA-Z]", " ", sentence).lower()
        words = word_tokenize(sms)
        filtered_sentence = [w for w in words if not w in stop_words]
        output.append(" ".join(filtered_sentence).lower() )
    return output

#https://rustyonrampage.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

print(reduce_lengthening( "finallllllly" ) )

def remove_stemmers(input_data):
    '''
    input: a vector of messages 
    output: remvove stemmers and punctuations
        all lower letter 
        stemmer removed
        punctuations removed
    '''
    ## require download punkt for word_tokensize
    #nltk.download('punkt')
    #stemmer = PorterStemmer()
    
    ## https://towardsdatascience.com/multi-class-text-classification-with-sklearn-and-nltk-in-python-a-software-engineering-use-case-779d4a28ba5
    ## df_train["cleaned"] = df_train['message'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
    output = []
    for i, sentence in enumerate(input_data):
        sms = re.sub("[^a-zA-Z]", " ", sentence) 
        words = word_tokenize(sms)
        filtered_sentence = [stemmer.stem(reduce_lengthening(w)) for w in words]
        output.append(" ".join(filtered_sentence) )
        
    return output

def messages_preprocess(input_data):      
    return remove_stemmers(remove_stopwords(input_data))

finally


In [16]:
#preprocess df_train
start = time.time()

data['cleaned'] = messages_preprocess(data["tweets"])  #about 1 min

end = time.time()
print("time used:",end - start)
data.head()

time used: 0.13688087463378906


Unnamed: 0,tweets,label,cleaned
0,"""So after Biden wins, what do we do with the c...",1,biden win cult
1,"""@GOPLeader @GoJackFlynn Please. \nJust #Remov...",1,goplead gojackflynn pleas njust removepelosi n...
2,"""@Jeromep81422970 @stillgray And I thought #Tr...",0,jeromep stillgray thought trumpsupport bot tru...
3,"""Seeing a lot of #TrumpSupporters taking their...",0,see lot trumpsupport take maga hat last night ...
4,"""The bigger question is, why are hundreds of p...",0,bigger question hundr peopl buse ralli


In [17]:
# creating the feature matrix 

#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=100000, min_df=1, max_df=0.7,stop_words=stopwords.words('english'))

#with uncleaned messages
count_vec = vectorizer.fit_transform(data['tweets'])
count_vec = count_vec.toarray()
print("###################################################")
print("Uncleaned bag of words size:", count_vec.shape)

#with cleaned messages
count_vec_clean = vectorizer.fit_transform(data['cleaned'])
count_vec_clean = count_vec_clean.toarray()
print("###################################################")
print("cleaned bag of words size:", count_vec_clean.shape)

#Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

#with uncleaned messages
tfidf_vec = TfidfVectorizer(max_features=100000, min_df=1, max_df=0.5,stop_words=stopwords.words('english'))
tfidf_vec = tfidf_vec.fit_transform(data['tweets']).toarray()
print("###################################################")
print("Uncleaned IT-IDF:", tfidf_vec.shape)

#with cleaned messages
tfidf_vec_clean = TfidfVectorizer(max_features=100000, min_df=1, max_df=0.5,stop_words=stopwords.words('english'))
tfidf_vec_clean = tfidf_vec_clean.fit_transform(data['cleaned']).toarray()
print("###################################################")
print("Cleaned IT-IDF:", tfidf_vec_clean.shape)


###################################################
Uncleaned bag of words size: (254, 1896)
###################################################
cleaned bag of words size: (254, 1689)
###################################################
Uncleaned IT-IDF: (254, 1896)
###################################################
Cleaned IT-IDF: (254, 1689)


In [22]:
# choose preprocessed data as training data
processed_list = [count_vec, count_vec_clean, tfidf_vec, tfidf_vec_clean]
processed_data = processed_list[2]  # [count_vec, count_vec_clean, tfidf_vec, tfidf_vec_clean]

In [23]:
# performance of count_vec(0.48) is worse than tfidf (0.54)
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(processed_data,
                                                                                 data['label'], 
                                                                                 data.index, 
                                                                                 test_size=0.2, random_state=0)

In [24]:
from sklearn.naive_bayes import MultinomialNB

test_acc_list = []
train_acc_list = []
alpha_list = [0.1 + x*0.1 for x in range(10)]
for alpha in alpha_list:

    model = MultinomialNB(alpha = alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print("########################################")
    print("alpha = ", alpha)
    
    test_acc_list += [accuracy_score(y_test, y_pred)]
    
    print("val acc = ", accuracy_score(y_test, y_pred))
    
    # performence on training set
    y_train_pred = model.predict(X_train)
    train_acc_list += [accuracy_score(y_train_pred, y_train)]
    print("train acc = ", accuracy_score(y_train_pred, y_train))


########################################
alpha =  0.1
val acc =  0.7058823529411765
train acc =  0.9950738916256158
########################################
alpha =  0.2
val acc =  0.6862745098039216
train acc =  0.9950738916256158
########################################
alpha =  0.30000000000000004
val acc =  0.6862745098039216
train acc =  0.9950738916256158
########################################
alpha =  0.4
val acc =  0.7058823529411765
train acc =  0.9901477832512315
########################################
alpha =  0.5
val acc =  0.7254901960784313
train acc =  0.9901477832512315
########################################
alpha =  0.6
val acc =  0.6862745098039216
train acc =  0.9901477832512315
########################################
alpha =  0.7000000000000001
val acc =  0.6862745098039216
train acc =  0.9802955665024631
########################################
alpha =  0.8
val acc =  0.7058823529411765
train acc =  0.9802955665024631
########################################
