In [1]:
#Python
import numpy as np
import pandas as pd
import time
import regex as reg
import os
from pathlib import Path
#NLP
import spacy as sp
from nltk.corpus import opinion_lexicon
#sklearn
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
len(opinion_lexicon.negative())
len(opinion_lexicon.positive())
#for w in range(len(opinion_lexicon.negative())):
#              opinion_lexicon.negative()[w]

4783

2006

In [34]:
file_path = '..'+os.sep+'Tweets_5_11/'
list_of_files = os.listdir(file_path)

In [35]:
#filepath = Path.home() 
#path = Path(filepath)
#list_of_files = path / 'Tweets_5_11'

In [36]:
tweet_df = pd.DataFrame()
for p in list_of_files:
    p = file_path+p
    temp_df = pd.read_csv(p,usecols = ['tweet_id','text','region'])
    tweet_df = tweet_df.append(temp_df)

In [37]:
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130092 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tweet_id  130092 non-null  object
 1   text      130089 non-null  object
 2   region    130086 non-null  object
dtypes: object(3)
memory usage: 4.0+ MB


tweet_id    0
text        3
region      6
dtype: int64

In [311]:
tweet_df.drop_duplicates(subset = 'tweet_id',inplace=True)

In [312]:
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92325 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  92325 non-null  object
 1   text      92323 non-null  object
 2   region    92321 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


tweet_id    0
text        2
region      4
dtype: int64

In [314]:
tweet_df.dropna(inplace=True)
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92319 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  92319 non-null  object
 1   text      92319 non-null  object
 2   region    92319 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


tweet_id    0
text        0
region      0
dtype: int64

In [315]:
tweet_df.reset_index(inplace=True)

In [316]:
tweet_df.head(15)

Unnamed: 0,index,tweet_id,text,region
0,0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA
1,1,1246587645779574784,Church congregants insisting on attending serv...,LA
2,2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA
3,3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA
4,4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA
5,5,1246584366471376896,"El Covid-19 iba a llamarse Dolly Parton, pero ...",LA
6,6,1246584353821315073,20-20 is perfect Vision but as a year 2020 is...,LA
7,7,1246583763267514368,S/o to the homie @yerrrchubbs for being on liv...,LA
8,8,1246583102534639617,Going on a road trip fuck covid,LA
9,9,1246582098523115520,@greggutfeld We are given these numbers about ...,LA


In [131]:
spacy_nlp = sp.load('en_core_web_sm')
processed_tweet = []

def clean_tweet(text):

#replace anything that isn't a-z or A-Z with spaces including punctuations, exclamation etc
#replace 'https://', 'www.' by space 
    ret_str = ' '
    regex = r"(.\\n|.\\t|.\\r)|([0-9]+)|([^\w\s])|(https{0,1}:\/\/\S*)|(ww[wa-zA-Z0-9.com]+)|(pic.twitter\S*)"
    ret_str = reg.sub(regex, '', str(text))
    ret_str = ret_str.rstrip() 
    ret_str = ret_str.lstrip()
    ret_str = ret_str.lower()
    doc = spacy_nlp(ret_str)
    all_lemmas =  [token.lemma_ for token in doc if  not token.is_stop \
                   and token.is_alpha and token.lemma_ != '-PRON-']
    processed_tweet.append(" ".join(all_lemmas)) # tweet cleaned up,tokenized.Copy each tweet as a string to the list
    return all_lemmas #tweet cleaned up, tokenized return a list of tokens

In [132]:
def process_tweet():
    clean_tweets = []
    start = time.perf_counter()
    counter = 0
    for row in tweet_df['text']:
        clean_tweets.append(clean_tweet(row))
        counter += 1
        
    print(f"Processed {counter} rows in training data")

    end = time.perf_counter()

    print(f"Took { round((end-start)/60,0)} minutes to clean")
    return clean_tweets

In [133]:
tweet_tokens = process_tweet()

Processed 93144 rows in training data
Took 11.0 minutes to clean


In [None]:
count = 0 
sentiment = []
start = time.perf_counter()

for tokens in tweet_tokens:
    positive_words = 0
    negative_words = 0
    count += 1
    for i in range(len(tokens)):
        if tokens[i] in opinion_lexicon.positive():
            positive_words += 1
        elif tokens[i] in opinion_lexicon.negative():
            negative_words += 1
        
    if positive_words > negative_words:
        sentiment.append(1) #postive
    elif positive_words < negative_words:
        sentiment.append(-1) #negative
    elif positive_words == negative_words:
        sentiment.append(0) #neutral
        
    if count%10_000 == 0:
        print(f"Processed {count} rows")
            
end = time.perf_counter()
print(f"Took { round((end-start)/60,0)} minutes for sentiment analysis")          

len(sentiment)

Processed 3000 rows
Processed 6000 rows
Processed 9000 rows
Processed 12000 rows
Processed 15000 rows
Processed 18000 rows
Processed 21000 rows
Processed 24000 rows
Processed 27000 rows
Processed 30000 rows
Processed 33000 rows


In [151]:
tweet_df['tweettokens'] = tweet_tokens
tweet_df['processedtweet'] = processed_tweet
tweet_df['sentiment'] = sentiment

In [197]:
tweet_df.info()
tweet_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93144 entries, 0 to 17
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        93144 non-null  object
 1   text            93144 non-null  object
 2   region          93144 non-null  object
 3   tweettokens     93144 non-null  object
 4   processedtweet  93144 non-null  object
 5   sentiment       93144 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 5.0+ MB


Unnamed: 0,tweet_id,text,region,tweettokens,processedtweet,sentiment
0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA,"[shaukat, khanum, memorial, cancer, hospital, ...",shaukat khanum memorial cancer hospital offer ...,0
1,1246587645779574784,Church congregants insisting on attending serv...,LA,"[church, congregant, insist, attend, service, ...",church congregant insist attend service wake c...,0
2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA,"[render, new, unit, affordable, housing, proje...",render new unit affordable housing project hav...,-1
3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA,"[double, date, covidstyle]",double date covidstyle,0
4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA,"[si, tuvieran, que, sacrificar, un, pueblo, pa...",si tuvieran que sacrificar un pueblo para acab...,0


In [157]:
tweet_df['sentiment'].value_counts(normalize = True)
# 0 - neutral, 1 - positive, -1 - negative

 0    0.374098
-1    0.321910
 1    0.303992
Name: sentiment, dtype: float64

In [259]:
tweet_df.loc[0]

index                                                             0
tweet_id                                        1246588222903214080
text              Shaukat Khanum Memorial Cancer Hospital offers...
region                                                           LA
tweettokens       [shaukat, khanum, memorial, cancer, hospital, ...
processedtweet    shaukat khanum memorial cancer hospital offer ...
sentiment                                                         0
Name: 0, dtype: object

In [168]:
X = tweet_df['processedtweet']
y = tweet_df['sentiment']

In [169]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33,random_state = 111)
X_train.shape
X_test.shape
y_test.shape
y_train.shape

(62406,)

(30738,)

(30738,)

(62406,)

In [228]:
def conf_matrix(gsobject,xdata,ydata):
    predicted = gsobject.predict(xdata)
    actual_predicted = pd.DataFrame({"Actual" : ydata, "Predicted": predicted, 'Text': xdata})
    return actual_predicted

In [199]:
#gs_type = grid search object
#xtraindata,xtestdata = xtrain and xtest 
#ytraindata,ytest = ytrain and ytest
def fit_grid_search(gs_type,xtraindata,ytraindata,xtestdata,ytestdata):
    start = time.perf_counter()
    gs_type.fit(xtraindata,ytraindata)
    print(f"Grid search accuracy for estimator \033[1m{gs_type.estimator.steps[1][0]}\033[0m "
          f"transformer \033[1m{gs_type.estimator.steps[0][0]}\033[0m : \n")
    print(f"On training data is {gs_type.score(xtraindata,ytraindata)}")
    print(f"On test data is {gs_type.score(xtestdata,ytestdata)}")
    print(f"Grid search best score (avg of cv scores) {gs_type.best_score_}\n\n")
    print(f"Model with best fitting parameter is \n {gs_type.best_estimator_.get_params}\n\n")
    #print(gs_type.best_estimator_.named_steps['logisticregression'].coef_)
    end = time.perf_counter()
    print(f"Took \033[1m{ round((end-start)/60,0)}\033[0m minutes to complete")
    cv_results_df = pd.DataFrame(gs_type.cv_results_)
    return cv_results_df,gs_type

In [238]:
pipette_tfidf = make_pipeline(TfidfVectorizer(),LogisticRegression())
pipe_params_tfidf = {
    'tfidfvectorizer__max_features': [15_000,25_000,30_000],
    'tfidfvectorizer__ngram_range': [ (1,2),(1,3)],
    'tfidfvectorizer__min_df' : [5,10], #discard words that appear in less than 5 or 10 documents
    'tfidfvectorizer__max_df' : [0.80], #exclude words that cross this threshold(how many documents contained a term)
    'tfidfvectorizer__stop_words' : [None],
    'logisticregression__max_iter': [1000]
}

In [239]:
gs_tfidf = GridSearchCV(pipette_tfidf, 
                  pipe_params_tfidf, 
                  cv = 5 ,
                  verbose = 1)

In [240]:
results_tfidf,gs_tfidf = fit_grid_search(gs_tfidf,X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  8.0min finished


Grid search accuracy for estimator [1mlogisticregression[0m transformer [1mtfidfvectorizer[0m : 

On training data is 0.8896900939012274
On test data is 0.8263062007938057
Grid search best score (avg of cv scores) 0.8162034929529354


Model with best fitting parameter is 
 <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.8, max_features=15000,
                                 min_df=10, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                           

In [241]:
print(f"\033[1mTraining set data\033[0m")
actual_v_predicted_train= conf_matrix(gs_tfidf,X_train,y_train)

mask = actual_v_predicted_train[(actual_v_predicted_train['Actual']) != (actual_v_predicted_train['Predicted'])]
mask

print(f"\033[1mTesting set data\033[0m")
actual_v_predicted_test = conf_matrix(gs_tfidf,X_test,y_test)

mask = actual_v_predicted_test[(actual_v_predicted_test['Actual']) != (actual_v_predicted_test['Predicted'])]
mask

[1mTraining set data[0m


Unnamed: 0,Actual,Predicted,Text
2810,-1,1,quarantine jam groove friend dvibesonda denver...
3357,1,-1,feel like helpful distinction maybe harmful ac...
235,0,1,austrian rail company look worker shorttime wo...
5677,0,-1,imply premise original question covid overcod ...
1510,0,1,open prematurely shall place emergency icu ppe...
...,...,...,...
5368,0,1,come people overdose ailment san francisco goo...
6020,0,-1,covid pandemic crisis connected mediafrenzy hi...
127,0,1,tx share need reduce documentation burden set ...
3451,0,-1,easily big benefactor corona virus guy have pe...


[1mTesting set data[0m


Unnamed: 0,Actual,Predicted,Text
2299,0,1,great help boil census want student count covid
2035,1,0,coronavirus newsom say californian expect infe...
3330,0,-1,good chance catch corona virus feeling
8600,0,-1,confirm case covid find america trump virus
4403,0,1,mood bored star rona wonder long s go to s lon...
...,...,...,...
2753,1,0,corona update have get mad compliment face mas...
4949,-1,0,mood day quarantine lockdown passthe corona vi...
12601,-1,0,task california firefighter difficult covid ne...
7723,1,0,confirmen si todo estábamos en la mejor etapa ...


In [323]:
pipette_cvect = make_pipeline(CountVectorizer(),LogisticRegression())
pipe_params_cvect = {
    'countvectorizer__max_features': [15_000,25_000,30_000],
    'countvectorizer__ngram_range':  [(1,2),(1,3)],
    'countvectorizer__min_df' : [5,10], #discard words that appear in less than 5 or 10 documents
    'countvectorizer__max_df' : [0.8], #exclude words that cross this threshold(how many docs contained a term)
    'countvectorizer__stop_words' : [None],
    'logisticregression__max_iter': [1000]
}

In [324]:
gs_cvect = GridSearchCV(pipette_cvect, 
                  pipe_params_cvect, 
                  cv = 5,
                  verbose = 1)

In [325]:
results_cvect,gs_cvect = fit_grid_search(gs_cvect,X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  9.3min finished


Grid search accuracy for estimator [1mlogisticregression[0m transformer [1mcountvectorizer[0m : 

On training data is 0.9624875813223087
On test data is 0.876179322011842
Grid search best score (avg of cv scores) 0.860285807155606


Model with best fitting parameter is 
 <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.8,
                                 max_features=15000, min_df=10,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
         

In [329]:
print(f"\033[1mTraining set data\033[0m")
actual_v_predicted_train= conf_matrix(gs_cvect,X_train,y_train)

mask = actual_v_predicted_train[(actual_v_predicted_train['Actual']) != (actual_v_predicted_train['Predicted'])]
mask

print(f"\033[1mTesting set data\033[0m")
actual_v_predicted_test = conf_matrix(gs_cvect,X_test,y_test)

mask = actual_v_predicted_test[(actual_v_predicted_test['Actual']) != (actual_v_predicted_test['Predicted'])]
mask

[1mTraining set data[0m


Unnamed: 0,Actual,Predicted,Text
2810,-1,0,quarantine jam groove friend dvibesonda denver...
7542,0,-1,break live magic covid quarantinelife
7218,0,-1,rain la not gym covid wait today perfectstorm ...
9917,-1,0,idea wish not consistency base explain go fact...
492,-1,0,rhetoric like carville not help s true express...
...,...,...,...
14281,1,0,appropriate
6936,-1,0,see celebrity etc facetime etc home be realize...
2749,0,1,coincidentally success rate sbs generally unaf...
5329,0,-1,fact asian american asian immigrant deserve th...


[1mTesting set data[0m


Unnamed: 0,Actual,Predicted,Text
2299,0,1,great help boil census want student count covid
4403,0,1,mood bored star rona wonder long s go to s lon...
4710,1,0,be walk dog neighbor start walk close quaranti...
8009,1,0,ios update iphone set include new software dev...
8814,0,1,like airmail covid awful
...,...,...,...
6939,-1,1,dear speakerpelosi impeach donald trump type c...
2364,1,0,patience wear thin sixfeetapart quarantine qua...
2753,1,0,corona update have get mad compliment face mas...
12601,-1,0,task california firefighter difficult covid ne...
