In [42]:
#Python
import numpy as np
import pandas as pd
import time
import regex as reg
import os
from pathlib import Path
#NLP
import spacy as sp
from nltk.corpus import opinion_lexicon
#sklearn
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
#Serialization 
import dill

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
dill.dump_session('tweet_sentiment.db')

In [7]:
len(opinion_lexicon.negative())
len(opinion_lexicon.positive())
#for w in range(len(opinion_lexicon.negative())):
#              opinion_lexicon.negative()[w]

4783

2006

In [23]:
filepath = Path.home() 
path = Path(filepath)
list_of_files = path / 'DSI/ClientProject/Tweets_511'

In [14]:
tweet_df = pd.DataFrame()
for p in list_of_files.iterdir():
    temp_df = pd.read_csv(p,usecols = ['tweet_id','text','region'])
    tweet_df = tweet_df.append(temp_df)

In [310]:
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130092 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tweet_id  130092 non-null  object
 1   text      130089 non-null  object
 2   region    130086 non-null  object
dtypes: object(3)
memory usage: 4.0+ MB


tweet_id    0
text        3
region      6
dtype: int64

In [15]:
tweet_df.drop_duplicates(subset = 'tweet_id',inplace=True)

In [16]:
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92325 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  92325 non-null  object
 1   text      92323 non-null  object
 2   region    92321 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


tweet_id    0
text        2
region      4
dtype: int64

In [17]:
tweet_df.dropna(inplace=True)
tweet_df.info()
tweet_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92319 entries, 0 to 17
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  92319 non-null  object
 1   text      92319 non-null  object
 2   region    92319 non-null  object
dtypes: object(3)
memory usage: 2.8+ MB


tweet_id    0
text        0
region      0
dtype: int64

In [46]:
tweet_df.reset_index(inplace=True)
tweet_df = tweet_df.copy()

In [19]:
tweet_df.head(15)

Unnamed: 0,index,tweet_id,text,region
0,0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA
1,1,1246587645779574784,Church congregants insisting on attending serv...,LA
2,2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA
3,3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA
4,4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA
5,5,1246584366471376896,"El Covid-19 iba a llamarse Dolly Parton, pero ...",LA
6,6,1246584353821315073,20-20 is perfect Vision but as a year 2020 is...,LA
7,7,1246583763267514368,S/o to the homie @yerrrchubbs for being on liv...,LA
8,8,1246583102534639617,Going on a road trip fuck covid,LA
9,9,1246582098523115520,@greggutfeld We are given these numbers about ...,LA


In [20]:
spacy_nlp = sp.load('en_core_web_sm')
processed_tweet = []

def clean_tweet(text):

#replace anything that isn't a-z or A-Z with spaces including punctuations, exclamation etc
#replace 'https://', 'www.' by space 
    ret_str = ' '
    regex = r"(.\\n|.\\t|.\\r)|([0-9]+)|([^\w\s])|(https{0,1}:\/\/\S*)|(ww[wa-zA-Z0-9.com]+)|(pic.twitter\S*)"
    ret_str = reg.sub(regex, '', str(text))
    ret_str = ret_str.rstrip() 
    ret_str = ret_str.lstrip()
    ret_str = ret_str.lower()
    doc = spacy_nlp(ret_str)
    all_lemmas =  [token.lemma_ for token in doc if  not token.is_stop \
                   and token.is_alpha and token.lemma_ != '-PRON-']
    processed_tweet.append(" ".join(all_lemmas)) # tweet cleaned up,tokenized.Copy each tweet as a string to the list
    return all_lemmas #tweet cleaned up, tokenized return a list of tokens

In [21]:
def process_tweet():
    clean_tweets = []
    start = time.perf_counter()
    counter = 0
    for row in tweet_df['text']:
        clean_tweets.append(clean_tweet(row))
        counter += 1
        
    print(f"Processed {counter} rows in training data")

    end = time.perf_counter()

    print(f"Took { round((end-start)/60,0)} minutes to clean")
    return clean_tweets

In [22]:
tweet_tokens = process_tweet()

Processed 92319 rows in training data
Took 14.0 minutes to clean


In [24]:
count = 0 
sentiment = []
start = time.perf_counter()

for tokens in tweet_tokens:
    positive_words = 0
    negative_words = 0
    count += 1
    for i in range(len(tokens)):
        if tokens[i] in opinion_lexicon.positive():
            positive_words += 1
        elif tokens[i] in opinion_lexicon.negative():
            negative_words += 1
        
    if positive_words > negative_words:
        sentiment.append(1) #postive
    elif positive_words < negative_words:
        sentiment.append(-1) #negative
    elif positive_words == negative_words:
        sentiment.append(0) #neutral
        
    if count%10_000 == 0:
        print(f"Processed {count} rows")
            
end = time.perf_counter()
print(f"Took { round((end-start)/60,0)} minutes for sentiment analysis")          

len(sentiment)

Processed 10000 rows
Processed 20000 rows
Processed 30000 rows
Processed 40000 rows
Processed 50000 rows
Processed 60000 rows
Processed 70000 rows
Processed 80000 rows
Processed 90000 rows
Took 276.0 minutes for sentiment analysis


92319

In [25]:
tweet_df['tweettokens'] = tweet_tokens
tweet_df['processedtweet'] = processed_tweet
tweet_df['sentiment'] = sentiment

In [26]:
tweet_df.info()
tweet_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92319 entries, 0 to 92318
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           92319 non-null  int64 
 1   tweet_id        92319 non-null  object
 2   text            92319 non-null  object
 3   region          92319 non-null  object
 4   tweettokens     92319 non-null  object
 5   processedtweet  92319 non-null  object
 6   sentiment       92319 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


Unnamed: 0,index,tweet_id,text,region,tweettokens,processedtweet,sentiment
0,0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA,"[shaukat, khanum, memorial, cancer, hospital, ...",shaukat khanum memorial cancer hospital offer ...,0
1,1,1246587645779574784,Church congregants insisting on attending serv...,LA,"[church, congregant, insist, attend, service, ...",church congregant insist attend service wake c...,0
2,2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA,"[render, new, unit, affordable, housing, proje...",render new unit affordable housing project hav...,-1
3,3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA,"[double, date, covidstyle]",double date covidstyle,0
4,4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA,"[si, tuvieran, que, sacrificar, un, pueblo, pa...",si tuvieran que sacrificar un pueblo para acab...,0


In [27]:
tweet_df['sentiment'].value_counts(normalize = True)
# 0 - neutral, 1 - positive, -1 - negative

 0    0.373303
-1    0.322675
 1    0.304022
Name: sentiment, dtype: float64

In [28]:
tweet_df.loc[0]

index                                                             0
tweet_id                                        1246588222903214080
text              Shaukat Khanum Memorial Cancer Hospital offers...
region                                                           LA
tweettokens       [shaukat, khanum, memorial, cancer, hospital, ...
processedtweet    shaukat khanum memorial cancer hospital offer ...
sentiment                                                         0
Name: 0, dtype: object

In [29]:
X = tweet_df['processedtweet']
y = tweet_df['sentiment']

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33,random_state = 111)
X_train.shape
X_test.shape
y_test.shape
y_train.shape

(61853,)

(30466,)

(30466,)

(61853,)

In [31]:
def conf_matrix(gsobject,xdata,ydata):
    predicted = gsobject.predict(xdata)
    actual_predicted = pd.DataFrame({"Actual" : ydata, "Predicted": predicted, 'Text': xdata})
    return actual_predicted

In [32]:
#gs_type = grid search object
#xtraindata,xtestdata = xtrain and xtest 
#ytraindata,ytest = ytrain and ytest
def fit_grid_search(gs_type,xtraindata,ytraindata,xtestdata,ytestdata):
    start = time.perf_counter()
    gs_type.fit(xtraindata,ytraindata)
    print(f"Grid search accuracy for estimator \033[1m{gs_type.estimator.steps[1][0]}\033[0m "
          f"transformer \033[1m{gs_type.estimator.steps[0][0]}\033[0m : \n")
    print(f"On training data is {gs_type.score(xtraindata,ytraindata)}")
    print(f"On test data is {gs_type.score(xtestdata,ytestdata)}")
    print(f"Grid search best score (avg of cv scores) {gs_type.best_score_}\n\n")
    print(f"Model with best fitting parameter is \n {gs_type.best_estimator_.get_params}\n\n")
    #print(gs_type.best_estimator_.named_steps['logisticregression'].coef_)
    end = time.perf_counter()
    print(f"Took \033[1m{ round((end-start)/60,0)}\033[0m minutes to complete")
    cv_results_df = pd.DataFrame(gs_type.cv_results_)
    return cv_results_df,gs_type

In [33]:
pipette_tfidf = make_pipeline(TfidfVectorizer(),LogisticRegression())
pipe_params_tfidf = {
    'tfidfvectorizer__max_features': [15_000,25_000,30_000],
    'tfidfvectorizer__ngram_range': [ (1,2),(1,3)],
    'tfidfvectorizer__min_df' : [5,10], #discard words that appear in less than 5 or 10 documents
    'tfidfvectorizer__max_df' : [0.80], #exclude words that cross this threshold(how many documents contained a term)
    'tfidfvectorizer__stop_words' : [None],
    'logisticregression__max_iter': [1000]
}

In [34]:
gs_tfidf = GridSearchCV(pipette_tfidf, 
                  pipe_params_tfidf, 
                  cv = 5 ,
                  verbose = 1)

In [35]:
results_tfidf,gs_tfidf = fit_grid_search(gs_tfidf,X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  9.4min finished


Grid search accuracy for estimator [1mlogisticregression[0m transformer [1mtfidfvectorizer[0m : 

On training data is 0.8878954941554977
On test data is 0.828169106545001
Grid search best score (avg of cv scores) 0.8144632448419834


Model with best fitting parameter is 
 <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.8, max_features=15000,
                                 min_df=10, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                            

In [36]:
print(f"\033[1mTraining set data\033[0m")
actual_v_predicted_train= conf_matrix(gs_tfidf,X_train,y_train)

mask = actual_v_predicted_train[(actual_v_predicted_train['Actual']) != (actual_v_predicted_train['Predicted'])]
mask

print(f"\033[1mTesting set data\033[0m")
actual_v_predicted_test = conf_matrix(gs_tfidf,X_test,y_test)

mask = actual_v_predicted_test[(actual_v_predicted_test['Actual']) != (actual_v_predicted_test['Predicted'])]
mask

[1mTraining set data[0m


Unnamed: 0,Actual,Predicted,Text
7580,0,1,let support elon musk elonmusk battle leviatha...
89173,1,0,acid queen retake covid queen be covid pay tes...
23513,0,1,agree gov newsom issue pretty good job handle ...
47415,-1,0,painting covid era desert dusk oil canvas x
36882,1,0,shiny shop shuttered beverlyhill covid pandemi...
...,...,...,...
35662,0,-1,take action ask ny legislature governor cuomo ...
19595,0,1,potus know old citizen age group infect covid ...
38497,-1,0,confusion date federal filling payment july th...
28118,1,0,have quarantine year not wanna motherfucker st...


[1mTesting set data[0m


Unnamed: 0,Actual,Predicted,Text
26903,1,0,joebiden human sane person vote governor covid...
21546,0,-1,bernie pony cash corona virus vaccine free
16171,0,1,trump say restart economy brutish political id...
50943,0,1,sure protect covid burn clothe end day add cup...
85249,1,0,wonderful world hưng blackheart diy video stay...
...,...,...,...
82183,-1,0,cmon jam not close able american social distan...
21094,1,-1,ha right trump assure corona virus problem lov...
3373,0,1,like bad bewitch spinoff karma twin cousin cor...
15814,0,1,not miss tomorrow covid webinar great panel im...


In [37]:
pipette_cvect = make_pipeline(CountVectorizer(),LogisticRegression())
pipe_params_cvect = {
    'countvectorizer__max_features': [15_000,25_000,30_000],
    'countvectorizer__ngram_range':  [(1,2),(1,3)],
    'countvectorizer__min_df' : [5,10], #discard words that appear in less than 5 or 10 documents
    'countvectorizer__max_df' : [0.8], #exclude words that cross this threshold(how many docs contained a term)
    'countvectorizer__stop_words' : [None],
    'logisticregression__max_iter': [1000]
}

In [38]:
gs_cvect = GridSearchCV(pipette_cvect, 
                  pipe_params_cvect, 
                  cv = 5,
                  verbose = 1)

In [39]:
results_cvect,gs_cvect = fit_grid_search(gs_cvect,X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 11.0min finished


Grid search accuracy for estimator [1mlogisticregression[0m transformer [1mcountvectorizer[0m : 

On training data is 0.962168366934506
On test data is 0.8778966716995995
Grid search best score (avg of cv scores) 0.859925934430714


Model with best fitting parameter is 
 <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.8,
                                 max_features=15000, min_df=10,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
         

In [41]:
print(f"\033[1mTraining set data\033[0m")
actual_v_predicted_train= conf_matrix(gs_cvect,X_train,y_train)

mask = actual_v_predicted_train[(actual_v_predicted_train['Actual']) != (actual_v_predicted_train['Predicted'])]
mask

print(f"\033[1mTesting set data\033[0m")
actual_v_predicted_test = conf_matrix(gs_cvect,X_test,y_test)

mask = actual_v_predicted_test[(actual_v_predicted_test['Actual']) != (actual_v_predicted_test['Predicted'])]
mask

[1mTraining set data[0m


Unnamed: 0,Actual,Predicted,Text
89173,1,0,acid queen retake covid queen be covid pay tes...
23513,0,1,agree gov newsom issue pretty good job handle ...
36882,1,0,shiny shop shuttered beverlyhill covid pandemi...
35999,0,1,imagine work class people not child test covid...
6890,-1,0,impact covid discriminate ensure equal access ...
...,...,...,...
2338,1,0,go iphone photo delete dumb one perfect activi...
85654,1,0,artist lover need voice hear keepartswork arts...
21736,0,-1,defeat corona virus
79942,0,1,not sound good coronavirus alert rare syndrome...


[1mTesting set data[0m


Unnamed: 0,Actual,Predicted,Text
85249,1,0,wonderful world hưng blackheart diy video stay...
80757,0,1,lose job covid yes buy ticket good friend kare...
13656,-1,0,governor executive order prevent community spr...
6353,0,1,kind slap face work sacrifice wear mask opinio...
61613,0,1,happy friday hopefully sun weekend remember lo...
...,...,...,...
20506,0,-1,feel like reason response covid bad realdonald...
35143,0,1,hop find path forward allow continue dogather ...
3373,0,1,like bad bewitch spinoff karma twin cousin cor...
15814,0,-1,not miss tomorrow covid webinar great panel im...


In [48]:
output_df = pd.merge(tweet_df,actual_v_predicted_test[['Predicted']],how = 'left', left_index= True, right_index=True)
output_df = pd.merge(output_df,actual_v_predicted_train[['Predicted']],how = 'left', left_index= True, right_index=True)

In [49]:
output_df.head()
output_df.tail()
output_df.info()

Unnamed: 0,level_0,index,tweet_id,text,region,tweettokens,processedtweet,sentiment,Predicted_x,Predicted_y
0,0,0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA,"[shaukat, khanum, memorial, cancer, hospital, ...",shaukat khanum memorial cancer hospital offer ...,0,0.0,
1,1,1,1246587645779574784,Church congregants insisting on attending serv...,LA,"[church, congregant, insist, attend, service, ...",church congregant insist attend service wake c...,0,,0.0
2,2,2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA,"[render, new, unit, affordable, housing, proje...",render new unit affordable housing project hav...,-1,-1.0,
3,3,3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA,"[double, date, covidstyle]",double date covidstyle,0,,0.0
4,4,4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA,"[si, tuvieran, que, sacrificar, un, pueblo, pa...",si tuvieran que sacrificar un pueblo para acab...,0,,0.0


Unnamed: 0,level_0,index,tweet_id,text,region,tweettokens,processedtweet,sentiment,Predicted_x,Predicted_y
92314,92314,13,1259997652449669121,A possibility that #COVID19 survivors’ blood c...,SF,"[possibility, covid, survivor, blood, develop,...",possibility covid survivor blood develop treat...,-1,0.0,
92315,92315,14,1259996984682872833,Stepping up to provide #DirectCashAssistance t...,SF,"[step, provide, directcashassistance, help, la...",step provide directcashassistance help la crea...,0,,0.0
92316,92316,15,1259996804856401920,Free webinar this Friday on how to pivot your ...,SF,"[free, webinar, friday, pivot, smallbiz, join,...",free webinar friday pivot smallbiz join pcvtwe...,1,,1.0
92317,92317,16,1259996787496022016,What support looks like in action — @jihern_ &...,SF,"[support, look, like, action, jihern, poufu, l...",support look like action jihern poufu learn mo...,1,,1.0
92318,92318,17,1259996720060039168,‘YodiYil’ Offers #Free #GPSTracker For Custome...,SF,"[yodiyil, offer, free, gpstracker, customer, d...",yodiyil offer free gpstracker customer directl...,1,,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92319 entries, 0 to 92318
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   level_0         92319 non-null  int64  
 1   index           92319 non-null  int64  
 2   tweet_id        92319 non-null  object 
 3   text            92319 non-null  object 
 4   region          92319 non-null  object 
 5   tweettokens     92319 non-null  object 
 6   processedtweet  92319 non-null  object 
 7   sentiment       92319 non-null  int64  
 8   Predicted_x     30466 non-null  float64
 9   Predicted_y     61853 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 7.0+ MB


In [51]:
output_df.drop(columns = 'level_0',inplace = True)

In [52]:
output_df.head()
output_df.tail()
output_df.info()

Unnamed: 0,index,tweet_id,text,region,tweettokens,processedtweet,sentiment,Predicted_x,Predicted_y
0,0,1246588222903214080,Shaukat Khanum Memorial Cancer Hospital offers...,LA,"[shaukat, khanum, memorial, cancer, hospital, ...",shaukat khanum memorial cancer hospital offer ...,0,0.0,
1,1,1246587645779574784,Church congregants insisting on attending serv...,LA,"[church, congregant, insist, attend, service, ...",church congregant insist attend service wake c...,0,,0.0
2,2,1246585369899892738,Rendering of a new 44-unit affordable housing ...,LA,"[render, new, unit, affordable, housing, proje...",render new unit affordable housing project hav...,-1,-1.0,
3,3,1246584511732695040,"Double date, covid-style. pic.twitter.com/LWh2...",LA,"[double, date, covidstyle]",double date covidstyle,0,,0.0
4,4,1246584386583068672,Si tuvieran que sacrificar un pueblo para acab...,LA,"[si, tuvieran, que, sacrificar, un, pueblo, pa...",si tuvieran que sacrificar un pueblo para acab...,0,,0.0


Unnamed: 0,index,tweet_id,text,region,tweettokens,processedtweet,sentiment,Predicted_x,Predicted_y
92314,13,1259997652449669121,A possibility that #COVID19 survivors’ blood c...,SF,"[possibility, covid, survivor, blood, develop,...",possibility covid survivor blood develop treat...,-1,0.0,
92315,14,1259996984682872833,Stepping up to provide #DirectCashAssistance t...,SF,"[step, provide, directcashassistance, help, la...",step provide directcashassistance help la crea...,0,,0.0
92316,15,1259996804856401920,Free webinar this Friday on how to pivot your ...,SF,"[free, webinar, friday, pivot, smallbiz, join,...",free webinar friday pivot smallbiz join pcvtwe...,1,,1.0
92317,16,1259996787496022016,What support looks like in action — @jihern_ &...,SF,"[support, look, like, action, jihern, poufu, l...",support look like action jihern poufu learn mo...,1,,1.0
92318,17,1259996720060039168,‘YodiYil’ Offers #Free #GPSTracker For Custome...,SF,"[yodiyil, offer, free, gpstracker, customer, d...",yodiyil offer free gpstracker customer directl...,1,,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92319 entries, 0 to 92318
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           92319 non-null  int64  
 1   tweet_id        92319 non-null  object 
 2   text            92319 non-null  object 
 3   region          92319 non-null  object 
 4   tweettokens     92319 non-null  object 
 5   processedtweet  92319 non-null  object 
 6   sentiment       92319 non-null  int64  
 7   Predicted_x     30466 non-null  float64
 8   Predicted_y     61853 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 6.3+ MB


In [53]:
output_df.to_csv('model_loc_sentiment.csv',index = False)