In [1]:
import pandas as pd
import boto3
import requests
from io import StringIO
from botocore.client import Config
from smart_open import smart_open

import random

import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

import pickle

pd.set_option('display.max_colwidth', None)


## Loading Data

In [2]:
root_tweet_clean = pd.read_csv('Datos1_Kike/tweets_cleaned_labeled.csv')

In [3]:
root_tweet_clean

Unnamed: 0,text,clean_text,class
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland,coronavirus new deaths 727 new cases confirmed ireland,-1
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can’t be anyway that you think the sectors you’ve mentioned have shaped the covid response/policy,definitely grenade tweet lobbed twittersphere people go mad atthere can’t anyway think sectors you’ve mentioned shaped covid responsepolicy,-1
2,Everyone in Kolkata now is Covid Aladin.,everyone kolkata covid aladin,0
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories,1
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year.",2020 crazy year bought house 27 partner got married got covid rollercoaster year,-1
...,...,...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment.",course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment,-1
51996,French President Emmanuel Macron tests positive for coronavirus - National |,french president emmanuel macron tests positive coronavirus national,1
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker.",god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker,-1
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs.",abandoning wells anyway ones taking advantage federal program dorin told global news interview words creating jobs,-1


In [4]:
def text_process(mess):
    """
    1. remove punc
    2. remove stop words
    3. return list of clean words
    """
    #print(mess)
    nopunc = [char for char in str(mess) if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    nopunc = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return(nopunc)

In [5]:
root_tweet_clean['new_treat']=root_tweet_clean.clean_text.apply(text_process)
root_tweet_clean['new_treat'] = [' '.join(map(str, l)) for l in root_tweet_clean['new_treat']]

## Creating the  "Bag of Words" transformer 

With the cleanning function previously created

In [6]:
bow_transformer = CountVectorizer().fit(root_tweet_clean['new_treat'])

In [7]:
print(len(bow_transformer.vocabulary_))

60474


In [8]:
bow_transformer.vocabulary_

{'coronavirus': 14135,
 'new': 37347,
 'deaths': 16555,
 '727': 3089,
 'cases': 10997,
 'confirmed': 13540,
 'ireland': 29209,
 'definitely': 16882,
 'grenade': 24980,
 'tweet': 55727,
 'lobbed': 32900,
 'twittersphere': 55786,
 'people': 40958,
 'go': 24445,
 'mad': 33623,
 'atthere': 6820,
 'can': 10574,
 'anyway': 5959,
 'think': 53890,
 'sectors': 48132,
 'you': 60136,
 've': 57349,
 'mentioned': 35000,
 'shaped': 48751,
 'covid': 14513,
 'responsepolicy': 45703,
 'everyone': 20863,
 'kolkata': 31298,
 'aladin': 4842,
 'beckys': 7920,
 'mother': 36276,
 'hasnt': 25804,
 'held': 26173,
 'granddaughter': 24825,
 'yetbalors': 60069,
 'father': 21758,
 'went': 58674,
 'covid19': 14524,
 'triple': 55299,
 'heart': 26075,
 'bypass': 10328,
 'surgeryboth': 52255,
 'parents': 40355,
 'themand': 53653,
 'sure': 52226,
 'theres': 53784,
 'great': 24905,
 'many': 33992,
 'stories': 51497,
 '2020': 1292,
 'crazy': 15319,
 'year': 59930,
 'bought': 9344,
 'house': 27150,
 '27': 1745,
 'partner'

In [9]:
messages_bow = bow_transformer.transform(root_tweet_clean['new_treat'])

In [10]:
messages_bow.shape

(52000, 60474)

## Sparsity

Used to determine the amount of zero coefficients inside our DF 

In [11]:
sparsity = ( 100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1] ) )
print('Sparsity: {}'.format(sparsity))

Sparsity: 0.023024675575771914


## TF-IDF

Tranformer which objective is to determine how relevant/important a word is, from the document the same word belongs to, document can be called "Corpus" as well

In [12]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [13]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]

7.541530114611576

In [14]:
messages_tfidf = tfidf_transformer.transform(messages_bow)

## Random Forest Model

Is suitable for dealing with a high dimensional problem such as text representation, also it can discern between noisy data inside the text calssification 

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
                                                messages_tfidf,
                                                root_tweet_clean['class'],
                                                test_size=0.33,
                                                random_state=42)

stress_detect_model = RandomForestClassifier(random_state=42, n_jobs = -1)
stress_detect_model.fit(X_train,y_train)
stress_detect_model.score(X_test,y_test)

0.724009324009324

In [17]:
mess4 = root_tweet_clean['new_treat'][3]
print(mess4)

beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories


Checking the BoW transformation

In [18]:
bow4 = bow_transformer.transform([mess4])
print(bow4)

  (0, 7920)	1
  (0, 10328)	1
  (0, 14524)	1
  (0, 21758)	1
  (0, 24825)	1
  (0, 24905)	1
  (0, 25804)	1
  (0, 26075)	1
  (0, 26173)	1
  (0, 33992)	1
  (0, 36276)	1
  (0, 40355)	1
  (0, 51497)	1
  (0, 52226)	1
  (0, 52255)	1
  (0, 53653)	1
  (0, 53784)	1
  (0, 55299)	1
  (0, 58674)	1
  (0, 60069)	1


Checking the tf idf transformation

In [19]:
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 60069)	0.30792539006155484
  (0, 58674)	0.16026993048212637
  (0, 55299)	0.26644688676241896
  (0, 53784)	0.16928939717357022
  (0, 53653)	0.30792539006155484
  (0, 52255)	0.30792539006155484
  (0, 52226)	0.15003892486792286
  (0, 51497)	0.18476737636473342
  (0, 40355)	0.18322634012362005
  (0, 36276)	0.1964286574393601
  (0, 33992)	0.12665982518870555
  (0, 26173)	0.20174843378126223
  (0, 26075)	0.18606537466494688
  (0, 25804)	0.20872096827538367
  (0, 24905)	0.14544932041159642
  (0, 24825)	0.2888102117620441
  (0, 21758)	0.21506421716405671
  (0, 14524)	0.06127618483850081
  (0, 10328)	0.2776285492622315
  (0, 7920)	0.30792539006155484


In [20]:
stress_detect_model.predict(tfidf4)[0]

1

In [21]:
all_pred = stress_detect_model.predict(messages_tfidf)
all_pred

array([-1, -1,  0, ..., -1, -1,  1])

In [22]:
msg_train,msg_test,label_train,label_test = train_test_split(root_tweet_clean['new_treat'],root_tweet_clean['class'],test_size=0.2)

## Pipeline Creation

It is used to minimize the use of code, and can be stored to export it to a different application or system

In [23]:
from sklearn.pipeline import Pipeline

In [24]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier(random_state=42, n_jobs = -1))
])

In [25]:
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f8aa8565b70>)),
                ('tfidf', TfidfTransformer()),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42))])

In [26]:
prediction = pipeline.predict(msg_test)

In [27]:
from sklearn.metrics import classification_report
print( classification_report(label_test,prediction) )

              precision    recall  f1-score   support

          -1       0.76      0.84      0.79      5262
           0       0.67      0.75      0.71      3102
           1       0.77      0.41      0.54      2036

    accuracy                           0.73     10400
   macro avg       0.73      0.67      0.68     10400
weighted avg       0.73      0.73      0.72     10400



In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label_test, prediction)

array([[4401,  679,  182],
       [ 695, 2333,   74],
       [ 719,  473,  844]])

In [29]:
from sklearn.metrics import precision_score
precision_score(label_test, prediction,average='weighted')

0.7328116727231215

In [30]:
predictiontrn = pipeline.predict(msg_train)

In [31]:
precision_score(label_train, predictiontrn,average='weighted')

0.999399523133785

In [32]:
root_tweet_clean['class'].value_counts()

-1    26028
 0    15714
 1    10258
Name: class, dtype: int64

In [33]:
print(len(label_test),len(label_train))

10400 41600


In [34]:
root_tweet_clean

Unnamed: 0,text,clean_text,class,new_treat
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland,coronavirus new deaths 727 new cases confirmed ireland,-1,coronavirus new deaths 727 new cases confirmed ireland
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can’t be anyway that you think the sectors you’ve mentioned have shaped the covid response/policy,definitely grenade tweet lobbed twittersphere people go mad atthere can’t anyway think sectors you’ve mentioned shaped covid responsepolicy,-1,definitely grenade tweet lobbed twittersphere people go mad atthere can’t anyway think sectors you’ve mentioned shaped covid responsepolicy
2,Everyone in Kolkata now is Covid Aladin.,everyone kolkata covid aladin,0,everyone kolkata covid aladin
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories,1,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year.",2020 crazy year bought house 27 partner got married got covid rollercoaster year,-1,2020 crazy year bought house 27 partner got married got covid rollercoaster year
...,...,...,...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment.",course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment,-1,course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment
51996,French President Emmanuel Macron tests positive for coronavirus - National |,french president emmanuel macron tests positive coronavirus national,1,french president emmanuel macron tests positive coronavirus national
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker.",god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker,-1,god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs.",abandoning wells anyway ones taking advantage federal program dorin told global news interview words creating jobs,-1,abandoning wells anyway ones taking advantage federal program dorin told global news interview words creating jobs


In [35]:
root_tweet_clean[ root_tweet_clean['clean_text'] != root_tweet_clean['new_treat'] ]

Unnamed: 0,text,clean_text,class,new_treat
790,This.,,0,
1443,,,0,
4535,,,0,
4542,,,0,
5865,,,0,
5867,,,0,
6954,,,0,
7924,More of this?,,0,
7926,More of this?,,0,
7931,More of this?,,0,


In [36]:
root_tweet_clean.iloc[790]

text          This.
clean_text      NaN
class             0
new_treat       nan
Name: 790, dtype: object

In [37]:
original = pd.read_csv('Datos1_Kike/allTweets.csv')

In [39]:
print(original.iloc[51998])

text    "Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program," Dorin told Global News in an interview. "In other words we're not creating any jobs."\nhttps://t.co/OBtWAyVQND
Name: 51998, dtype: object


In [40]:
root_tweet_clean[
    root_tweet_clean['clean_text'].astype('str').str.contains('corona') |
    root_tweet_clean['clean_text'].astype('str').str.contains('cov')
    ]

Unnamed: 0,text,clean_text,class,new_treat
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland,coronavirus new deaths 727 new cases confirmed ireland,-1,coronavirus new deaths 727 new cases confirmed ireland
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can’t be anyway that you think the sectors you’ve mentioned have shaped the covid response/policy,definitely grenade tweet lobbed twittersphere people go mad atthere can’t anyway think sectors you’ve mentioned shaped covid responsepolicy,-1,definitely grenade tweet lobbed twittersphere people go mad atthere can’t anyway think sectors you’ve mentioned shaped covid responsepolicy
2,Everyone in Kolkata now is Covid Aladin.,everyone kolkata covid aladin,0,everyone kolkata covid aladin
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories,1,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year.",2020 crazy year bought house 27 partner got married got covid rollercoaster year,-1,2020 crazy year bought house 27 partner got married got covid rollercoaster year
...,...,...,...,...
51994,Covid19 cases are on the rise.We need everyone’s help to save lives &amp; livelihoods. Please stay home as much as possible and wear a mask if you must go out. Do not gather with other households,covid19 cases risewe need everyone’s help save lives amp livelihoods please stay home much possible wear mask must go gather households,1,covid19 cases risewe need everyone’s help save lives amp livelihoods please stay home much possible wear mask must go gather households
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment.",course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment,-1,course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment
51996,French President Emmanuel Macron tests positive for coronavirus - National |,french president emmanuel macron tests positive coronavirus national,1,french president emmanuel macron tests positive coronavirus national
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker.",god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker,-1,god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker


In [43]:
validation = pd.read_csv('Datos2/twtsForTensi1_out.txt',sep='\t')

In [44]:
rnd1 = random.randint(0,len(validation[validation.Overall == -1]))
mens1,etiqueta1 = validation.Text[validation.Overall == -1].iloc[rnd1],validation.Overall[validation.Overall == -1].iloc[rnd1]
pred1 = pipeline.predict([mens1])
print("Predicción 1 en validación", pred1)
print("Predicción 1 valor real", etiqueta1)

rnd2 = random.randint(0,len(validation[validation.Overall == 0]))
mens2,etiqueta2 = validation.Text[validation.Overall == 0].iloc[rnd2],validation.Overall[validation.Overall == 0].iloc[rnd2]
pred2 = pipeline.predict([mens2])
print("Predicción 1 en validación", pred2)
print("Predicción 1 valor real", etiqueta2)

rnd3 = random.randint(0,len(validation[validation.Overall == 1]))
mens3,etiqueta3 = validation.Text[validation.Overall == 1].iloc[rnd3],validation.Overall[validation.Overall == 1].iloc[rnd3]
pred3 = pipeline.predict([mens3])
print("Predicción 1 en validación", pred3)
print("Predicción 1 valor real", etiqueta3)

print(rnd1,rnd2, rnd3)

Predicción 1 en validación [-1]
Predicción 1 valor real -1
Predicción 1 en validación [0]
Predicción 1 valor real 0
Predicción 1 en validación [-1]
Predicción 1 valor real 1
3471 389 1268


## Saving, loading and putting to the test the Transformers and the Model

In [45]:
pickle.dump(tfidf_transformer, open("tfidf_AnxieTweet.pickle", "wb"))

In [46]:
tfidf_saved = pickle.load(open("tfidf_AnxieTweet.pickle", "rb"))

In [59]:
test_text = "@Anti_SS_69 @alan_gable @cathymg8820 @Kelleyrose20 @NotThisPussyCat @garym27 @AnneJohnson12 @LetsGetBusy_ @ThatsJaimeLeigh @OlivviaDay @Beurybarbara Funny thing is; it's the Cult of Trump Devotees most at risk of catching the Covid-19; due to their extreme ignorance!"

In [48]:
pickle.dump(bow_transformer, open("bow_AnxieTweet.pickle", "wb"))

In [49]:
bow_saved = pickle.load(open("bow_AnxieTweet.pickle", "rb"))

In [60]:
print(bow_saved.transform([test_text]))

  (0, 1030)	1
  (0, 6714)	1
  (0, 11145)	1
  (0, 14513)	1
  (0, 15724)	1
  (0, 17438)	1
  (0, 19044)	1
  (0, 21314)	1
  (0, 23562)	1
  (0, 27764)	1
  (0, 29304)	1
  (0, 29432)	1
  (0, 36259)	1
  (0, 38757)	2
  (0, 46265)	1
  (0, 53524)	2
  (0, 53633)	1
  (0, 53869)	1
  (0, 54429)	1
  (0, 55398)	1


In [51]:
print(bow_transformer.transform([test_text]))

  (0, 5522)	1
  (0, 11553)	1
  (0, 13065)	1
  (0, 13199)	1
  (0, 15334)	1
  (0, 18274)	1
  (0, 22853)	1
  (0, 27211)	1
  (0, 28777)	1
  (0, 28961)	1
  (0, 32916)	1
  (0, 39454)	1
  (0, 39929)	1
  (0, 50169)	1
  (0, 53633)	1
  (0, 54429)	1
  (0, 54507)	1


In [61]:
print(tfidf_saved.transform(bow_saved.transform([test_text])))

  (0, 55398)	0.1224913145559297
  (0, 54429)	0.2060506485171485
  (0, 53869)	0.14683811473007946
  (0, 53633)	0.25675002215927445
  (0, 53524)	0.334257663708377
  (0, 46265)	0.1463484012046026
  (0, 38757)	0.42701940581308634
  (0, 36259)	0.27047423163853973
  (0, 29432)	0.11753248957716222
  (0, 29304)	0.2114420120916078
  (0, 27764)	0.22635295736657304
  (0, 23562)	0.1951571306094435
  (0, 21314)	0.20773036550081206
  (0, 19044)	0.1252414461842721
  (0, 17438)	0.2890968235043007
  (0, 15724)	0.23213229477230418
  (0, 14513)	0.047631360470160644
  (0, 11145)	0.19801799256141545
  (0, 6714)	0.2409581218650738
  (0, 1030)	0.1507485089894769


In [53]:
print(tfidf_transformer.transform(bow_saved.transform([test_text])))

  (0, 54507)	0.1890760637726648
  (0, 54429)	0.23808326112397696
  (0, 53633)	0.296664354173319
  (0, 50169)	0.25441498932444295
  (0, 39929)	0.31818201225344983
  (0, 39454)	0.2473290604410699
  (0, 32916)	0.1857665331400165
  (0, 28961)	0.2078118540986061
  (0, 28777)	0.3035914928454932
  (0, 27211)	0.2605561766852315
  (0, 22853)	0.2486227233997797
  (0, 18274)	0.26255967301297206
  (0, 15334)	0.22337537515427633
  (0, 13199)	0.20558740723951802
  (0, 13065)	0.17109980680961404
  (0, 11553)	0.22004610726212853
  (0, 5522)	0.23250186833449943


In [54]:
stress_detect_model.predict(tfidf_transformer.transform(bow_saved.transform([test_text])))

array([-1])

In [55]:
pickle.dump(stress_detect_model, open("anxieTweet_Prediction_Model.pickle", "wb"))

In [56]:
svd_Pred_Mdl = pickle.load(open("anxieTweet_Prediction_Model.pickle", "rb"))

In [57]:
svd_Pred_Mdl.predict(tfidf_transformer.transform(bow_saved.transform([test_text])))

array([-1])

In [58]:
print(tfidf_transformer.transform(bow_saved.transform([test_text])))

  (0, 54507)	0.1890760637726648
  (0, 54429)	0.23808326112397696
  (0, 53633)	0.296664354173319
  (0, 50169)	0.25441498932444295
  (0, 39929)	0.31818201225344983
  (0, 39454)	0.2473290604410699
  (0, 32916)	0.1857665331400165
  (0, 28961)	0.2078118540986061
  (0, 28777)	0.3035914928454932
  (0, 27211)	0.2605561766852315
  (0, 22853)	0.2486227233997797
  (0, 18274)	0.26255967301297206
  (0, 15334)	0.22337537515427633
  (0, 13199)	0.20558740723951802
  (0, 13065)	0.17109980680961404
  (0, 11553)	0.22004610726212853
  (0, 5522)	0.23250186833449943
