In [1]:
import pandas as pd
import re
import string


import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)


## Data Loading

In [2]:
tweets_csv = pd.read_csv("Datos1_Kike/allTweets.csv")
tweets_csv

Unnamed: 0,text
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland https://t.co/Mf2alfOvUS
1,@saoirse_mchugh This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.\n\nThere can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy
2,Everyone in Kolkata now is Covid Aladin.
3,Becky's mother hasn't held her granddaughter yet.\n\nBalor's father went through COVID-19 and triple heart bypass surgery.\n\nBoth parents are IT to each of them.\n\nAnd I am sure there's a great many stories more.
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year."
...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment. https://t.co/iCJBMXVJJy"
51996,French President Emmanuel Macron tests positive for coronavirus - National | https://t.co/1gRppXhjel https://t.co/vKDazSWZyr
51997,"My god. Will somebody please help @marcorubio untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker. https://t.co/DK8dpI3Hwo"
51998,"""Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program,"" Dorin told Global News in an interview. ""In other words we're not creating any jobs.""\nhttps://t.co/OBtWAyVQND"


## Removing some special characters, brakelines, tabs...

In [3]:
tweets_csv = twetts_csv.replace(['\n','\t','\"','\(','\)'],'', regex=True)

In [4]:
tweets_csv

Unnamed: 0,text
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland https://t.co/Mf2alfOvUS
1,@saoirse_mchugh This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy
2,Everyone in Kolkata now is Covid Aladin.
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year."
...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment. https://t.co/iCJBMXVJJy"
51996,French President Emmanuel Macron tests positive for coronavirus - National | https://t.co/1gRppXhjel https://t.co/vKDazSWZyr
51997,"My god. Will somebody please help @marcorubio untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker. https://t.co/DK8dpI3Hwo"
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs.https://t.co/OBtWAyVQND"


## Removing urls or lavels

Such information is considered not relevant or spurious

In [5]:
def removeUrlsAndLabeling(string):
    strAslist = string.split(' ')
    for i in strAslist:
        if i.startswith('http') or i.startswith('https') or i.startswith('@'):
            strAslist.pop(strAslist.index(i))
    result = ' '.join(strAslist)
    result = re.split('http:\/\/.*', str(result))[0]
    result = re.split('https:\/\/.*', str(result))[0]
        
    return(result)

In [6]:
tweets_csv['text'] = tweets_csv['text'].apply(removeUrlsAndLabeling)

In [7]:
tweets_csv

Unnamed: 0,text
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy
2,Everyone in Kolkata now is Covid Aladin.
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year."
...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment."
51996,French President Emmanuel Macron tests positive for coronavirus - National |
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker."
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs."


In [8]:
tweets_csv['text'].astype('str').str.contains('http')

0        False
1        False
2        False
3        False
4        False
         ...  
51995    False
51996    False
51997    False
51998    False
51999    False
Name: text, Length: 52000, dtype: bool

## Cleaning the tweet

In [9]:
def text_process(mess):
    """
    1. remove punc
    2. remove stop words
    3. return list of clean words
    """
    
    nopunc = [char.lower() for char in mess if char not in string.punctuation + "‚Äú‚Äù"]
    nopunc = ''.join(nopunc)
    nopunc = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return nopunc

In [10]:
tweets_csv['clean_text']=twetts_csv.text.apply(text_process)
tweets_csv['clean_text'] = [' '.join(map(str, l)) for l in tweets_csv['clean_text']]

In [11]:
tweets_csv

Unnamed: 0,text,clean_text
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland,coronavirus new deaths 727 new cases confirmed ireland
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy,definitely grenade tweet lobbed twittersphere people go mad atthere can‚Äôt anyway think sectors you‚Äôve mentioned shaped covid responsepolicy
2,Everyone in Kolkata now is Covid Aladin.,everyone kolkata covid aladin
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year.",2020 crazy year bought house 27 partner got married got covid rollercoaster year
...,...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment.",course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment
51996,French President Emmanuel Macron tests positive for coronavirus - National |,french president emmanuel macron tests positive coronavirus national
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker.",god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs.",abandoning wells anyway ones taking advantage federal program dorin told global news interview words creating jobs


In [12]:
tweets_csv['text'].to_csv('Datos1_Kike/cleanTweetsForTensi.txt',header=None, index=None, sep=' ', mode='w')

In [13]:
labels = pd.read_csv('Datos1_Kike/cleanTweetsForTensi2_out.txt',sep='\t')
labels

Unnamed: 0,Overall,Text
0,-1,Coronavirus: No new deaths and 727 new cases confirmed in Ireland
1,-1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy
2,0,Everyone in Kolkata now is Covid Aladin.
3,1,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.
4,-1,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year."
...,...,...
51995,-1,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment."
51996,1,French President Emmanuel Macron tests positive for coronavirus - National |
51997,-1,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker."
51998,-1,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs."


In [14]:
labels[labels['Overall']==-1]

Unnamed: 0,Overall,Text
0,-1,Coronavirus: No new deaths and 727 new cases confirmed in Ireland
1,-1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy
4,-1,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year."
6,-1,The mutant covid-19 strain in the U.K. has now appeared in about 5 countries. Is 2021 becoming 2020 all over again??ü§î
8,-1,"One whole month, covid deprived us of our last few months with our nana, and I wish I had of known when I saw her it‚Äôd be the last time, that face is in my heart forever üíñ"
...,...,...
51987,-1,Martyn Brown: Horgan‚Äôs callous cut to the COVID income support supplement is unworthy of an NDP government #BritishColumbia #Vancouver #BCNDP #BChealth #BCpoli #COVID19BC #COVID19Vancouver
51991,-1,@GWR1971 It does matter to me since I have a loved one who is currently very sick with covid and I would like for others to not have to go through this.
51995,-1,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment."
51997,-1,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker."


## Joining text with respective calss label

In [15]:
tweets_csv['class']= labels['Overall']

In [16]:
tweets_csv

Unnamed: 0,text,clean_text,class
0,Coronavirus: No new deaths and 727 new cases confirmed in Ireland,coronavirus new deaths 727 new cases confirmed ireland,-1
1,This was definitely a Grenade tweet just lobbed into twittersphere for people to go mad at.There can‚Äôt be anyway that you think the sectors you‚Äôve mentioned have shaped the covid response/policy,definitely grenade tweet lobbed twittersphere people go mad atthere can‚Äôt anyway think sectors you‚Äôve mentioned shaped covid responsepolicy,-1
2,Everyone in Kolkata now is Covid Aladin.,everyone kolkata covid aladin,0
3,Becky's mother hasn't held her granddaughter yet.Balor's father went through COVID-19 and triple heart bypass surgery.Both parents are IT to each of them.And I am sure there's a great many stories more.,beckys mother hasnt held granddaughter yetbalors father went covid19 triple heart bypass surgeryboth parents themand sure theres great many stories,1
4,"2020 has been crazy year, bought a house at 27 with my partner, Got married and Got covid. it's been a rollercoaster year.",2020 crazy year bought house 27 partner got married got covid rollercoaster year,-1
...,...,...,...
51995,"Of course, because who doesn't love a leader who failed to prevent a second COVID-19 wave he knew was coming? It's not like other countries have gotten a handle on this pandemic, proving it's possible while exposing Canada's response as a woefully inadequate deadly embarrassment.",course doesnt love leader failed prevent second covid19 wave knew coming like countries gotten handle pandemic proving possible exposing canadas response woefully inadequate deadly embarrassment,-1
51996,French President Emmanuel Macron tests positive for coronavirus - National |,french president emmanuel macron tests positive coronavirus national,1
51997,"My god. Will somebody please help untwist his shorts? You sit on your fucking hands instead of helping Americans deal with Covid, you tolerate Trump's racism and sexism but you get bent out of shape because you're called a fucker? You fucking #fucker.",god somebody please help untwist shorts sit fucking hands instead helping americans deal covid tolerate trumps racism sexism get bent shape youre called fucker fucking fucker,-1
51998,"Those who were abandoning wells anyway [are] the ones who are taking advantage of this federal program, Dorin told Global News in an interview. In other words we're not creating any jobs.",abandoning wells anyway ones taking advantage federal program dorin told global news interview words creating jobs,-1


## Saving final csv to use it in the model training part

In [17]:
tweets_csv.to_csv('Datos1_Kike/tweets_cleaned_labeled.csv', index=False)