
# <font color=yellow><div align="center">Data processing</div></font>

# Data importation

In [41]:
import pandas as pd
import numpy as np 
import math as mt 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


## <font color=red><div align="left">Training Data</div></font>

In [42]:
# Load the training split; the path is relative to the notebook's working directory.
training_path = "../tweet-sentiment-extraction/train.csv"
Train_df = pd.read_csv(training_path)

In [43]:
# Display the training frame (columns: textID, text, selected_text, sentiment).
Train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


## <font color= "red"><div align = "left"> Test Data</div></font>

In [44]:
# Load the test split; it has no selected_text column (see the output below).
test_path = "../tweet-sentiment-extraction/test.csv"
Test_df = pd.read_csv(test_path)

In [45]:
# Display the test frame (columns: textID, text, sentiment).
Test_df

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive



<font color= "yellow"><div align = "left"> Checking whether the data contains NaN values</div></font>

In [46]:
# Select every training row that has a missing value in at least one column.
Train_df.loc[Train_df.isna().any(axis="columns")]

Unnamed: 0,textID,text,selected_text,sentiment
314,fdb77c3752,,,neutral


In [47]:
# Drop rows containing any missing value (the check above found one, at label 314).
# The surviving rows keep their original labels, so the index now has a gap.
Train_df = Train_df.dropna()

In [48]:
# Select every test row that has a missing value in at least one column.
Test_df.loc[Test_df.isna().any(axis="columns")]

Unnamed: 0,textID,text,sentiment


<font color = "yellow"><div align = "left"> Keeping the relevant columns (getting rid of textID) </div></font>

In [49]:
# Remove the textID column; text, selected_text and sentiment remain.
Train_df = Train_df.drop(columns = ["textID"])

In [50]:
# Preview the first rows to confirm textID is gone.
Train_df.head()

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [51]:
# Remove the textID column from the test frame as well; text and sentiment remain.
Test_df = Test_df.drop(columns = ["textID"])

In [52]:
# Preview the first rows to confirm textID is gone.
Test_df.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


<font color = "yellow"><div align = "left"> Data processing (Stemming and Lemmatization) </div></font>

In [53]:
# Fetch NLTK's stop-word corpus (reported as already up-to-date when present locally)
# and load its English word list; `stops` is later used to filter words before stemming.

nltk.download("stopwords")
stops = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mrabe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
#Tokenization function 

def tokens(words):
    """Normalize a raw string to lowercase, letters-only, single-spaced text.

    Every character other than ``a-z``/``A-Z`` (digits, punctuation,
    whitespace, non-ASCII letters) is replaced by a space, the result is
    lowercased, and runs of whitespace collapse to a single space.

    Despite the name, the return value is one string, not a list of tokens.
    """
    letters_only = re.sub("[^a-zA-Z]"," ", words)
    return " ".join(letters_only.lower().split())

In [55]:
# Clean both text columns of the training data with tokens(): each one gets a
# lowercase, letters-only "_clear" counterpart next to the raw column.
Train_df["text_clear"] = Train_df["text"].map(tokens)
Train_df["selected_text_clear"] = Train_df["selected_text"].map(tokens)

In [56]:
# Clean the test tweets with tokens(), storing the lowercase, letters-only
# result in a "text_clear" column next to the raw text.
Test_df["text_clear"] = Test_df["text"].map(tokens)

In [68]:
# Renumber the rows 0..n-1 after the earlier dropna(). Because drop=True is not
# passed, the previous labels are kept in a new "index" column (see output below).
# NOTE(review): re-running this cell inserts yet another index column; if the
# "index" column is not needed downstream, reset_index(drop=True) avoids both.
Train_df = Train_df.reset_index()
Train_df.head()

Unnamed: 0,index,text,selected_text,sentiment,text_clear,selected_text_clear
0,0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,i d have responded if i were going,i d have responded if i were going
1,1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad i will miss you here in san diego,sooo sad
2,2,my boss is bullying me...,bullying me,negative,my boss is bullying me,bullying me
3,3,what interview! leave me alone,leave me alone,negative,what interview leave me alone,leave me alone
4,4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,sons of why couldn t they put them on the rele...,sons of


In [69]:
# Same reset for the test frame. No test rows were dropped (the NaN check returned
# nothing), so this mainly adds an "index" column holding the previous labels.
# NOTE(review): re-running this cell inserts another index column; confirm whether
# "index" is needed downstream, otherwise reset_index(drop=True) is cleaner.
Test_df = Test_df.reset_index()
Test_df.head()

Unnamed: 0,index,text,sentiment,text_clear
0,0,Last session of the day http://twitpic.com/67ezh,neutral,last session of the day http twitpic com ezh
1,1,Shanghai is also really exciting (precisely -...,positive,shanghai is also really exciting precisely sky...
2,2,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho she has to ...
3,3,happy bday!,positive,happy bday
4,4,http://twitpic.com/4w75p - I like it!!,positive,http twitpic com w p i like it


<font color = "green"><div align = "left"> Stemming</div></font>

In [76]:
#Stemming for training data
#
# For every training tweet: clean it with tokens() (letters only, lowercase),
# drop English stop words, Porter-stem the remaining words, and store the result
# as one space-joined string in corpus_training (one entry per row, in row order).

# Build the stemmer and the stop-word lookup once rather than once per row / per
# word; a set gives constant-time membership tests.
ps = PorterStemmer()
stop_set = set(stops)

corpus_training = []
# Iterate over the column's values instead of looking up labels 0..n-1, so the
# loop does not depend on Train_df having been re-indexed after dropna().
for raw_text in Train_df["text"]:
    words = tokens(raw_text).split()
    stemmed = [ps.stem(word) for word in words if word not in stop_set]
    corpus_training.append(" ".join(stemmed))