# Preprocessing sentiment140-data that is used to train and test the models

importing all necessary libraries

In [137]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

setting to show full text content

## importing data and first edits

In [138]:
pd.set_option('display.max_colwidth', None)

importing sentiment140 dataset without header and first analyzation 

In [139]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding="latin1", header=None)

In [140]:
df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


dropping unnecessary columns, only keeping target (as label) and text (as feature)

In [141]:
df = df.drop([1, 2,3, 4], axis =1)

In [142]:
new_header = ['target', 'Text']
df.columns = new_header

In [143]:
df.head()

Unnamed: 0,target,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [144]:
df['Text']

0          @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
1              is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
2                                    @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
3                                                                              my whole body feels itchy and like its on fire 
4              @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
                                                                  ...                                                         
1599995                                                               Just woke up. Having no school is the best feeling ever 
1599996                                         TheWDB.com - Very cool to hear old Walt interviews!  â« http:/

watching for duplicates

In [145]:
duplicates = df[df['Text'].duplicated(keep=False)]

duplicates["target"].value_counts()

target
0    15683
4    11285
Name: count, dtype: int64

## building vectorizer

vectorizer created and exported (needed for some models)

In [146]:
# texts have to be in number format and must therefore be vectorized
vectorizer = TfidfVectorizer()

# astype(str) is necesarry, since otherwise some fields get interpreted as floats
vectorizer.fit(df['Text'].astype(str))

# exporting created vectorizer in pkl format
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

### Ground-Truth-Label

adding Ground-Truth-Label to dataset to later use this column to compare predictions from the models and pre-defined labels

In [147]:
def map_target_to_label(target):
    if target == 4:
        return 'positive'
    elif target == 0:
        return 'negative'
    else:
        return None

In [148]:
df['Ground_Truth_Label'] = df['target'].apply(map_target_to_label)

## testdata and traindata splitting

counting rows of target column to later confirm correct splitting 0 - negative 4 - positive

In [149]:
df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [150]:
x = df[['Text', 'Ground_Truth_Label']]
y = df['target']

In [151]:
feature_train, feature_test, label_train, label_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [152]:
label_train_df = pd.DataFrame(label_train, columns=['target'])
train_data = pd.concat([feature_train, label_train_df], axis=1)

In [153]:
label_test_df = pd.DataFrame(label_test, columns=['target'])
test_data = pd.concat([feature_test, label_test_df], axis=1)

#### Dev Set Splitting

the Dev Set is used for Hyperparametertuning for ML-Based Approaches

In [154]:
feature = train_data[['Text', 'Ground_Truth_Label']]
label = train_data['target']

In [155]:
feature_train, feature_dev, label_train, label_dev = train_test_split(feature, label, test_size=0.24, random_state=1, stratify=label)

In [156]:
label_train_df = pd.DataFrame(label_train, columns=['target'])
train_data = pd.concat([feature_train, label_train_df], axis=1)

label_dev_df = pd.DataFrame(label_dev, columns=['target'])
dev_data = pd.concat([feature_dev, label_dev_df], axis=1)

comparing rows of train_data to confirm correct splitting

In [157]:
train_data

Unnamed: 0,Text,Ground_Truth_Label,target
711720,@ddlovato wow Not wise of your fear of the elevators incredible! i love you demi !,negative,0
315052,"@rimamelati hahaha! i'm so so bored. what are you doing? haha, the plane crash is so so scary",negative,0
1443071,"@BrendaSanDiego Sure, i'm following you now",positive,4
555282,"@yum9me does your itunes say that you can download OS3 yet, because mine says my OS is up to date",negative,0
153944,@PetiteAntoin my mom's French Mauritian and I can't speak French Good luck learning it,negative,0
...,...,...,...
552842,@yelyahwilliams I asked 400 of the staff at ACC to bring my cd back to get signed but they all said no Please Please sign my cd for me?,negative,0
1068505,@arosefull love it. i am a sucker for the romantic comedy. when harry met sally is my favorite movie.,positive,4
1407334,@LisaWorld thx for the props &amp; u have a new listener â« http://blip.fm/~7quc9,positive,4
199494,"@ddlovato yesterday &quot;sonny with a chance&quot; came to brazil, i loved it!! you're amazing &lt;33 please reply",negative,0


In [158]:
train_data['target'].value_counts()

target
0    486400
4    486400
Name: count, dtype: int64

comparing rows of test_data to confirm correct splitting

In [159]:
test_data

Unnamed: 0,Text,Ground_Truth_Label,target
200373,would like to have even a little bit of time off work. Laying out until 11:30 then work 12:30-8:30.,negative,0
1279087,Looking forward to keeping in touch with Naomi.. It has been a long time since we chatted!,positive,4
44625,"@NBATVAndre No, in Germany the NBA has no airtime . But I got the International LP so I will watch the game for sure.",negative,0
1353688,"@AliChemist Your future self's in trouble, then! My favourite Bagley is probably Running Blind, in Iceland. Terrific.",positive,4
1335350,@remco_dekker did you know there is a reason why Opera is only popular the mobile platform..? It's because Firefox rocks its ass!,positive,4
...,...,...,...
1144572,@ttmhand ???????,positive,4
796530,had no idea on what to wear tomorrow !,negative,0
171197,@sweet_pea00 darn i will think of you on our journey so it's like you are really there,negative,0
1322000,"oh my, Calvin Harris called me a DAFT BASTARD. I feel so cool, this is my new claim to fame.",positive,4


In [160]:
test_data["target"].value_counts()

target
0    160000
4    160000
Name: count, dtype: int64

## export preprocessed data without stopword-cleaning

exporting testdata

In [161]:
test_data.to_csv('testdata_with_stopwords_preprocessed.csv', index=False)

exporting traindata

In [162]:
train_data.to_csv('traindata_with_stopwords_preprocessed.csv', index=False)

exporting dev set data

In [163]:
dev_data.to_csv('devdata_with_stopwords_preprocessed.csv', index=False)

## stopword-cleaning

clean_text function to delete unnecessary symbols, words and punctuations that don't add specific sentiment value to the text to compare the performancee to other datasets where the stopword-cleaning has not been performed

In [164]:
def clean_text(text):
    stop_words = stopwords.words('english')
    text = re.sub(r'[@#]\w+', ' ', text)  
    text = re.sub(r'https?://\S+', ' ', text)  
    text = re.sub(r"\b\w+['’]\w+\b", ' ', text)  
    text = re.sub(rf"[{punctuation}]", ' ', text)  
    words = text.split()  
    words = [word for word in words if word.lower() not in stop_words]  
    return ' '.join(words)

In [171]:
test_data_without_stopwords = test_data
train_data_without_stopwords = train_data 
dev_data_without_stopwords = dev_data 

In [166]:
test_data_without_stopwords['Text'] = test_data_without_stopwords['Text'].apply(clean_text)

In [167]:
test_data_without_stopwords

Unnamed: 0,Text,Ground_Truth_Label,target
200373,would like even little bit time work Laying 11 30 work 12 30 8 30,negative,0
1279087,Looking forward keeping touch Naomi long time since chatted,positive,4
44625,Germany NBA airtime got International LP watch game sure,negative,0
1353688,future trouble favourite Bagley probably Running Blind Iceland Terrific,positive,4
1335350,know reason Opera popular mobile platform Firefox rocks ass,positive,4
...,...,...,...
1144572,,positive,4
796530,idea wear tomorrow,negative,0
171197,darn think journey like really,negative,0
1322000,oh Calvin Harris called DAFT BASTARD feel cool new claim fame,positive,4


In [168]:
train_data_without_stopwords['Text'] = train_data_without_stopwords['Text'].apply(clean_text)

In [169]:
train_data_without_stopwords

Unnamed: 0,Text,Ground_Truth_Label,target
711720,wow wise fear elevators incredible love demi,negative,0
315052,hahaha bored haha plane crash scary,negative,0
1443071,Sure following,positive,4
555282,itunes say download OS3 yet mine says OS date,negative,0
153944,French Mauritian speak French Good luck learning,negative,0
...,...,...,...
552842,asked 400 staff ACC bring cd back get signed said Please Please sign cd,negative,0
1068505,love sucker romantic comedy harry met sally favorite movie,positive,4
1407334,thx props amp u new listener â«,positive,4
199494,yesterday quot sonny chance quot came brazil loved amazing lt 33 please reply,negative,0


In [172]:
dev_data_without_stopwords['Text'] = dev_data_without_stopwords['Text'].apply(clean_text)

In [173]:
dev_data

Unnamed: 0,Text,Ground_Truth_Label,target
895910,awesome bbq boys girls,positive,4
1475700,Sleepover like 12 almost 1 talking txting,positive,4
632752,bf weave haha sometimes drivers see us move pass us REALLY closely scary,negative,0
1146871,Seeing everyone,positive,4
633427,going sleeeeep super tired miss daddy gone one day,negative,0
...,...,...,...
1501743,first time know long client work weekend Woo Hoo 2 days,positive,4
1390186,party tonight drunkness haha x,positive,4
397195,love Bodrum although many people lack tourists right Sad Getting ready Antalya Sydney yeay,negative,0
1582223,hate working office smells like poo knows things know however love dunking donuts,positive,4


## export preprocessed data with stopword-cleaning

In [174]:
test_data.to_csv('testdata_without_stopwords_preprocessed.csv', index=False)

In [175]:
train_data.to_csv('traindata_without_stopwords_preprocessed.csv', index=False)

In [176]:
dev_data.to_csv('devdata_without_stopwords_preprocessed.csv', index=False)