In [32]:
import pandas as pd # for data processing
import string # collection of alphabets, words or other characters
import re # regular expression support
import nltk
from nltk.corpus import stopwords
from string import punctuation 
from sklearn.model_selection import train_test_split


from sklearn.feature_extraction.text import TfidfVectorizer # to transfrom the text into numbers
import pickle # to export the model for training/testing

In [33]:
pd.set_option('display.max_colwidth', None)

In [34]:
# The file is read with header=None, so pandas labels the columns with
# integers; the cells below refer to them by those integer labels.
df = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None)

In [4]:
df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! ♫ http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


In [37]:
# Discard columns 1-4; only column 0 (sentiment code) and column 5 (tweet
# text) are used further down.
df = df.drop(columns=[1, 2, 3, 4])

In [38]:
# Give the two remaining columns descriptive names.
new_header = ['target', 'Text']
df.columns = new_header

In [39]:
df.head()

Unnamed: 0,target,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [8]:
df['Text']

0          @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
1              is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
2                                    @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
3                                                                              my whole body feels itchy and like its on fire 
4              @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
                                                                  ...                                                         
1599995                                                               Just woke up. Having no school is the best feeling ever 
1599996                                           TheWDB.com - Very cool to hear old Walt interviews!  ♫ http:/

## Vectorizer created and exported

In [41]:
# Texts must be turned into numeric features, so a TF-IDF vectorizer is fitted
# on the tweet text.
# NOTE(review): the vectorizer is fitted on the complete df['Text'] column,
# i.e. on rows that later end up in the test split as well. Vocabulary and IDF
# weights therefore contain test-set information (data leakage). Consider
# fitting on the training split only, after train_test_split has run.
# NOTE(review): it is also fitted on the raw text, not on the output of
# clean_text -- confirm this matches what the downstream model is fed.
vectorizer = TfidfVectorizer()
# astype(str) is necessary
# (presumably guards against non-string values in the column -- TODO confirm)
vectorizer.fit(df['Text'].astype(str))

# Persist the fitted vectorizer so it can be reloaded elsewhere.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

## Show the number of labels (0 = negative, 4 = positive)

In [9]:
df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

## Split into training and test sets

In [10]:
# Features are the tweet texts, labels are the numeric sentiment codes.
X = df['Text']
y = df['target']

In [11]:
# Hold out 20% of the rows as test data. Stratifying on the label preserves
# the class proportions in both splits, and the fixed random_state makes the
# split reproducible.
feature_train, feature_test, label_train, label_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [12]:
# Put the training texts and their labels side by side in one frame.
train_data = pd.concat([feature_train, label_train], axis=1)

In [13]:
# Put the test texts and their labels side by side in one frame.
test_data = pd.concat([feature_test, label_test], axis=1)

### Adding Ground Truth Label

In [14]:
def map_target_to_label(target):
    """Translate a numeric sentiment code into its text label.

    Args:
        target: Sentiment code; 4 stands for positive, 0 for negative.

    Returns:
        'positive' or 'negative' for the two known codes, otherwise None.
    """
    # Checked in the same order as the codes are listed; equality (not
    # identity or hashing) decides the match.
    for code, label in ((4, 'positive'), (0, 'negative')):
        if target == code:
            return label
    return None

In [15]:
# Derive the human-readable label column from the numeric code, element by element.
train_data['Ground_Truth_Label'] = train_data['target'].map(map_target_to_label)

In [16]:
# Derive the human-readable label column from the numeric code, element by element.
test_data['Ground_Truth_Label'] = test_data['target'].map(map_target_to_label)

In [17]:
train_data

Unnamed: 0,Text,target,Ground_Truth_Label
999032,Rise and shining lol. I'm up a few mins later then planned but no rush i'll be out the door in time,4,positive
1045663,Had a 2nd interview today. It's looking promising.,4,positive
302336,@EmilyAlbracht I feel your pain!,0,negative
1427382,http://twitpic.com/6sdbj - 4yr old son and I went exploring &amp; hiking today and this is what we found Luv amphibians!,4,positive
64747,is lonely because all my housemates have gone to the pub,0,negative
...,...,...,...
1127399,Being held together by bubblegum &amp; blessings,4,positive
884337,Heehee! I was one of them,4,positive
989044,@bookwitter your welcome if you change your mind though let me know,4,positive
1581849,@howcoza YOU BET I WILL Bring Backup!,4,positive


In [18]:
train_data['target'].value_counts()

target
4    640000
0    640000
Name: count, dtype: int64

In [19]:
test_data

Unnamed: 0,Text,target,Ground_Truth_Label
200373,would like to have even a little bit of time off work. Laying out until 11:30 then work 12:30-8:30.,0,negative
1279087,Looking forward to keeping in touch with Naomi.. It has been a long time since we chatted!,4,positive
44625,"@NBATVAndre No, in Germany the NBA has no airtime . But I got the International LP so I will watch the game for sure.",0,negative
1353688,"@AliChemist Your future self's in trouble, then! My favourite Bagley is probably Running Blind, in Iceland. Terrific.",4,positive
1335350,@remco_dekker did you know there is a reason why Opera is only popular the mobile platform..? It's because Firefox rocks its ass!,4,positive
...,...,...,...
1144572,@ttmhand ???????,4,positive
796530,had no idea on what to wear tomorrow !,0,negative
171197,@sweet_pea00 darn i will think of you on our journey so it's like you are really there,0,negative
1322000,"oh my, Calvin Harris called me a DAFT BASTARD. I feel so cool, this is my new claim to fame.",4,positive


In [20]:
test_data["target"].value_counts()

target
0    160000
4    160000
Name: count, dtype: int64

### Export Preprocessed Data without stopword-cleaning

In [21]:
# Export the test split with the original tweet text.
# NOTE: this has to run before the stop-word cleaning cells, because those
# overwrite the 'Text' column of this very frame.
test_data.to_csv('testdata_with_stopwords_preprocessed.csv', index=False)

In [22]:
# Export the training split with the original tweet text.
# NOTE: this has to run before the stop-word cleaning cells, because those
# overwrite the 'Text' column of this very frame.
train_data.to_csv('traindata_with_stopwords_preprocessed.csv', index=False)

## Stopword-cleaning

In [23]:
# Patterns are compiled once instead of on every call.
_URL_RE = re.compile(r'https?://\S+')
_MENTION_HASHTAG_RE = re.compile(r'[@#]\w+')
_APOSTROPHE_WORD_RE = re.compile(r"\b\w+['’]\w+\b")
# re.escape keeps characters such as '\', ']' and '^' literal inside the
# character class; unescaped, the sequence '\]' swallowed the backslash so
# that backslashes were never stripped.
_PUNCTUATION_RE = re.compile(rf"[{re.escape(punctuation)}]")

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    if 'english' not in _STOP_WORD_CACHE:
        # Loaded lazily so that nltk.download('stopwords') may run after this
        # cell, as long as it runs before the first clean_text call.
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Strip tweet noise and stop words from a single text.

    Processing steps, in order:
      1. remove http/https links,
      2. remove @mentions and #hashtags,
      3. remove whole tokens that contain an inner apostrophe
         (e.g. "can't", "I'm", "self's"),
      4. replace every ASCII punctuation character with a space,
      5. drop tokens whose lower-cased form is a stop word.

    Links are removed before mentions/hashtags so that a URL containing '@'
    or '#' is deleted as a whole instead of being cut in two.

    Args:
        text: The raw text (str).
        stop_words: Optional collection of lower-case stop words. When None,
            the NLTK English stop-word list is used.

    Returns:
        The remaining tokens joined by single spaces; original casing of the
        kept tokens is preserved. May be an empty string.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text = _URL_RE.sub(' ', text)
    text = _MENTION_HASHTAG_RE.sub(' ', text)
    text = _APOSTROPHE_WORD_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)

    return ' '.join(
        token for token in text.split() if token.lower() not in stop_words
    )

In [24]:
# NOTE(review): plain assignment does not copy a DataFrame -- both names on
# each line refer to the same object. The cleaning cells below therefore also
# overwrite the 'Text' column of test_data / train_data. Switching to .copy()
# would require the final export cells to write the *_without_stopwords
# frames instead of test_data / train_data.
test_data_without_stopwords = test_data
train_data_without_stopwords = train_data 

In [25]:
# Fetch the NLTK stop-word corpus that clean_text reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/I569423/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Replace each test tweet with its cleaned version. The column is overwritten
# in place; because test_data_without_stopwords was bound to test_data by
# plain assignment, test_data shows the cleaned text afterwards as well.
test_data_without_stopwords['Text'] = test_data_without_stopwords['Text'].apply(clean_text)

In [27]:
test_data_without_stopwords

Unnamed: 0,Text,target,Ground_Truth_Label
200373,would like even little bit time work Laying 11 30 work 12 30 8 30,0,negative
1279087,Looking forward keeping touch Naomi long time since chatted,4,positive
44625,Germany NBA airtime got International LP watch game sure,0,negative
1353688,future trouble favourite Bagley probably Running Blind Iceland Terrific,4,positive
1335350,know reason Opera popular mobile platform Firefox rocks ass,4,positive
...,...,...,...
1144572,,4,positive
796530,idea wear tomorrow,0,negative
171197,darn think journey like really,0,negative
1322000,oh Calvin Harris called DAFT BASTARD feel cool new claim fame,4,positive


In [28]:

# Replace each training tweet with its cleaned version. The column is
# overwritten in place; because train_data_without_stopwords was bound to
# train_data by plain assignment, train_data shows the cleaned text afterwards
# as well.
train_data_without_stopwords['Text'] = train_data_without_stopwords['Text'].apply(clean_text)

In [29]:
train_data_without_stopwords

Unnamed: 0,Text,target,Ground_Truth_Label
999032,Rise shining lol mins later planned rush door time,4,positive
1045663,2nd interview today looking promising,4,positive
302336,feel pain,0,negative
1427382,4yr old son went exploring amp hiking today found Luv amphibians,4,positive
64747,lonely housemates gone pub,0,negative
...,...,...,...
1127399,held together bubblegum amp blessings,4,positive
884337,Heehee one,4,positive
989044,welcome change mind though let know,4,positive
1581849,BET Bring Backup,4,positive


### Export Data with stopword-cleaning

In [30]:
# Export the cleaned test split. The cleaned frame is referenced by its own
# name so the export does not depend on it sharing an object with test_data.
test_data_without_stopwords.to_csv('testdata_without_stopwords_preprocessed.csv', index=False)

In [31]:
# Export the cleaned training split. The cleaned frame is referenced by its
# own name so the export does not depend on it sharing an object with
# train_data.
train_data_without_stopwords.to_csv('traindata_without_stopwords_preprocessed.csv', index=False)