In [6]:
import numpy as np # for array, linear algebra
import pandas as pd # for data processing
import matplotlib.pyplot as plt # to create statistics plots and visualize data
import matplotlib as mpl  # to visualize data
import seaborn as sns # to visualize data
import matplotlib.colors as mcolors # to visualize colors
import string # collection of alphabets, words or other characters
import re # regular expression support
import nltk 
from nltk.corpus import stopwords
# import gensim # representing documents as semantic vectors

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score

In [7]:
# Show the full content of every cell instead of truncating long strings (tweets).
pd.options.display.max_colwidth = None

## VADER with Text Pre-Editing

In [8]:
# Load the preprocessed tweets; the 'target' and 'Text' columns are used below.
df_vader_without_stopwords = pd.read_csv(
    '../../Data-Preperation/preprocessed_dataset.csv'
)

### Satzteile und Stopwords entfernen, die keinen Sentiment-Score aufweisen

In [9]:
# Preview the frame as loaded, before any text cleaning is applied.
df_vader_without_stopwords

Unnamed: 0,target,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...
1599995,4,Just woke up. Having no school is the best feeling ever
1599996,4,TheWDB.com - Very cool to hear old Walt interviews! ♫ http://blip.fm/~8bmta
1599997,4,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


In [10]:
# English stop words from the NLTK corpus (requires a one-time
# nltk.download('stopwords') on a fresh environment).
# Kept as a set: clean_text() tests membership once per token over the whole
# dataset, and a set lookup is O(1) while scanning a list is linear in its length.
# NOTE(review): the corpus contains negations such as "not"; VADER uses negation
# words when scoring, so confirm that dropping them is intended.
stop_words = set(stopwords.words('english'))

In [11]:
def clean_text(text, drop_words=None):
    """Strip mentions, hashtags, links, contracted words and stop words from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN/None that
        ``pd.read_csv`` produces for an empty cell) is treated as missing.
    drop_words : container of str, optional
        Lower-case words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The remaining whitespace-separated tokens joined by single spaces;
        ``''`` for missing input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; a missing tweet has no tokens.
        return ''
    if drop_words is None:
        drop_words = stop_words

    text = re.sub(r'[@#]\w+', ' ', text)  # remove @mentions and #hashtags
    text = re.sub(r'https?://\S+', ' ', text)  # remove links
    # Remove the WHOLE contracted word (e.g. "that's", "i'm", "can't"), not just
    # the suffix after the apostrophe.
    # NOTE(review): this also deletes negations such as "can't"/"don't" — confirm
    # this is intended before feeding the text to a sentiment model.
    text = re.sub(r"\b\w+['’]\w+\b", ' ', text)

    # Tokens are compared as-is after lower-casing, so a stop word with attached
    # punctuation (e.g. "it.") is kept.
    return ' '.join(
        token for token in text.split() if token.lower() not in drop_words
    )
# Replace every tweet in the 'Text' column with its cleaned version.
df_vader_without_stopwords['Text'] = (
    df_vader_without_stopwords['Text'].apply(clean_text)
)

In [12]:
# Inspect the frame after clean_text() has been applied to the 'Text' column.
df_vader_without_stopwords

Unnamed: 0,target,Text
0,0,"- Awww, bummer. shoulda got David Carr Third Day it. ;D"
1,0,upset update Facebook texting it... might cry result School today also. Blah!
2,0,dived many times ball. Managed save 50% rest go bounds
3,0,whole body feels itchy like fire
4,0,"no, behaving all. mad. here? see there."
...,...,...
1599995,4,woke up. school best feeling ever
1599996,4,TheWDB.com - cool hear old Walt interviews! ♫
1599997,4,ready MoJo Makeover? Ask details
1599998,4,Happy 38th Birthday boo alll time!!! Tupac Amaru Shakur


#### controlling count

### Create a new column 'Ground_Truth_Label' based on the 'target' values

In [13]:
def map_target_to_label(target):
    """Translate a numeric ``target`` code into its sentiment label.

    Returns ``'positive'`` for 4, ``'negative'`` for 0 and ``None`` for any
    other value.
    """
    # Compare against each known code in turn; unknown codes fall through to None.
    for code, label in ((4, 'positive'), (0, 'negative')):
        if target == code:
            return label
    return None

In [14]:
# Derive the textual ground-truth label from the numeric 'target' code.
df_vader_without_stopwords['Ground_Truth_Label'] = (
    df_vader_without_stopwords['target'].apply(map_target_to_label)
)

In [15]:
# Inspect the frame with the added 'Ground_Truth_Label' column.
df_vader_without_stopwords

Unnamed: 0,target,Text,Ground_Truth_Label
0,0,"- Awww, bummer. shoulda got David Carr Third Day it. ;D",negative
1,0,upset update Facebook texting it... might cry result School today also. Blah!,negative
2,0,dived many times ball. Managed save 50% rest go bounds,negative
3,0,whole body feels itchy like fire,negative
4,0,"no, behaving all. mad. here? see there.",negative
...,...,...,...
1599995,4,woke up. school best feeling ever,positive
1599996,4,TheWDB.com - cool hear old Walt interviews! ♫,positive
1599997,4,ready MoJo Makeover? Ask details,positive
1599998,4,Happy 38th Birthday boo alll time!!! Tupac Amaru Shakur,positive


In [16]:
# Check the class balance of the raw target codes
# (0 -> 'negative', 4 -> 'positive' per map_target_to_label).
df_vader_without_stopwords['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [17]:
# Same view as the previous frame display: value_counts() does not modify the
# frame, so nothing has changed in between.
df_vader_without_stopwords

Unnamed: 0,target,Text,Ground_Truth_Label
0,0,"- Awww, bummer. shoulda got David Carr Third Day it. ;D",negative
1,0,upset update Facebook texting it... might cry result School today also. Blah!,negative
2,0,dived many times ball. Managed save 50% rest go bounds,negative
3,0,whole body feels itchy like fire,negative
4,0,"no, behaving all. mad. here? see there.",negative
...,...,...,...
1599995,4,woke up. school best feeling ever,positive
1599996,4,TheWDB.com - cool hear old Walt interviews! ♫,positive
1599997,4,ready MoJo Makeover? Ask details,positive
1599998,4,Happy 38th Birthday boo alll time!!! Tupac Amaru Shakur,positive


In [18]:
# Single VADER analyzer instance; classify_sentiment() reads it as a global.
analyzer = SentimentIntensityAnalyzer()

In [19]:
def classify_sentiment(text, threshold=0.0, sentiment_analyzer=None):
    """Classify ``text`` as 'positive' or 'negative' from its VADER compound score.

    Parameters
    ----------
    text : str
        Text to score.
    threshold : float, optional
        Minimum compound score that counts as 'positive'. The default of 0.0
        means a neutral score of exactly 0.0 is labelled 'positive'.
    sentiment_analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. Defaults to the module-level ``analyzer``.

    Returns
    -------
    str
        ``'positive'`` if the compound score is >= ``threshold``, otherwise
        ``'negative'``.
    """
    if sentiment_analyzer is None:
        sentiment_analyzer = analyzer
    compound = sentiment_analyzer.polarity_scores(text)['compound']
    return 'positive' if compound >= threshold else 'negative'

In [20]:
# Label every cleaned tweet with the sentiment predicted by VADER.
df_vader_without_stopwords['VADER_Classification'] = (
    df_vader_without_stopwords['Text'].apply(classify_sentiment)
)

In [21]:
# Inspect the frame with the 'VADER_Classification' column next to the ground truth.
df_vader_without_stopwords

Unnamed: 0,target,Text,Ground_Truth_Label,VADER_Classification
0,0,"- Awww, bummer. shoulda got David Carr Third Day it. ;D",negative,negative
1,0,upset update Facebook texting it... might cry result School today also. Blah!,negative,negative
2,0,dived many times ball. Managed save 50% rest go bounds,negative,positive
3,0,whole body feels itchy like fire,negative,negative
4,0,"no, behaving all. mad. here? see there.",negative,negative
...,...,...,...,...
1599995,4,woke up. school best feeling ever,positive,positive
1599996,4,TheWDB.com - cool hear old Walt interviews! ♫,positive,positive
1599997,4,ready MoJo Makeover? Ask details,positive,positive
1599998,4,Happy 38th Birthday boo alll time!!! Tupac Amaru Shakur,positive,positive


In [22]:
# Share of tweets whose VADER label matches the ground-truth label.
accuracy = accuracy_score(
    df_vader_without_stopwords['Ground_Truth_Label'],
    df_vader_without_stopwords['VADER_Classification'],
)
print(f'Accuracy: {accuracy}')

Accuracy: 0.64188
