In [None]:
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
# from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


In [21]:
import pandas as pd 
import string
import re

In [34]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [17]:
import nltk

# Download every NLTK resource this notebook uses, not only the VADER lexicon:
# stopwords.words(), word_tokenize(), nltk.pos_tag() and WordNetLemmatizer each
# need their own data package, so a fresh environment would otherwise fail later.
# NOTE(review): recent NLTK releases may look for differently named packages
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') -- confirm for your version.
for nltk_resource in ('vader_lexicon', 'stopwords', 'punkt',
                      'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

### PREPROCESS THE SENTIMENT140 DATASET

In [2]:
# Load the raw Sentiment140 CSV. It is read without a header row (header=None)
# and decoded as latin-1.
sentiment140_original = pd.read_csv('csv/training.1600000.processed.noemoticon.csv', engine='python', encoding='latin-1', header=None)

In [3]:
# Work on a copy so the frame loaded from disk stays unmodified.
sentiment140 = sentiment140_original.copy()

In [4]:
# Name the six columns (the CSV was read with header=None) and preview the first rows.
sentiment140.columns = ['label', 'id', 'datetime', 'query', 'user', 'tweet']
sentiment140.head()

Unnamed: 0,label,id,datetime,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Count how many rows carry each label value.
sentiment140.label.value_counts()

0    800000
4    800000
Name: label, dtype: int64

In [7]:
# Take an explicit copy of the two needed columns so the label rewrite below
# modifies an independent frame. Assigning into a plain column selection of
# `sentiment140` triggers pandas' SettingWithCopyWarning.
sentiment140_labeledtweets = sentiment140[['tweet', 'label']].copy()
# Recode label 4 (the positive class) to 1 so the labels are binary 0/1.
sentiment140_labeledtweets['label'] = sentiment140_labeledtweets['label'].replace({4: 1})
sentiment140_labeledtweets.label.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0    800000
1    800000
Name: label, dtype: int64

In [8]:
# Split by class and keep only the first 20000 rows of each class.
sentiment140_tweets_positive = sentiment140_labeledtweets.loc[sentiment140_labeledtweets.label == 1].iloc[:20000]
sentiment140_tweets_negative = sentiment140_labeledtweets.loc[sentiment140_labeledtweets.label == 0].iloc[:20000]

In [9]:
# Stack the negative sample on top of the positive one with a fresh 0..n-1 index.
sentiment140_posinega = pd.concat(
    [sentiment140_tweets_negative, sentiment140_tweets_positive],
    ignore_index=True,
)

In [18]:
# Module-level resources shared by the cleaning helpers defined below.
# English stop words from the NLTK corpus (used by clean_stopwords).
stopwords_list = stopwords.words('english')
# Punctuation characters from the standard library (used by clean_punctuation).
punctuations_list = string.punctuation
# WordNet lemmatizer instance (used by pos_lemma_tweet).
lemmatizer = nltk.stem.WordNetLemmatizer()

In [19]:
def clean_stopwords(tweet):
    """Remove stop words from ``tweet``.

    The text is split on whitespace; tokens found in the module-level
    ``stopwords_list`` are dropped and the rest are re-joined with single spaces.

    Args:
        tweet: Text to filter.

    Returns:
        The remaining tokens joined by ``' '``.
    """
    kept_tokens = []
    for token in tweet.split():
        if token in stopwords_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_url(tweet):
    """Replace every URL in ``tweet`` with a single space.

    A URL is any run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://``.

    Args:
        tweet: Text to clean.

    Returns:
        The text with each URL replaced by ``' '``.
    """
    # Raw string: '\.' and '\S' are invalid escape sequences in a normal string
    # literal and produce a warning on current Python versions. The original third
    # alternative 'http?://' was redundant with 'https?://' (apart from also
    # matching the malformed scheme 'htt://'), so it is dropped.
    return re.sub(r'(?:www\.|https?://)\S+', ' ', tweet)

def clean_atsigns(tweet):
    """Replace every ``@...`` token in ``tweet`` with a single space.

    A match is an ``@`` followed by any run of non-whitespace characters and at
    most one trailing whitespace character.

    Args:
        tweet: Text to clean.

    Returns:
        The text with each match replaced by ``' '``.
    """
    # Raw string: '\S' and '\s' are invalid escape sequences in a normal string
    # literal and produce a warning on current Python versions.
    return re.sub(r'@\S*\s?', ' ', tweet)

def clean_punctuation(tweet):
    """Delete every character listed in ``punctuations_list`` from ``tweet``.

    Args:
        tweet: Text to clean.

    Returns:
        The text with those characters removed (nothing is inserted in their place).
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    return tweet.translate(str.maketrans('', '', punctuations_list))

def clean_numbers(tweet):
    """Strip all ASCII digits (0-9) from ``tweet``.

    Args:
        tweet: Text to clean.

    Returns:
        The text with every run of digits removed.
    """
    digit_run = re.compile('[0-9]+')
    without_digits = digit_run.sub('', tweet)
    return without_digits


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the corresponding ``wordnet`` POS constant.

    The decision is made on the tag's first letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        nltk_tag: POS tag string.

    Returns:
        The matching ``wordnet`` constant, or ``None`` for any other tag.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # The wordnet attribute is looked up only when a prefix matches.
            return getattr(wordnet, attr_name)
    return None
    
def pos_lemma_tweet(tweet):
    """Lemmatize ``tweet`` token by token, guided by POS tags.

    The text is tokenized with ``word_tokenize`` and tagged with ``nltk.pos_tag``.
    Each token whose tag maps to a WordNet POS (see ``nltk_tag_to_wordnet_tag``)
    is lemmatized with the module-level ``lemmatizer``; other tokens are kept as is.

    Args:
        tweet: Text to lemmatize.

    Returns:
        The resulting tokens joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(tweet))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # Tokens without a usable WordNet POS are passed through unchanged.
        lemmas.append(token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)


In [22]:
# Clean the sampled tweets: lower-case, then apply the cleaning helpers in order.
# NOTE(review): stop words are removed before punctuation is stripped, so a stop
# word with punctuation attached (e.g. "the,") is not matched by clean_stopwords.
sentiment140_posinega.tweet = (
    sentiment140_posinega.tweet
    .str.lower()
    .apply(clean_stopwords)
    .apply(clean_url)
    .apply(clean_atsigns)
    .apply(clean_punctuation)
    .apply(clean_numbers)
    .apply(pos_lemma_tweet)
)

In [23]:
# Drop tweets that are empty after cleaning. The explicit copy makes the result an
# independent frame, so the prediction columns added in later cells are written to
# it directly rather than to a slice of the previous frame.
sentiment140_posinega = sentiment140_posinega[sentiment140_posinega.tweet != ''].copy()

In [24]:
# Sanity check: list any rows whose tweet is missing (NaN).
sentiment140_posinega[sentiment140_posinega.tweet.isna()]

Unnamed: 0,tweet,label


### FLAIR

In [25]:
import flair
# Load flair's 'en-sentiment' text classifier; predict_flair() reads this global.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [25]:
def predict_flair(tweet):
    """Classify ``tweet`` with the module-level flair model ``flair_sentiment``.

    Args:
        tweet: Text to classify.

    Returns:
        1 when the first predicted label is ``POSITIVE``, otherwise 0. A tweet for
        which the model attached no label also yields 0.
    """
    sentence = flair.data.Sentence(tweet)
    flair_sentiment.predict(sentence)
    labels = sentence.labels
    if not labels:
        # No label attached (presumably an empty sentence -- confirm). Indexing
        # labels[0] would raise IndexError, so treat the tweet as "not positive".
        return 0
    # Compare the label's value rather than parsing str(label): the string form is
    # a display format and does not begin with the class name in every flair release.
    return 1 if labels[0].value == 'POSITIVE' else 0

In [30]:
# Run the flair classifier on every cleaned tweet and store the 0/1 result.
sentiment140_posinega['predicted_label_flair'] = sentiment140_posinega['tweet'].apply(predict_flair)

In [32]:
# Reference labels and flair predictions for the metrics cell that follows.
# Despite its name, y_test is the label column of the whole sampled frame; no
# train/test split is made in the code shown here.
y_test = sentiment140_posinega.label
y_pred = sentiment140_posinega.predicted_label_flair

In [35]:
# Compare the flair predictions with the reference labels.
# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC is
# computed from a single operating point -- pass scores if a ranking metric is wanted.
print(confusion_matrix(y_test, y_pred))
print('auc score:', roc_auc_score(y_test, y_pred))
print('accuracy:', accuracy_score(y_test, y_pred))

[[13398  6541]
 [ 6791 13135]]
auc score: 0.66556922255356
accuracy: 0.665571303148125


### NLTK WITH VADER

In [37]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\maart\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [39]:
# Create a module-level VADER sentiment analyzer.
sid = SentimentIntensityAnalyzer()

In [42]:
def predict_nltk_vader(tweet):
    """Classify ``tweet`` from its VADER compound polarity score.

    Args:
        tweet: Text to classify.

    Returns:
        1 when the compound score is strictly greater than 0, otherwise 0
        (a score of exactly 0 counts as not positive).
    """
    # Reuse the module-level analyzer `sid` instead of constructing a new
    # SentimentIntensityAnalyzer for every tweet; the construction is identical
    # each time, so repeating it per call is wasted work.
    compound_score = sid.polarity_scores(tweet)['compound']
    return 1 if compound_score > 0 else 0

In [43]:
# Run the VADER-based classifier on every cleaned tweet and store the 0/1 result.
sentiment140_posinega['predicted_label_nltk_vader'] = sentiment140_posinega['tweet'].apply(predict_nltk_vader)

In [44]:
# Reference labels and VADER predictions for the metrics cell that follows
# (y_test is the label column of the whole sampled frame, not a held-out split).
y_test = sentiment140_posinega.label
y_pred = sentiment140_posinega.predicted_label_nltk_vader

In [45]:
# Compare the VADER predictions with the reference labels.
# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC is
# computed from a single operating point.
print(confusion_matrix(y_test, y_pred))
print('auc score:', roc_auc_score(y_test, y_pred))
print('accuracy:', accuracy_score(y_test, y_pred))

[[12700  7239]
 [ 7495 12431]]
auc score: 0.6304004753895145
accuracy: 0.6304026088047159


### TEXTBLOB

In [46]:
from textblob import TextBlob

In [47]:
def predict_textblob(tweet):
    """Classify ``tweet`` from its TextBlob polarity.

    Args:
        tweet: Text to classify.

    Returns:
        1 when the polarity is strictly greater than 0, otherwise 0.
    """
    blob = TextBlob(tweet)
    is_positive = blob.sentiment.polarity > 0
    return int(is_positive)

In [48]:
# Run the TextBlob-based classifier on every cleaned tweet and store the 0/1 result.
sentiment140_posinega['predicted_label_textblob'] = sentiment140_posinega['tweet'].apply(predict_textblob)

In [49]:
# Reference labels and TextBlob predictions for the metrics cell that follows
# (y_test is the label column of the whole sampled frame, not a held-out split).
y_test = sentiment140_posinega.label
y_pred = sentiment140_posinega.predicted_label_textblob

In [50]:
# Compare the TextBlob predictions with the reference labels.
# NOTE(review): y_pred holds hard 0/1 labels rather than scores, so the ROC AUC is
# computed from a single operating point.
print(confusion_matrix(y_test, y_pred))
print('auc score:', roc_auc_score(y_test, y_pred))
print('accuracy:', accuracy_score(y_test, y_pred))

[[13797  6142]
 [ 9246 10680]]
auc score: 0.613971808535757
accuracy: 0.6139972406873198


### PREDICT ON TWINT TWEETS: BLACKMIRROR

In [51]:
# Load the collected tweets from csv/tweets.csv.
tweets_blackmirror_df = pd.read_csv('csv/tweets.csv')

In [52]:
# Keep only English tweets and renumber the rows 0..n-1. reset_index() returns a
# new frame here instead of mutating the filtered slice in place, so later column
# assignments operate on an independent frame.
tweets_blackmirror_df = (
    tweets_blackmirror_df[tweets_blackmirror_df.language == 'en']
    .reset_index(drop=True)
)

In [53]:
# Confirm which language values remain after filtering.
tweets_blackmirror_df.language.value_counts()

en    9211
Name: language, dtype: int64

In [54]:
# Preview the tweet text column.
tweets_blackmirror_df.tweet

0       @caitoz @RealSexyCyborg It's all cute dog robo...
1       @BusinessInsider what else was Steve Jobs doin...
2       New black mirror season dropped  https://t.co/...
3       Be trying to remember the exact moment life be...
4       @Chris_Skinner Sounds like a Black Mirror nigh...
                              ...                        
9206    Okay so I really enjoyed all the seasons of Bl...
9207    y'all my San Junipero poster finally arrived!!...
9208    We really are in the current season of “Black ...
9209    @BigRedDawg_ @mtcbtc @gegelsmr2 Came to say ju...
9210    How many more episodes are there of the #Black...
Name: tweet, Length: 9211, dtype: object

In [56]:
# Lower-case the tweet text, then keep only tweets containing '#blackmirror'.
tweets_blackmirror_df['tweet'] = tweets_blackmirror_df['tweet'].str.lower()
tweets_blackmirror_hashtag = tweets_blackmirror_df.loc[
    tweets_blackmirror_df['tweet'].str.contains('#blackmirror')
]
tweets_blackmirror_hashtag['tweet']

In [None]:
# Select the tweet text column of the hashtag-filtered frame.
tweets_blackmirror_hashtag_tweet = tweets_blackmirror_hashtag.tweet

In [59]:
# Bind a second name to the same Series (plain assignment, no copy is made).
tweets_to_predict = tweets_blackmirror_hashtag_tweet

In [60]:
# Renumber the selected tweets 0..n-1 by rebinding the name to a new Series.
# An in-place reset would also change the index of the Series object that was
# taken from tweets_blackmirror_hashtag, because the names are aliases of it.
tweets_to_predict = tweets_to_predict.reset_index(drop=True)

In [61]:
def preprocess_my_way(tweets: pd.Series):
    """Apply the notebook's text-cleaning pipeline to a Series of tweets.

    The tweets are lower-cased and then passed, element by element, through
    ``clean_stopwords``, ``clean_url``, ``clean_atsigns``, ``clean_punctuation``,
    ``clean_numbers`` and ``pos_lemma_tweet``, in that order.

    Args:
        tweets: Series of raw tweet strings.

    Returns:
        A new Series holding the cleaned tweets.
    """
    cleaning_steps = (
        clean_stopwords,
        clean_url,
        clean_atsigns,
        clean_punctuation,
        clean_numbers,
        pos_lemma_tweet,
    )
    cleaned = tweets.str.lower()
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    return cleaned

In [63]:
# Run the cleaning pipeline on the selected tweets.
tweets_to_predict_preprocessed = preprocess_my_way(tweets_to_predict)

In [66]:
# Score every pre-processed tweet with each of the three classifiers.
predicted_labels_flair = tweets_to_predict_preprocessed.apply(predict_flair)
predicted_labels_nltk_vader = tweets_to_predict_preprocessed.apply(predict_nltk_vader)
predicted_labels_textblob = tweets_to_predict_preprocessed.apply(predict_textblob)

In [67]:
# Show how many tweets each classifier labelled 0 and 1.
print(predicted_labels_flair.value_counts())
print(predicted_labels_nltk_vader.value_counts())
print(predicted_labels_textblob.value_counts())

0    725
1    519
Name: tweet, dtype: int64
0    758
1    486
Name: tweet, dtype: int64
0    842
1    402
Name: tweet, dtype: int64


In [68]:
# Collect the cleaned tweets and the three sets of predictions in one frame,
# one column per classifier.
df_dict = {'tweet': tweets_to_predict_preprocessed,
          'predictions_flair': predicted_labels_flair,
          'predictions_nltk_vader': predicted_labels_nltk_vader,
          'predictions_textblob': predicted_labels_textblob}
sentiment_predictions_df = pd.DataFrame(df_dict)

In [69]:
# Display the combined predictions.
sentiment_predictions_df

Unnamed: 0,tweet,predictions_flair,predictions_nltk_vader,predictions_textblob
0,cute dog robot strap highpower automatic rifle...,1,1,1
1,next blackmirror anthology include story someo...,1,1,0
2,maybe this blackmirror season episode metalhead,0,0,0
3,like bandersnatch blackmirror,1,1,0
4,remember episode blackmirror nocontexttv netflix,0,0,0
...,...,...,...,...
1239,okay really enjoyed season black mirror except...,0,1,1
1240,yall san junipero poster finally arrive ok sui...,0,0,1
1241,really current season “ black mirror ” blackmi...,1,0,0
1242,come say this he ’ ll get ride truck driver so...,0,0,0
