## Load libraries

In [1]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install comet_ml



In [2]:
from comet_ml import Experiment

In [None]:
# Create the Comet experiment used for logging parameters and metrics.
# The placeholders are kept as comments on their own lines so the call stays syntactically valid
# (previously the inline '#' also commented out the closing parenthesis).
# Fill in the keyword arguments below; do not commit a real API key to the notebook.
# NOTE(review): with no arguments comet_ml is expected to fall back to its own configuration
# (e.g. the COMET_API_KEY environment variable) - confirm against the Comet documentation.
experiment = Experiment(
    # api_key=YOUR API KEY HERE,
    # project_name='',
    # workspace='',
)

In [None]:
#MOVE THIS CELL WHERE APPROPRIATE
#Check "Comet starter notebook" preprocessing for more info
#Remember to save new parameters and metrics in a dictionary for logging
#params = {}
#metrics = {}

#Log parameters and results
#experiment.log_parameters(params)
#experiment.log_metrics(metrics)

In [None]:
#MOVE THIS CELL WHERE APPROPRIATE
#Check "Comet starter notebook" preprocessing for more info
#experiment.end()

In [37]:
import numpy as np
import pandas as pd
import spacy
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load the spaCy 'en_core_web_lg' pipeline; a later cell reads nlp.Defaults.stop_words from it.
nlp = spacy.load('en_core_web_lg')

In [39]:
# Download the 'vader_lexicon' NLTK package
# (presumably required by the SentimentIntensityAnalyzer instantiated further down - confirm).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\piala\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [40]:
# Explicit %pip magic: does not rely on IPython automagic and targets the running kernel's environment.
%pip install wordcloud

Note: you may need to restart the kernel to use updated packages.


In [41]:
import wordcloud

In [42]:
from wordcloud import WordCloud

## Load data

In [43]:
# Read the training set from a path relative to the notebook and preview its first rows.
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


## Data preprocessing

In [None]:
#Things to consider:
#Remove 'RT ', '@___:', '#', 'urls'
#How does spelling affect the analysis?

In [44]:
# Quick sanity checks: total rows, number of distinct tweet texts, and missing values per column.
row_count = len(df)
distinct_messages = len(set(df['message']))
missing_per_column = df.isna().sum()
print(row_count, distinct_messages, missing_per_column, sep='\n')

15819
14229
sentiment    0
message      0
tweetid      0
dtype: int64


In [45]:
# Drop duplicate tweets: keep only the first row for each distinct message text.
df = df.drop_duplicates(subset=['message'], keep='first')
print(df.head())
# After de-duplication the row count and the number of distinct messages should be equal.
remaining_rows = len(df)
remaining_distinct = len(set(df['message']))
print(remaining_rows)
print(remaining_distinct)

   sentiment                                            message  tweetid
0          1  PolySciMajor EPA chief doesn't think carbon di...   625221
1          1  It's not like we lack evidence of anthropogeni...   126103
2          2  RT @RawStory: Researchers say we have three ye...   698562
3          1  #TodayinMaker# WIRED : 2016 was a pivotal year...   573736
4          1  RT @SoyNovioDeTodas: It's 2016, and a racist, ...   466954
14229
14229


In [46]:
#Remove spaces after hashtags
# One vectorised pass over the 'message' column replaces the row-by-row iloc loop.
# Every run of spaces that directly follows a '#' is dropped, so '# WIRED' becomes '#WIRED'.
# (The loop re-ran the substitution once per match, so the result for multiple spaces depended on
# how many '# ' occurrences the tweet happened to contain.)
df['message'] = df['message'].str.replace(r'# +', '#', regex=True)
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker#WIRED : 2016 was a pivotal year ...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [47]:
#Remove hashtags, mentions and urls
# The patterns are applied in this order to the whole 'message' column.
# Matches are removed directly by the pattern that found them. The earlier version fed the matched
# text back into re.sub as a *new, unescaped* regex, so '.' in a URL matched any character,
# '?' made the preceding character optional, a match starting with '?' raised re.error, and a
# short hashtag such as '#foo' also clipped the front of '#foobar'.
# Raw strings keep the backslash escapes intact without invalid-escape warnings.
regex_pattern = [
    r'#[\w]*',   # hashtags
    r'@[\w]*:',  # mentions followed by a colon, as in 'RT @user:'
    r'@[\w]*',   # any remaining mentions
    # urls (scheme optional, so any 'word.word' token is treated as a url as well)
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
]
for pattern in regex_pattern:
    df['message'] = df['message'].str.replace(pattern, '', regex=True)
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT Researchers say we have three years to act...,698562
3,1,: 2016 was a pivotal year in the war on clima...,573736
4,1,"RT It's 2016, and a racist, sexist, climate c...",466954


In [48]:
# Collapse any run of the same lowercase letter to exactly two (looooooooove -> loove), which is
# easier to spell-correct later, and then strip every ':' character from the message.
squeezed = df['message'].str.replace(r'([a-z])\1+', r'\1\1', regex=True)
df['message'] = squeezed.str.replace(r':', r'', regex=True)
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT Researchers say we have three years to act...,698562
3,1,2016 was a pivotal year in the war on climat...,573736
4,1,"RT It's 2016, and a racist, sexist, climate c...",466954


In [59]:
#print(nlp.Defaults.stop_words)

In [None]:
#Remove stop words from messages. The result goes into a second column because the text is also
#lower-cased and stripped of punctuation, and the original 'message' column must stay intact.
stop_words = nlp.Defaults.stop_words  # looked up once instead of once per row
df['msg_stop_removed'] = (
    df['message']
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
    # which would leave all punctuation in place.
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
    .apply(lambda msg: ' '.join(token for token in msg.split() if token not in stop_words))
)
df.head()

In [None]:
# Tokenise `text` and keep only the tokens that are not spaCy stop words.
# NOTE(review): neither `word_tokenize` nor `text` is imported or defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - define/import them or remove the cell.
tokens_without_sw = [word for word in word_tokenize(text) if not word in nlp.Defaults.stop_words]

In [49]:

# Sentiment analyzer; its polarity_scores() output feeds the 'compound' feature computed later.
sid = SentimentIntensityAnalyzer()

In [51]:
# Read the tab-separated SlangSD file, naming its two columns 'word' and 'score'.
slang = pd.read_csv('SlangSD/SlangSD.txt', sep='\t', names=['word', 'score'])

In [52]:
# Map each slang word to its score; if a word occurs more than once the last score wins.
slang_dict = {term: value for term, value in zip(slang['word'], slang['score'])}

In [58]:
# Merge the slang scores into the analyzer's lexicon.
# NOTE(review): assuming `lexicon` is a plain dict, existing entries for the same word are overwritten;
# also confirm that SlangSD scores are on the same scale as the analyzer's built-in lexicon.
sid.lexicon.update(slang_dict)

In [24]:
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *

In [26]:
from nltk.corpus import opinion_lexicon

In [28]:
# Download the 'opinion_lexicon' NLTK corpus that backs the `opinion_lexicon` reader imported above.
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\piala\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\opinion_lexicon.zip.


True

In [62]:
# Display the negative-opinion word list from the opinion lexicon.
opinion_lexicon.negative()

['2-faced', '2-faces', 'abnormal', 'abolish', ...]

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

## Feature engineering

In [15]:
# Feature: number of characters in each (cleaned) message.
df['length'] = df['message'].map(len)
df.head()

Unnamed: 0,sentiment,message,tweetid,length
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,108
1,1,It's not like we lack evidence of anthropogeni...,126103,62
2,2,RT Researchers say we have three years to act...,698562,88
3,1,2016 was a pivotal year in the war on climat...,573736,55
4,1,"RT It's 2016, and a racist, sexist, climate c...",466954,91


In [16]:
# Feature: the 'compound' entry of the analyzer's polarity scores for each message.
# Requires `sid` (created in the SentimentIntensityAnalyzer cell) to exist in the kernel.
df['compound'] = df['message'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])
df.head()

NameError: name 'sid' is not defined

## EDA

In [None]:
# Number of tweets in each sentiment class.
# The column is passed by keyword: recent seaborn versions treat the first positional argument as
# `data`, not `x`, so a bare Series no longer yields one bar per class.
fig, ax = plt.subplots()
sns.countplot(x='sentiment', data=df, ax=ax)
ax.set(title='Tweets per sentiment class', xlabel='sentiment', ylabel='count')
plt.show()

In [None]:
#Wordcloud of tweets with sentiment == 2
# The result is stored in `cloud` so the imported `wordcloud` module is not shadowed.
cloud = WordCloud().generate(' '.join(df[df['sentiment'] == 2]['message']))
plt.imshow(cloud)
plt.axis("off")
plt.title('Word cloud: sentiment == 2')  # lets the four clouds be told apart when skimming
plt.show()

In [None]:
#Wordcloud of tweets with sentiment == 1
# The result is stored in `cloud` so the imported `wordcloud` module is not shadowed.
cloud = WordCloud().generate(' '.join(df[df['sentiment'] == 1]['message']))
plt.imshow(cloud)
plt.axis("off")
plt.title('Word cloud: sentiment == 1')  # lets the four clouds be told apart when skimming
plt.show()

In [None]:
#Wordcloud of tweets with sentiment == 0
# The result is stored in `cloud` so the imported `wordcloud` module is not shadowed.
cloud = WordCloud().generate(' '.join(df[df['sentiment'] == 0]['message']))
plt.imshow(cloud)
plt.axis("off")
plt.title('Word cloud: sentiment == 0')  # lets the four clouds be told apart when skimming
plt.show()

In [None]:
#Wordcloud of tweets with sentiment == -1
# The result is stored in `cloud` so the imported `wordcloud` module is not shadowed.
cloud = WordCloud().generate(' '.join(df[df['sentiment'] == -1]['message']))
plt.imshow(cloud)
plt.axis("off")
plt.title('Word cloud: sentiment == -1')  # lets the four clouds be told apart when skimming
plt.show()

## Data cleaning

In [None]:
# Peek at the first few messages whose sentiment label is 1.
df.loc[df['sentiment'] == 1, 'message'].head()

## Modelling

## Model evaluation