In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# wordcloud's built-in English stop words as a list. Used by filter_and_tokenise to
# drop tokens and passed to WordCloud(stopwords=...) in show_wordcloud.
stopwords = list(STOPWORDS)

In [None]:
# Read the raw CSV export (decoded as latin-1).
# NOTE(review): read_csv treats the first line as a header by default. If this file
# has no header row, the first record is consumed as column names and then lost by
# the positional relabelling below -- confirm, and pass header=None if so.
raw_data = pd.read_csv("data_trial_1.csv", encoding="latin-1")

# Relabel the columns positionally as 0..n-1, then keep columns 0, 4 and 5 and drop
# rows with a missing value in any of them. The later cells address columns by
# these integer labels.
raw_data.columns = list(range(raw_data.shape[1]))
raw_data = raw_data[[0, 4, 5]].dropna()
raw_data.head()

## Named-column variant, kept for reference. It did not work; a likely culprit is the
## doubled brackets on the column assignment: a list wrapped in another list makes
## pandas build a MultiIndex instead of plain labels. A flat list should be used.
# raw_data = pd.read_csv("data_trial_1.csv",encoding='latin-1')
# raw_data.columns = [['sentiment', 'id','date','query','user','post']]
# raw_data = raw_data[['sentiment','user','post']]
# raw_data

In [None]:
# Print a summary (dtypes, non-null counts) of the trimmed frame.
raw_data.info()

In [None]:
# Lower-case contraction / slang -> expansion table used by clean_text.
# Insertion order matters: clean_text walks the keys in this order and applies a
# plain substring replacement for each one.
contractions = {
    "aren't": "are not",
    "ain't": "is not",
    "she'd": "she would",
    "can't": "cannot",
    "shouldn't": "should not",
    "couldn't": "could not",
    "that's": "that is",
    "didn't": "did not",
    "there's": "there is",
    "don't": "do not",
    "they're": "they are",
    "doesn't": "does not",
    "they've": "they have",
    "hadn't": "had not",
    "they'll": "they will",
    "haven't": "have not",
    "they'd": "they had",
    "he's": "he has",
    "wasn't": "was not",
    "he'll": "he will",
    "we're": "we are",
    "he'd": "he would",
    "we've": "we have",
    "here's": "here is",
    "we'll": "we will",
    "i'm": "i am",
    "we'd": "we had",
    "i've": "i have",
    "i'll": "i will",
    "weren't": "were not",
    "i'd": "i had",
    "what's": "what is",
    "where's": "where is",
    "isn't": "is not",
    "who's": "who is",
    "it's": "it has",
    "who'll": "who will",
    "won't": "will not",
    "wouldn't": "would not",
    "it'll": "it will",
    "you're": "you are",
    "mustn't": "must not",
    "you've": "you have",
    "she's": "she has",
    "you'll": "you will",
    "you'd": "you had",
    "she'll": "she will",
    # Slang; the space-separated spellings are listed alongside the joined ones.
    "gon na": "going to",
    "gonna": "going to",
    "wan na": "want to",
    "wanna": "want to",
    " u ": " you ",
    "got ta": "got to",
    "gotta": "got to",
    " r ": " are ",
}

# Earlier approach, kept for reference: read the stop-word list from a local file.
# common_word_file = open('long_stopwords.txt','r')
# common_words = []
# for line_ in common_word_file:
#     common_words.append(line_.strip())

# Load spaCy's small English pipeline. In the code visible here the pipeline object
# is not used again; only the stop-word list built on the next line is.
spacy_nlp = spacy.load('en_core_web_sm')
# spaCy's English stop words as a list; the bigram-counting cell skips any bigram
# that contains one of them.
# NOTE(review): `spacy.lang.en` is reached by attribute access without an explicit
# import -- presumably available because the English pipeline was just loaded; confirm.
common_words = list(spacy.lang.en.stop_words.STOP_WORDS)

def to_ascii(text):
    """Return ``text`` with every character outside printable ASCII replaced by a space.

    Printable ASCII here means the space character through ``~`` (code points
    32-126), the same set the function has always allowed through unchanged.

    The previous implementation kept disallowed characters and merely put a space
    in front of them, so the result was not ASCII and the character stayed glued
    to whatever followed it (``"naïve"`` became ``"na ïve"``). Replacing the
    character itself keeps the word break and guarantees an ASCII-only result.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        A string of the same length as ``text`` containing only printable ASCII.
    """
    # Single-character strings compare by code point, so this is a range check.
    return "".join(char if " " <= char <= "~" else " " for char in text)

# Apostrophe-free spellings of contractions that are ordinary English words in their
# own right. clean_text must not expand these: doing so turned "well" into
# "we will", "were" into "we are", "hell" into "he will", and so on.
# "wont" and "cant" are deliberately not listed, so they are still expanded.
_AMBIGUOUS_BARE_CONTRACTIONS = frozenset(
    {"hell", "id", "ill", "its", "shed", "shell", "wed", "well", "were"}
)


def clean_text(text):
    """Normalise a tweet (or an iterable of tweets) for tokenisation.

    For a single string the steps are, in order:

    1. ``to_ascii`` and lower-casing.
    2. URL removal. This runs before any punctuation rewriting so that a query
       string such as ``?a=1&b=2`` is removed with its URL instead of being
       split by the ``&`` / ``;`` / ``?`` handling and left behind as text.
    3. ``;`` -> space, ``&`` -> `` and ``, runs of ``?``/``!`` -> ``.``.
    4. Removal of ``@mentions`` and ``#hashtags``; runs of dots collapsed;
       parentheses dropped; whitespace runs collapsed to a single space.
    5. Elongated letters squashed (3+ ``l``, 2+ ``y``, 3+ ``o`` -> one letter).
    6. Contraction expansion using ``contractions``, in the table's order. The
       apostrophe-free spelling is expanded too (only when surrounded by spaces),
       unless that spelling is itself a normal word.
    7. Removal of stray `` ta `` and `` quot `` tokens.

    Parameters
    ----------
    text : str or iterable
        One text, or an iterable whose items are cleaned recursively.

    Returns
    -------
    str or list
        The cleaned string, or a list of cleaned items for iterable input.
    """
    if not isinstance(text, str):
        return [clean_text(item) for item in text]

    text = to_ascii(text).lower()

    # Must precede the ';', '&' and '?!' rewrites below (see step 2 above).
    text = re.sub(r"https?:\S*", " ", text)

    text = text.replace(";", " ")
    text = text.replace("&", " and ")

    text = re.sub(r"[?!]+", ".", text)  # should these be filtered out?
    text = re.sub(r"@\w*", " ", text)
    text = re.sub(r"#\w*", " ", text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"[()]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"l{3,}", "l", text)
    text = re.sub(r"y{2,}", "y", text)
    text = re.sub(r"o{3,}", "o", text)

    for contraction, expansion in contractions.items():
        # Exact spelling: plain substring replacement.
        text = text.replace(contraction, expansion)

        # Apostrophe-free spelling ("dont", "youre", ...), matched as a whole
        # space-delimited word and skipped when it collides with a real word.
        bare = "".join(re.findall("[a-zA-Z]+", contraction))
        if bare not in _AMBIGUOUS_BARE_CONTRACTIONS:
            text = text.replace(" " + bare + " ", " " + expansion + " ")

    text = text.replace(" ta ", " ")
    text = text.replace(" quot ", " ")
    return text

def filter_and_tokenise(sents):
    """Tokenise each sentence and keep only the informative word tokens.

    A token is kept when it is purely alphabetic, longer than one character and
    not present in ``stopwords``.

    Parameters
    ----------
    sents : iterable of str
        Texts to tokenise with ``word_tokenize``.

    Returns
    -------
    list of list of str
        One list of kept tokens per input text, in input order.
    """
    # `stopwords` is a list; build a set once so each per-token membership test is
    # a hash lookup instead of a linear scan of the whole list.
    stopword_set = set(stopwords)

    ret = []
    for sent in tqdm(sents):
        ret.append([
            word
            for word in word_tokenize(sent)
            if len(word) > 1 and word.isalpha() and word not in stopword_set
        ])
    return ret

In [None]:
# test_data = raw_data.iloc[:1000].copy()

In [None]:
# Column 6: column 5 passed through clean_text.
# Column 7: column 6 tokenised and filtered by filter_and_tokenise (one token list per row).
raw_data[6] = clean_text(raw_data [5])
raw_data[7] = filter_and_tokenise(raw_data [6])
raw_data.head()

In [None]:
# raw_data.to_pickle("df_saved.pkl")
# raw_data = pd.read_pickle("df_saved.pkl")
# raw_data.head()

In [None]:
split_corpus = list(raw_data[7])

# `common_words` is a list; a set makes the two membership tests per bigram hash
# lookups instead of linear scans.
common_word_set = set(common_words)

bigram_list = []
ctr = 0  # number of bigrams skipped because one of the words is a common word
for tokens in tqdm(split_corpus):
    # Distinct names for the sentence and the bigram: the original reused `ele`
    # for both loops, shadowing the outer variable.
    for first, second in ngrams(tokens, 2):
        if first in common_word_set or second in common_word_set:
            ctr += 1
        elif len(first) > 2 and len(second) > 2 and first != second:
            # Keep only bigrams of two different words, each longer than two characters.
            bigram_list.append((first, second))

# Count every distinct bigram and keep those seen more than 10 times.
bigram_df = pd.DataFrame(pd.Series(bigram_list).value_counts()).reset_index()
bigram_df.columns = ['bigram', 'frequency']
bigram_df = bigram_df[bigram_df['frequency'] > 10]
print(len(bigram_df))

In [None]:
# Inspect the bigrams at positions 5000-5999 of bigram_df (the slice just below the
# 6000 cut-off used when merging bigrams).
bigram_df.iloc[5000:6000]

In [None]:
num_of_bigrams_to_replace = 6000

# Marker placed between sentences while the corpus is processed as one big string.
# It carries a space on each side, and the merged string is padded with a space at
# both ends, so every word -- including the first and last word of each sentence --
# is surrounded by spaces. The original joined with a bare '----------', so a bigram
# at the start or end of a sentence never matched the space-delimited lookup key.
# Tokens are purely alphabetic, so the marker cannot occur inside a sentence.
sentence_separator = ' ---------- '

# Re-join every token list into a single space-separated sentence.
sent_corpus = [' '.join(tokens) for tokens in split_corpus]

print(len(sent_corpus))

merged_corpus_all = ' ' + sentence_separator.join(sent_corpus) + ' '
bigram_list = list(bigram_df['bigram'])

ctr=0  # number of bigrams processed
for ele in tqdm(bigram_list[:num_of_bigrams_to_replace]):
    ctr+=1
    # Replace "first second" with "first_second" wherever it appears as two whole words.
    lookup_key = " "+ ele[0]+' '+ele[1]+" "
    replace_key = " "+ ele[0]+'_'+ele[1]+" "
    merged_corpus_all = merged_corpus_all.replace(lookup_key,replace_key)

# Split back into one string per original sentence (same count and order as
# sent_corpus) and drop the padding spaces.
sent_list = [sent.strip() for sent in merged_corpus_all.split(sentence_separator)]

In [None]:
# Column 8: the bigram-merged sentences, split back into token lists on whitespace.
# The sentences are aligned positionally with the rows of raw_data.
raw_data[8] = pd.Series(sent_list, index=raw_data.index).apply(str.split)
raw_data.head()

In [None]:
# Preview the first few token lists only; listing the whole column would write the
# entire tokenised corpus into the cell output.
raw_data[7].head(10).tolist()

In [None]:
# Persist the processed frame so the steps above need not be re-run.
raw_data.to_pickle("df_saved.pkl")
# To resume from the saved frame instead (only load pickle files you created yourself):
# raw_data = pd.read_pickle("df_saved.pkl")
# raw_data.head()

In [None]:
# Rows whose value in column 0 is 4 (presumably the positive-sentiment label -- confirm).
is_positive = raw_data[0] == 4
positive_tweets = raw_data.loc[is_positive]
print(positive_tweets.head(), "\n")
positive_tweets.info()

In [None]:
# Rows whose value in column 0 is 0 (presumably the negative-sentiment label -- confirm).
is_negative = raw_data[0] == 0
negative_tweets = raw_data.loc[is_negative]
print(negative_tweets.head(), "\n")
negative_tweets.info()

In [None]:
def show_wordcloud(data, title = None):
    """Draw a word cloud built from ``data`` and show it with matplotlib.

    Parameters
    ----------
    data : str or iterable
        A single string is used as-is. Any other iterable (for example a pandas
        Series of texts) has its items converted with ``str`` and joined with
        spaces. A non-iterable value falls back to ``str(data)``.
    title : str, optional
        Figure title; no title is drawn when this is falsy.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    # Build the text from every item. The original passed str(data); for a pandas
    # Series that is the abbreviated repr (a handful of rows plus index numbers and
    # the "Name: ..., dtype: ..." footer), so the cloud did not reflect the corpus.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the original behaviour.
            text = str(data)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
        ).generate(text)

    # Figure number 1 is reused on every call.
    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()


In [None]:
# Word cloud over column 6 (the cleaned text) of the negative subset.
show_wordcloud(negative_tweets[6],'Most Common Words from the negative corpus')

In [None]:
# Word cloud over column 6 (the cleaned text) of the positive subset.
show_wordcloud(positive_tweets[6],'Most Common Words from the positive corpus')