In [246]:
#%matplotlib notebook
%matplotlib inline
#Module to handle regular expressions
import re
#Library for emoji
import emoji
#Import pandas and numpy to handle data
import pandas as pd
import numpy as np

#import libraries for accessing the database
import psycopg2
from sqlalchemy import create_engine
from postgres_credentials import *

#import libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

#Import nltk to check english lexicon
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import (
    wordnet,
    stopwords
)

#import libraries for tokenization and ML
import json;
import keras;
import keras.preprocessing.text as kpt;
#from keras.preprocessing.text import Tokenizer;

import sklearn
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)
from sklearn.model_selection import train_test_split

#Import all libraries for creating a deep neural network
#Sequential is the standard type of neural network with stackable layers
from keras.models import Sequential;
#Dense: Standard layers with every node connected, dropout: avoids overfitting
from keras.layers import Dense, Dropout, Activation;

In [154]:
#Querying the database
def query_database(tabletweets):
    """Read a whole PostgreSQL table into a DataFrame indexed by ``id``.

    The connection settings (``usertwitter``, ``passwordtwitter``,
    ``hosttwitter``, ``porttwitter``, ``dbnametwitter``) are module-level
    names; presumably they come from the ``postgres_credentials`` star import.

    Parameters
    ----------
    tabletweets : str
        Name of the table to read. It is interpolated directly into the SQL
        text, so it must come from a trusted source.

    Returns
    -------
    pandas.DataFrame
        Result of ``select *`` on the table, with the ``id`` column as index.
    """
    connection_url = "postgresql+psycopg2://%s:%s@%s:%d/%s" % (
        usertwitter,
        passwordtwitter,
        hosttwitter,
        porttwitter,
        dbnametwitter,
    )
    db_engine = create_engine(connection_url)
    sql_text = 'select * from %s' % tabletweets
    return pd.read_sql_query(sql_text, con=db_engine, index_col='id')

In [155]:
#preprocess text in tweets by removing links, @UserNames, blank spaces, etc.
def preprocessing_text(table):
    """Normalise the ``tweet`` column: lowercase it and strip noise tokens.

    The following are removed, in this order: the standalone retweet marker
    ``rt``, ``@user`` mentions, ``http...`` and ``www....`` links, digits and
    punctuation. The apostrophe and the backslash are deliberately not part
    of the punctuation set, so contractions such as ``can't`` survive.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a string column named ``tweet``.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object; its ``tweet`` column is overwritten.
    """
    patterns = (
        # Only the standalone token "rt" marks a retweet. A plain substring
        # replace would also eat the letters inside words ("party" -> "pay").
        r'\brt\b',
        # @UserName mentions
        r'@\w+',
        # Links
        r'http\S+',
        r'www\.[^ ]+',
        # Numbers
        r'[0-9]+',
        # Special characters and punctuation marks ("-" and "[" are escaped so
        # the class does not depend on an accidental ",-." range).
        r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]',
    )

    tweets = table['tweet'].str.lower()
    for pattern in patterns:
        tweets = tweets.replace(pattern, '', regex=True)

    table['tweet'] = tweets
    return table

In [156]:
#Replace elongated words by identifying those repeated characters and then remove them and compare the new word with the english lexicon
def in_dict(word):
    """Return True when WordNet has at least one synset for ``word``.

    Always returns a real ``bool`` (``False`` rather than an implicit
    ``None`` when the word is unknown).
    """
    return bool(wordnet.synsets(word))

def replace_elongated_word(word):
    """Collapse repeated character runs until the word is found in WordNet.

    Each pass drops one copy of every immediately repeated group of
    characters. The loop stops as soon as ``in_dict`` accepts the current
    candidate or a pass no longer changes it.

    Parameters
    ----------
    word : str
        Candidate word, possibly elongated (e.g. repeated letters).

    Returns
    -------
    str
        The first candidate accepted by ``in_dict``, otherwise the fully
        collapsed form.
    """
    repeated_group = re.compile(r'(\w*)(\w+)\2(\w*)')
    candidate = word
    while not in_dict(candidate):
        collapsed = repeated_group.sub(r'\1\2\3', candidate)
        if collapsed == candidate:
            # Nothing left to collapse.
            break
        candidate = collapsed
    return candidate

def detect_elongated_words(row):
    """Shorten every elongated word found in ``row``.

    A word is a candidate when it contains an immediately repeated group of
    characters. Each candidate is passed to ``replace_elongated_word``, which
    returns it unchanged when it is already a dictionary word.

    The substitution is done per match with a callable replacement, so only
    the matched word itself is rewritten. Substituting the word text across
    the whole row would also alter longer words that merely contain it.

    Parameters
    ----------
    row : str
        Text of one tweet.

    Returns
    -------
    str
        The text with elongated words replaced.
    """
    def _shorten(match):
        # One match covers one whole word: the pattern starts and ends with
        # greedy ``\w*``.
        return replace_elongated_word(match.group(0))

    return re.sub(r'\w*(\w+)\1\w*', _shorten, row)

In [157]:
def stop_words(table):
    """Lowercase the ``tweet`` column and drop English stop words from it.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a string column named ``tweet``.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object; its ``tweet`` column is overwritten with
        the remaining words joined by single spaces.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per token.
    stop_word_set = set(stopwords.words('english'))

    def _drop_stop_words(text):
        return ' '.join(word for word in text.split() if word not in stop_word_set)

    table['tweet'] = table['tweet'].str.lower().apply(_drop_stop_words)
    return table

In [247]:
def replace_antonyms(word):
    """Return the name of the first WordNet antonym found for ``word``.

    Synsets and their lemmas are scanned in the order WordNet returns them;
    the first lemma that has any antonym decides the result.

    Returns
    -------
    str or None
        The antonym's name, or ``None`` when no lemma of ``word`` has one.
    """
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                return opposites[0].name()
    return None
            
def handling_negation(row):
    """Replace the first adjective that follows a negation with its antonym.

    The row is tokenized and the first ``"n't"`` or ``"not"`` token is
    located. The first adjective (POS tag ``JJ``, ``JJR`` or ``JJS``) after
    it is swapped for the antonym returned by ``replace_antonyms``, and the
    negation token is dropped because the antonym now carries the negation.

    The text is left as tokenized when there is no negation, no adjective
    after the negation, or no antonym for that adjective.

    Parameters
    ----------
    row : str
        Text of one tweet.

    Returns
    -------
    str
        The tokens joined by single spaces. Words before the negation are
        kept.
    """
    words = word_tokenize(row)

    negation_index = next(
        (i for i, token in enumerate(words) if token in ("n't", "not")),
        None,
    )
    if negation_index is None:
        # Without a negation there is nothing to invert.
        return ' '.join(words)

    # Tag the full sentence so tag positions line up with ``words``.
    tags = nltk.pos_tag(words)
    for index in range(negation_index + 1, len(words)):
        if tags[index][1] in ('JJ', 'JJR', 'JJS'):
            antonym = replace_antonyms(words[index])
            if antonym is not None:
                words[index] = antonym
                del words[negation_index]
            # Only the first adjective after the negation is considered.
            break

    return ' '.join(words)

In [None]:
# TODO: implement a helper that removes duplicate tweets.

In [248]:
def cleaning_table(table):
    """Run the full text-cleaning pipeline on the ``tweet`` column.

    Steps, in order: ``preprocessing_text``, ``detect_elongated_words`` and
    ``handling_negation`` (both applied per tweet), then ``stop_words``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a string column named ``tweet``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``stop_words``.
    """
    cleaned = preprocessing_text(table)
    for per_tweet_step in (detect_elongated_words, handling_negation):
        cleaned['tweet'] = cleaned['tweet'].apply(per_tweet_step)
    #cleaned = stemming_tweets()
    return stop_words(cleaned)

In [163]:
#Vectorization for Data Visualization
def vectorization(table):
    """Count how often each word occurs across all tweets in ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a string column named ``tweet``.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word (the word is the index) and a single
        column labelled ``0`` holding the total count.
    """
    # CountVectorizer converts the documents to a sparse matrix of token counts.
    vector = CountVectorizer()
    frequency_matrix = vector.fit_transform(table.tweet)

    # Summing over axis 0 gives a 1 x n_words matrix; flatten it to 1-D.
    frequency = np.asarray(frequency_matrix.sum(axis=0)).ravel()

    # ``get_feature_names`` was removed from newer scikit-learn releases in
    # favour of ``get_feature_names_out``; support both.
    if hasattr(vector, 'get_feature_names_out'):
        vocabulary = vector.get_feature_names_out()
    else:
        vocabulary = vector.get_feature_names()

    return pd.DataFrame({0: frequency}, index=vocabulary)

In [164]:
#Split Data into training and test dataset
def splitting(table, test_size=0.33, random_state=None):
    """Split tweets and sentiment labels into training and test subsets.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with ``tweet`` and ``sentiment`` columns.
    test_size : float, optional
        Fraction of rows assigned to the test subset (default ``0.33``).
    random_state : int or None, optional
        Seed for the shuffle. Pass an integer for a reproducible split; the
        default ``None`` leaves the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        table.tweet,
        table.sentiment,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [214]:
#Tokenization for analysis
def tokenization_tweets(dataset, vectorizer=None, max_features=50):
    """Turn a collection of texts into a dense TF-IDF feature array.

    Parameters
    ----------
    dataset : iterable of str
        Texts to transform.
    vectorizer : object, optional
        An already fitted vectorizer exposing ``transform``. When given it is
        used as is and not refitted, so test data can share the vocabulary
        learned on the training data. When omitted, a new ``TfidfVectorizer``
        is fitted on ``dataset``.
    max_features : int, optional
        Vocabulary size of the vectorizer created when ``vectorizer`` is
        omitted (default ``50``).

    Returns
    -------
    array
        Dense result of ``vectorizer.transform(dataset).toarray()``.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=max_features)
        vectorizer.fit(dataset)
    return vectorizer.transform(dataset).toarray()

In [227]:
#Create a Neural Network
#Create the model
def train(X_train_mod, y_train):
    """Build and fit a small feed-forward network for binary classification.

    Architecture: Dense(512, relu) -> Dropout(0.5) -> Dense(256, sigmoid)
    -> Dropout(0.5) -> Dense(1, sigmoid), trained with binary cross-entropy
    and the Adam optimizer for 5 epochs. 10% of the training data is held
    out for validation.

    Parameters
    ----------
    X_train_mod : array-like, 2-D
        Feature matrix; its second dimension sets the input width.
    y_train : array-like
        Training labels.
        NOTE(review): binary cross-entropy presumably needs numeric 0/1
        labels; confirm how the ``sentiment`` column is encoded.

    Returns
    -------
    keras.models.Sequential
        The fitted model.
    """
    features = np.asarray(X_train_mod)

    model_nn = Sequential()
    # Take the input width from the data instead of hard-coding 50, so the
    # model still builds when the vectorizer yields another feature count.
    model_nn.add(Dense(512, input_shape=(features.shape[1],), activation='relu'))
    model_nn.add(Dropout(0.5))
    model_nn.add(Dense(256, activation='sigmoid'))
    model_nn.add(Dropout(0.5))
    # Softmax over a single unit always outputs 1.0, so it cannot express a
    # probability; a one-unit binary output needs a sigmoid.
    model_nn.add(Dense(1, activation='sigmoid'))

    model_nn.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])
    model_nn.fit(features, y_train,
                 batch_size=32,
                 epochs=5,
                 verbose=1,
                 validation_split=0.1,
                 shuffle=True)
    return model_nn

In [243]:
def test(X_test, y_test, model_nn):
    prediction = model_nn.predict(X_test)
    return prediction

In [None]:
def _show_wordcloud(tweets):
    """Draw a word cloud from a Series of tweet strings; skip empty input."""
    text = tweets.str.cat(sep=' ')
    if not text.strip():
        # Nothing to draw for an empty subset.
        return
    wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(text)
    plt.figure(figsize=(12, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def _plot_word_frequency(ax, frequency_df, title):
    """Draw a bar chart of the most frequent words on the given axes."""
    # Rows 1..50 of the sorted frame: the single most frequent entry is
    # skipped. Presumably that is the search term itself -- TODO confirm.
    top_words = frequency_df[0][1:51]
    positions = np.arange(len(top_words))
    ax.bar(positions, top_words, width=0.8, color=sns.color_palette("bwr"),
           alpha=0.5, edgecolor='black', capsize=8, linewidth=1)
    ax.set_xticks(positions)
    ax.set_xticklabels(top_words.index, rotation=90, size=14)
    ax.set_xlabel('50 more frequent words', size=14)
    ax.set_ylabel('Frequency', size=14)
    ax.set_title(title, size=18)
    ax.grid(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


if __name__ == "__main__":
    tabletweets = 'tweets_avengers'
    tweet_table = query_database(tabletweets)

    tweet_table = cleaning_table(tweet_table)

    positive_tweets = tweet_table[tweet_table['sentiment'] == 'positive']
    negative_tweets = tweet_table[tweet_table['sentiment'] == 'negative']

    # Word clouds: all tweets, then positive only, then negative only.
    for subset in (tweet_table, positive_tweets, negative_tweets):
        _show_wordcloud(subset.tweet)

    # Word frequencies for all, positive and negative tweets, most frequent first.
    word_frequency = vectorization(tweet_table).sort_values(0, ascending=False)
    word_frequency_positive = vectorization(positive_tweets).sort_values(0, ascending=False)
    word_frequency_negative = vectorization(negative_tweets).sort_values(0, ascending=False)

    # One panel per subset, each with its own axes.
    fig, axes = plt.subplots(3, 1, figsize=(16, 24), sharex=False, sharey=False)
    panels = (
        (word_frequency, 'Word Frequency'),
        (word_frequency_positive, 'Word Frequency (positive tweets)'),
        (word_frequency_negative, 'Word Frequency (negative tweets)'),
    )
    for ax, (frequency_df, title) in zip(axes, panels):
        _plot_word_frequency(ax, frequency_df, title)
    fig.tight_layout()
    plt.show()

In [None]:
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = splitting(tweet_table)

    # Fit the TF-IDF vocabulary on the training tweets only and reuse it for
    # the test tweets, so both matrices have the same columns. The model
    # needs numeric features; raw text cannot be passed to ``predict``.
    tfidf = TfidfVectorizer(max_features=50)
    X_train_mod = tfidf.fit_transform(X_train).toarray()
    X_test_mod = tfidf.transform(X_test).toarray()

    # NOTE(review): train() compiles with binary_crossentropy, while the
    # sentiment column is compared against the strings 'positive'/'negative'
    # elsewhere in this notebook -- confirm the labels are numeric 0/1 here.
    model = train(X_train_mod, y_train)
    predictions = test(X_test_mod, y_test, model)

In [272]:
# Unique, non-null cleaned tweets: the first occurrence of each text is kept.
table = tweet_table['tweet'].drop_duplicates(keep='first').dropna()

id
1         video lee sugeun youtube channel subscribers s...
2                   avengers trailer told spongebob friends
3         ok watched avengers infinity war first time kn...
4                                     always good know back
5         making new oreo products used easy like “let’s...
7         drawing board today scarletwitch avengers marv...
11        theory avengers endgame trailer see scott van ...
12        avengers wong genius care anyone else says😂😂😂 ...
13                                      avengers age ultron
14                                     seriously can't wait
18                                    avengers mike mignola
20        made avengers meme laugh away pain endgame tra...
21        avengers endgame imax trailer hits theaters we...
24                                             smh avengers
25        didn’t chris evans along jeremy rener call bla...
34        loss something watch best movies new netflix d...
35        dory's avengers alison jack