In [1]:
from nltk import FreqDist
import pickle
import sys
import random
import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix

import import_ipynb
from preprocess import *
from stats import *

importing Jupyter notebook from preprocess.ipynb
importing Jupyter notebook from stats.ipynb


In [2]:
def get_stats(dataframe):
    
    stats = pd.DataFrame(columns=['Total', 'Unique', 'Average', 'Max', 'Positive', 'Negative'],
                        index=['Tweets', 'User Mentions', 'Emoticons', 'URLs', 'Unigrams', 'Bigrams'])
    
    num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
    num_mentions, max_mentions = 0, 0
    num_emojis, num_pos_emojis, num_neg_emojis, max_emojis = 0, 0, 0, 0
    num_urls, max_urls = 0, 0
    num_words, num_unique_words, min_words, max_words = 0, 0, 1e6, 0
    num_bigrams, num_unique_bigrams = 0, 0
    all_words = []
    all_bigrams = []
    
    #counting the number of total, positive, and negative tweets
    num_tweets = len(dataframe)
    for target in dataframe['TARGET']:
        if target:
            num_pos_tweets += 1
        else:
            num_neg_tweets += 1
            
    #analyze text properties
    for tweet in dataframe['TEXT']:
        result, words, bigrams = analyze_tweet(tweet)

        #look at mentions
        num_mentions += result['MENTIONS']
        max_mentions = max(max_mentions, result['MENTIONS'])

        #look at emojis
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])

        #look at URLs
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])

        #look at unigrams
        num_words += result['WORDS']
        min_words = min(min_words, result['WORDS'])
        max_words = max(max_words, result['WORDS'])
        all_words.extend(words)

        #look at bigrams
        num_bigrams += result['BIGRAMS']
        all_bigrams.extend(bigrams)
        
    num_emojis = num_pos_emojis + num_neg_emojis

    #find unique unigrams and bigrams
    unique_words = list(set(all_words))
    unique_bigrams = list(set(all_bigrams))

    #count unique unigrams and bigrams
    num_unique_words = len(unique_words)
    num_unique_bigrams = len(all_bigrams)

    #finding the frequency distribution for unigrams and bigrams
    freq_dist = FreqDist(all_words)
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
    
    #calculate average of each per tweet
    avg_mentions = num_mentions / num_tweets
    avg_emojis = num_emojis / num_tweets
    avg_urls = num_urls / num_tweets
    avg_words = num_words / num_tweets
    avg_bigrams = num_bigrams / num_tweets
    
    stats['Total'] = [num_tweets, num_mentions, num_emojis, num_urls, num_words, num_bigrams]
    stats['Unique'] = ['-', '-', '-', '-', num_unique_words, num_unique_bigrams]
    stats['Average'] = ['-', avg_mentions, avg_emojis, avg_urls, avg_words, avg_bigrams]
    stats['Max'] = ['-', max_mentions, max_emojis, max_urls, max_words, '-']
    stats['Positive'] = ['-', '-', num_pos_emojis, '-', '-', '-']
    stats['Negative'] = ['-', '-', num_neg_emojis, '-', '-', '-']
    
    return stats

In [3]:
def extract_features(tweets, unigrams, bigrams, VOCAB_SIZE, UNIGRAM_SIZE, USE_BIGRAMS, BATCH_SIZE=500):
    
    num_batches = int(np.ceil(len(tweets) / float(BATCH_SIZE)))
    for i in range(num_batches):
        batch = tweets[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
        features = lil_matrix((BATCH_SIZE, VOCAB_SIZE))
        labels = np.zeros(BATCH_SIZE)
        for j, tweet in enumerate(batch):
            tweet_words = tweet[2][0]
            tweet_bigrams = tweet[2][1]
            labels[j] = tweet[1]
            for word in tweet_words:
                idx = unigrams.get(word)
                if idx:
                    features[j, idx] += 1
            if USE_BIGRAMS:
                for bigram in tweet_bigrams:
                    idx = bigrams.get(bigram)
                    if idx:
                        features[j, UNIGRAM_SIZE + idx] += 1
        yield features, labels

In [4]:
def model_data(dataframe, model_type, unigram_size=15000, use_bigrams=True, bigram_size=10000):
    UNIGRAM_SIZE = unigram_size
    VOCAB_SIZE = UNIGRAM_SIZE
    USE_BIGRAMS = use_bigrams
    if USE_BIGRAMS:
        BIGRAM_SIZE = bigram_size
        VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
    
    np.random.seed(1337)
    
    #get the list of unigrams/bigrams
    unigrams = top_n_words(sample_data, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = top_n_bigrams(sample_data, BIGRAM_SIZE)
    
    #split this into the training data and test data
    tweets = []
    print('Generating feature vectors')
    total = len(dataframe)
    
    
    for information in dataframe.values:
        tweet_id, sentiment, tweet = information
        
        #getting the feature vector
        uni_feature_vector = []
        bi_feature_vector = []
        words = tweet.split()
        #adding each word in the tweet and the word ahead of it for bigrams
        for i in range(len(words) - 1):
            word = words[i]
            next_word = words[i + 1]
            if unigrams.get(word):
                uni_feature_vector.append(word)
            if USE_BIGRAMS:
                if bigrams.get((word, next_word)):
                    bi_feature_vector.append((word, next_word))
        #adding last word to make sure it isn't left out
        if len(words) >= 1:
            if unigrams.get(words[-1]):
                uni_feature_vector.append(words[-1])
        feature_vector = uni_feature_vector, bi_feature_vector
        
        sentiment = 1 if sentiment else 0
        tweets.append((tweet_id, int(sentiment), feature_vector))
    print('\n')
    
    train_tweets, val_tweets = split_data(tweets)
    del tweets
    
    print('Extracting features & training batches')
    
    #initialize the model according to type given by user
    if model_type is 'NAIVEBAYES':
        clf = MultinomialNB()
    elif model_type is 'LOGISTICREGRESSION':
        clf = LogisticRegression()
    elif model_type is 'DECISIONTREE':
        clf = DecisionTreeClassifier()
    elif model_type is 'RANDOMFOREST':
        clf = RandomForestClassifier()
    else:
        raise NameError(model_type + ' is not a valid model type.')
    
    batch_size = len(train_tweets)
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    
    #fit the model
    for training_set_X, training_set_y in extract_features(val_tweets, unigrams, bigrams, VOCAB_SIZE, UNIGRAM_SIZE, USE_BIGRAMS, BATCH_SIZE=batch_size):
        tfidf = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
        tfidf.fit(training_set_X)
        training_set_X = tfidf.transform(training_set_X)
    clf.fit(training_set_X, training_set_y)
    
    #determine accuracy of model
    correct, total = 0, len(val_tweets)
    batch_size = len(val_tweets)
    n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
    for val_set_X, val_set_y in extract_features(val_tweets, unigrams, bigrams, VOCAB_SIZE, UNIGRAM_SIZE, USE_BIGRAMS, BATCH_SIZE=batch_size):
        val_set_X = tfidf.transform(val_set_X)
        prediction = clf.predict(val_set_X)
        correct += np.sum(prediction == val_set_y)
    print('\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total))
    
    #creating confusion matrix
    con_mat = confusion_matrix(prediction, val_set_y)
    
    acc_table = pd.DataFrame(data=con_mat, columns=['Actually Negative', 'Actually Positive'], index=['Classified As Negative', 'Classified As Positive'])
    
    return correct, total, acc_table

In [5]:
data = pd.read_csv("""/Users/jacobliu/Documents/Schoolwork/Computer Science/Projects/TweetSentimentAnalysis/training.1600000.processed.noemoticon.csv""", 
                   names=['TARGET', 'ID', 'TIMESTAMP', 'FLAG', 'USER', 'TEXT'],
                   encoding = "ISO-8859-1")

In [8]:
sample_data = preprocess_dataframe(data, slice_value=1, positive_value=4)
sample_data.head()

Unnamed: 0,ID,TARGET,TEXT
919796,1753884557,True,heaos keen for next weekend mummy
258778,1985270993,False,USER_MENTION i feel all the way round everythi...
865590,1677489256,True,just visited with god and mr god
849630,1565165223,True,USER_MENTION USER_MENTION wow sounds exciting ...
163699,1958106109,False,wish i was wembley


In [9]:
correct_bayes, total_bayes, acc_table_bayes = model_data(sample_data, 'NAIVEBAYES')
correct_tree, total_tree, acc_table_tree = model_data(sample_data, 'LOGISTICREGRESSION')
correct_for, total_for, acc_table_for = model_data(sample_data, 'DECISIONTREE')

Generating feature vectors


Extracting features & training batches

Correct: 79989/160000 = 49.9931 %
Generating feature vectors


Extracting features & training batches


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Correct: 116610/160000 = 72.8812 %
Generating feature vectors


Extracting features & training batches

Correct: 159126/160000 = 99.4537 %
