In [1]:
from nltk import FreqDist
import pickle
import sys
from collections import Counter
from tqdm import tqdm

In [2]:
# define some helper functions

def analyze_tweet(tweet):
    """Collect simple per-tweet statistics.

    Args:
        tweet: The tweet text as a single string.

    Returns:
        A 3-tuple ``(stats, tokens, pairs)`` where ``stats`` is a dict with
        the keys ``'POS_EMOS'``, ``'NEG_EMOS'``, ``'WORDS'`` and ``'BIGRAMS'``,
        ``tokens`` is the whitespace-split word list and ``pairs`` is the list
        of adjacent word pairs produced by ``get_bigrams``.
    """
    tokens = tweet.split()
    pairs = get_bigrams(tokens)
    stats = {
        # Substring counts over the raw text, not token counts.
        # NOTE(review): EMO_POS / EMO_NEG are presumably emoticon placeholders
        # inserted by an earlier preprocessing step -- confirm.
        'POS_EMOS': tweet.count('EMO_POS'),
        'NEG_EMOS': tweet.count('EMO_NEG'),
        'WORDS': len(tokens),
        'BIGRAMS': len(pairs),
    }
    return stats, tokens, pairs

def get_bigrams(tweet_words):
    """Return every pair of adjacent words, in order of appearance.

    Args:
        tweet_words: Sequence of words from one tweet.

    Returns:
        A list of ``(word, next_word)`` tuples; empty when fewer than two
        words are given.
    """
    # Pairing the sequence with itself shifted by one yields each neighbour
    # pair; zip stops at the shorter (shifted) sequence.
    return list(zip(tweet_words, tweet_words[1:]))

def get_bigram_freqdist(bigrams):
    """Count how many times each bigram occurs.

    Args:
        bigrams: Iterable of hashable bigrams (e.g. ``(word, next_word)``
            tuples).

    Returns:
        A ``collections.Counter`` mapping each distinct bigram to its number
        of occurrences.
    """
    # Counter tallies an iterable directly; no hand-built dict is needed.
    return Counter(bigrams)

def top_n_words(pkl_file_name, N, shift=0):
    """Map the N most frequent words in a pickled distribution to their rank.

    Args:
        pkl_file_name: Path to a pickle file holding an object that provides
            ``most_common(N)``.
        N: How many of the most frequent entries to keep.
        shift: Offset added to every rank (rank of the top word is ``shift``).

    Returns:
        A dict ``{word: rank + shift}`` with rank 0 for the most frequent word.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(pkl_file_name, 'rb') as handle:
        distribution = pickle.load(handle)
    ranking = {}
    for position, entry in enumerate(distribution.most_common(N)):
        # Each entry is (word, count); only the word is kept.
        ranking[entry[0]] = position + shift
    return ranking

def top_n_bigrams(pkl_file_name, N, shift=0):
    """Map the N most frequent bigrams in a pickled distribution to their rank.

    Args:
        pkl_file_name: Path to a pickle file holding an object that provides
            ``most_common(N)``.
        N: How many of the most frequent entries to keep.
        shift: Offset added to every rank, matching ``top_n_words``. The
            default of 0 leaves ranks starting at 0.

    Returns:
        A dict ``{bigram: rank + shift}`` with rank 0 for the most frequent
        bigram.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    # Apply the offset; previously `shift` was accepted but never used.
    bigrams = {p[0]: i + shift for i, p in enumerate(most_common)}
    return bigrams

In [3]:
# initialize some variables

# How many of the most frequent unigrams / bigrams to report.
N = 20

# Every input and output file lives in one dataset directory; keeping the
# prefix in a single place means only this line changes on another machine.
data_dir = 'D:\\COMP90024_Assignment2\\sentiment analysis\\data_set_3\\'
processed_csv = data_dir + 'sentiment140-processed.csv'            # input: id,label,tweet lines
unique_words_file = data_dir + 'sentiment140-uniquewords.txt'      # output: one word per line
unigrams_file = data_dir + 'sentiment140-freqdist.pkl'             # output: pickled unigram counts
bigrams_file = data_dir + 'sentiment140-freqdist-bi.pkl'           # output: pickled bigram counts

# Running totals, filled in while the CSV is scanned.
num_tweets, num_pos_tweets, num_neg_tweets, num_neu_tweets = 0, 0, 0, 0
num_emojis, num_pos_emojis, num_neg_emojis, max_emojis = 0, 0, 0, 0
# min_words starts at a large sentinel so the first min() comparison replaces it.
num_words, num_unique_words, min_words, max_words = 0, 0, 1e6, 0
num_bigrams, num_unique_bigrams = 0, 0
# Flat lists of every word / bigram seen across all tweets.
all_words = []
all_bigrams = []

In [4]:
# read from csv

# Load every processed tweet line up front so the progress bar knows the total.
with open(processed_csv, 'r') as csv_file:
    lines = csv_file.readlines()

num_tweets = len(lines)

# NOTE: the counters below are initialised in an earlier cell and only ever
# incremented here, so re-running just this cell adds to the previous totals.
for line in tqdm(lines):
    # maxsplit=2 keeps any commas inside the tweet text with the tweet instead
    # of raising ValueError on unpacking.
    t_id, label, tweet = line.strip().split(',', 2)
    label = int(label)
    # Label 4 is counted as positive, 0 as negative, anything else as neutral.
    if label == 4:
        num_pos_tweets += 1
    elif label == 0:
        num_neg_tweets += 1
    else:
        num_neu_tweets += 1

    result, words, bigrams = analyze_tweet(tweet)

    # Emoji placeholder statistics.
    num_pos_emojis += result['POS_EMOS']
    num_neg_emojis += result['NEG_EMOS']
    max_emojis = max(
        max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])

    # Word statistics.
    num_words += result['WORDS']
    min_words = min(min_words, result['WORDS'])
    max_words = max(max_words, result['WORDS'])
    all_words.extend(words)

    # Bigram statistics.
    num_bigrams += result['BIGRAMS']
    all_bigrams.extend(bigrams)

100%|██████████| 1048956/1048956 [00:07<00:00, 143216.56it/s]


In [5]:
# write results to files

num_emojis = num_pos_emojis + num_neg_emojis

# Sort the vocabulary so the file is identical from run to run; plain set
# iteration order for strings can differ between interpreter sessions.
unique_words = sorted(set(all_words))
with open(unique_words_file, 'w') as uwf:
    uwf.write('\n'.join(unique_words))
num_unique_words = len(unique_words)
print('\nCalculating frequency distribution...')

# Unigrams
freq_dist = FreqDist(all_words)
with open(unigrams_file, 'wb') as pkl_file:
    pickle.dump(freq_dist, pkl_file)
print('Saved uni-frequency distribution to %s' % unigrams_file)

# Bigrams
bigram_freq_dist = get_bigram_freqdist(all_bigrams)
# The distribution has one key per distinct bigram, so its length is the
# unique-bigram count; this avoids building a second large set.
num_unique_bigrams = len(bigram_freq_dist)
with open(bigrams_file, 'wb') as pkl_file:
    pickle.dump(bigram_freq_dist, pkl_file)
print('Saved bi-frequency distribution to %s' % bigrams_file)

# Top-N results (read back from the pickle files written just above)
print('Calculating top-N results...')
top_words = top_n_words(unigrams_file, N)
top_bigrams = top_n_bigrams(bigrams_file, N)


Calculating frequency distribution...
Saved uni-frequency distribution to D:\COMP90024_Assignment2\sentiment analysis\data_set_3\sentiment140-freqdist.pkl
Saved bi-frequency distribution to D:\COMP90024_Assignment2\sentiment analysis\data_set_3\sentiment140-freqdist-bi.pkl
Calculating top-N results...


In [6]:
# print analytical results

# Report the aggregate statistics gathered above; averages are per tweet.
tweet_total = float(num_tweets)

print('\n[Analysis Statistics]')

tweet_row = (num_tweets, num_pos_tweets, num_neg_tweets, num_neu_tweets)
print('Tweets => Total: %d, Positive: %d, Negative: %d, Neutral: %d' % tweet_row)

emoji_row = (num_emojis, num_pos_emojis, num_neg_emojis,
             num_emojis / tweet_total, max_emojis)
print('Emojis => Total: %d, Positive: %d, Negative: %d, Avg: %.4f, Max: %d' % emoji_row)

word_row = (num_words, num_unique_words,
            num_words / tweet_total, max_words, min_words)
print('Words => Total: %d, Unique: %d, Avg: %.4f, Max: %d, Min: %d' % word_row)

bigram_row = (num_bigrams, num_unique_bigrams, num_bigrams / tweet_total)
print('Bigrams => Total: %d, Unique: %d, Avg: %.4f' % bigram_row)

# Each mapping goes from an item to its frequency rank.
print('\nTop -', N, 'words ==>', top_words)
print('\nTop -', N, 'bigrams ==>', top_bigrams)


[Analysis Statistics]
Tweets => Total: 1048956, Positive: 248958, Negative: 799998, Neutral: 0
Emojis => Total: 8287, Positive: 6649, Negative: 1638, Avg: 0.0079, Max: 16
Words => Total: 13204097, Unique: 208566, Avg: 12.5878, Max: 40, Min: 0
Bigrams => Total: 12157734, Unique: 2366158, Avg: 11.5903

Top - 20 words ==> {'i': 0, 'to': 1, 'the': 2, 'a': 3, 'my': 4, 'and': 5, 'is': 6, 'it': 7, 'you': 8, 'in': 9, 'for': 10, 'im': 11, 'of': 12, 'me': 13, 'on': 14, 'so': 15, 'have': 16, 'but': 17, 'that': 18, 'not': 19}

Top - 20 bigrams ==> {('i', 'have'): 0, ('in', 'the'): 1, ('going', 'to'): 2, ('i', 'dont'): 3, ('i', 'am'): 4, ('i', 'cant'): 5, ('to', 'go'): 6, ('have', 'to'): 7, ('i', 'was'): 8, ('to', 'be'): 9, ('but', 'i'): 10, ('and', 'i'): 11, ('for', 'the'): 12, ('to', 'the'): 13, ('on', 'the'): 14, ('i', 'miss'): 15, ('want', 'to'): 16, ('have', 'a'): 17, ('i', 'think'): 18, ('to', 'get'): 19}
