Oulu_NLPTM_TwitterBrexit

In [None]:
import re
from collections import Counter
import nltk

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
def load(filename, encoding="utf-8"):
    """Read a text file that stores one tweet per line.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters are decoded the same way on every
            platform instead of depending on the locale's default encoding.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings.
    """
    with open(filename, 'r', encoding=encoding) as f:
        tweet_list = [tweet.strip() for tweet in f]

    print(f"loaded tweets from {filename}")
    return tweet_list

In [None]:
# Read both corpora (one tweet per line). The tuple is evaluated left to
# right, so the conservative file is loaded first, then the labour file.
conservative_tweets, labour_tweets = (
    load("conservative_tweets_preprocessed.txt"),
    load("labour_tweets_preprocessed.txt"),
)

In [None]:
#getting top 10 frequent words
def top10(tweets):
    """Return the ten most frequent whitespace-separated tokens.

    Args:
        tweets: Iterable of tweet strings.

    Returns:
        A list of at most ten ``(token, count)`` tuples ordered from the most
        to the least frequent token, as produced by ``Counter.most_common``.
    """
    # Count tokens straight from a generator; no intermediate nested list.
    token_frequencies = Counter(
        token for text in tweets for token in text.split()
    )
    return token_frequencies.most_common(10)

In [None]:
# Rank the ten most frequent tokens of each party's tweets; every entry is a
# (token, count) tuple as returned by top10().
top10_conservative = top10(conservative_tweets)
top10_labour = top10(labour_tweets)

# A single loop prints both rankings so the two reports cannot drift apart
# (the original duplicated the loop and misspelled "occurrences").
rankings = (
    ('TOP10 words by conservatives: ', top10_conservative),
    ('TOP10 words by labour: ', top10_labour),
)
for section, (heading, ranking) in enumerate(rankings):
    if section:
        print()  # blank line between the two rankings
    print(heading)
    for rank, (word, count) in enumerate(ranking, start=1):
        print(f"{rank}. wordstem:'{word}' occurrences: {count}")


In [None]:
# Reduce each (word, count) ranking to the set of its words.
top10_conservative_wordset = {entry[0] for entry in top10_conservative}
top10_labour_wordset = {entry[0] for entry in top10_labour}

# NOTE: despite the "index" in its name, this variable stores the result of
# nltk.jaccard_distance, which is what the message below reports.
jaccard_index_top10 = nltk.jaccard_distance(
    top10_conservative_wordset,
    top10_labour_wordset,
)

print(f'Jaccard distance based on TOP10 most frequent words: {jaccard_index_top10}')

In [None]:
# from https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the ten most frequent vocabulary terms.

    Args:
        count_data: Document-term count matrix produced by
            ``count_vectorizer``; it has one column per vocabulary term.
        count_vectorizer: The fitted vectorizer whose feature names label
            the columns of ``count_data``.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = list(feature_names())

    # Sum every column in one vectorised call instead of converting the
    # matrix to dense rows one document at a time.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # sorted() is stable, so equally frequent terms keep vocabulary order.
    top_terms = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:10]
    words = [term for term, _ in top_terms]
    counts = [total for _, total in top_terms]
    x_pos = np.arange(len(words))

    # Width-to-height ratio of the figure is 1.618.
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # seaborn >= 0.12 accepts only `data` positionally, so x and y must be
    # passed as keywords.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
    
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: Fitted topic model exposing ``components_``; each row is
            treated as one topic's per-term weights.
        count_vectorizer: Fitted vectorizer whose feature names label the
            columns of ``model.components_``.
        n_top_words: Number of words to print per topic.

    Returns:
        None. For each topic a "Topic #<index>:" header and one
        space-separated line of words are written to standard output.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort() is ascending, so the reversed tail slice yields the
        # n_top_words largest weights, biggest first.
        print(" ".join([str(words[i]) for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
def get_topic_words(model, count_vectorizer, n_top_words):
    """Collect the highest-weighted words of every topic into one list.

    Args:
        model: Fitted topic model exposing ``components_``; each row is
            treated as one topic's per-term weights.
        count_vectorizer: Fitted vectorizer whose feature names label the
            columns of ``model.components_``.
        n_top_words: Number of words to take from each topic.

    Returns:
        A flat list of ``str`` holding, topic after topic, each topic's
        ``n_top_words`` words ordered from highest to lowest weight. A word
        that ranks highly in several topics appears once per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()

    topic_words = []
    for topic in model.components_:
        # argsort() is ascending, so the reversed tail slice yields the
        # n_top_words largest weights, biggest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        topic_words.extend(str(words[i]) for i in top_indices)
    return topic_words

In [None]:
# Build one bag-of-words model per party so that each corpus gets its own
# vocabulary. CountVectorizer() is created with its default settings; no
# stop-word list is passed here.

# Conservative tweets: learn the vocabulary and count each term per tweet
count_vectorizer_conservative = CountVectorizer()
count_data_conservative = count_vectorizer_conservative.fit_transform(conservative_tweets)

# Labour tweets: same procedure with an independent vectorizer
count_vectorizer_labour = CountVectorizer()
count_data_labour = count_vectorizer_labour.fit_transform(labour_tweets)

# Optional: uncomment to visualise the 10 most common words of each corpus
#print("conservative")
#plot_10_most_common_words(count_data_conservative, count_vectorizer_conservative)

#print("labour")
#plot_10_most_common_words(count_data_labour, count_vectorizer_labour)

In [None]:
# Tweak the parameters below
number_topics = 5
number_words = 10
# LDA fitting is randomised. A fixed seed makes the fitted topics, and every
# number derived from them, identical on each fresh run of the notebook.
random_seed = 42

# Create and fit the LDA model for the conservative tweets
lda_conservative = LDA(n_components=number_topics, n_jobs=-1, random_state=random_seed)
lda_conservative.fit(count_data_conservative)

# Print the topics found by the LDA model
#print("Topics found via LDA for conservative tweets:")
#print_topics(lda_conservative, count_vectorizer_conservative, number_words)

# Create and fit the LDA model for the labour tweets
lda_labour = LDA(n_components=number_topics, n_jobs=-1, random_state=random_seed)
lda_labour.fit(count_data_labour)

# Print the topics found by the LDA model
#print("Topics found via LDA for labour tweets:")
#print_topics(lda_labour, count_vectorizer_labour, number_words)

In [None]:
# Pool the top words of all topics per party; set() collapses words that
# rank highly in more than one topic.
conservative_topic_vocab = get_topic_words(
    lda_conservative, count_vectorizer_conservative, number_words
)
labour_topic_vocab = get_topic_words(
    lda_labour, count_vectorizer_labour, number_words
)
topic_words_conservative = set(conservative_topic_vocab)
topic_words_labour = set(labour_topic_vocab)

#print(topic_words_conservative)
#print(topic_words_labour)

# NOTE: despite the "index" in its name, this variable stores the result of
# nltk.jaccard_distance, which is what the message below reports.
jaccard_index_topics = nltk.jaccard_distance(
    topic_words_conservative,
    topic_words_labour,
)

print(f'Jaccard distance for topic words: {jaccard_index_topics}')