# CS690V Midterm - Divyesh Harit

Dataset used: 170K UK Tweets Data (http://followthehashtag.com/datasets/170000-uk-geolocated-tweets-free-twitter-dataset/).

The dataset was collected over a period of 1 week in April last year, and contains tweets only from the UK (almost all are from GB and IE).

Goal: Explore the dataset using interactive Bokeh visualizations.

# Part 1: Data handling
# 1.1 Preprocessing

Here, we will perform the following steps:
1. Read in the dataset
2. Extract just the tweets from the dataset
3. Try to remove most of the stop words using NLTK Corpus
4. Strip the tweets of URLs, non-alphanumeric characters and numbers
5. Reduce words to their stem
6. Get word counts of the stemmed words
7. Sort the word counts

In [10]:
#Imports
import pandas as pd
import numpy as np
import csv
import re
from pandas import ExcelWriter
from pandas import ExcelFile
import random
import collections

from sklearn import cluster
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import bokeh
from bokeh.io import output_notebook,output_file
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, HoverTool, Legend, BoxZoomTool, ResetTool, LassoSelectTool, WheelZoomTool, PanTool
from bokeh.models import Label
from bokeh.plotting import figure, show
from bokeh.layouts import widgetbox, row, column, layout
from bokeh.models import CustomJS, Select, Slider
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import warnings
# Suppress DeprecationWarning and UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)

# Send Bokeh output to the notebook so show() renders plots inline.
output_notebook()

In [11]:
#Read data and extract just the tweets
# Loads the 'Stream' sheet of uk_tweets.xlsx, skipping the first row of the sheet.
# NOTE(review): with row 0 skipped, the following row is used as the header --
# confirm the sheet really has an extra row above the column names.
# NOTE(review): newer pandas releases spell this keyword `sheet_name` -- confirm
# which pandas version this notebook targets before renaming.
df = pd.read_excel('uk_tweets.xlsx', sheetname = 'Stream', skiprows=[0])
# Selecting with a list keeps a one-column DataFrame, so `tweets` is a list of
# one-element lists (one inner list per row).
# NOTE(review): presumably column 6 holds the tweet text -- verify against the sheet.
tweets = df[[6]].values.tolist()

In [12]:
#Remove stop words
# Flatten every tweet row into one comma-separated string so that a single
# regex pass can strip the stop words from all tweets at once.
texts = ','.join(unicode(t) for t in tweets)
# Escape each stop word so it is always matched literally, and match
# case-insensitively: the NLTK list is lower-case only, so a case-sensitive
# pattern would leave capitalised forms such as "I", "The" or "And" in place.
stop_word_alternation = r'|'.join(re.escape(w) for w in stopwords.words('english'))
pattern = re.compile(r'\b(?:' + stop_word_alternation + r')\b\s*', re.IGNORECASE)
texts = pattern.sub('', texts)

In [13]:
#Remove URLs, non-alphanumeric characters and numbers
# The patterns are compiled once, outside the loop.
# A URL is matched only up to the next whitespace, so the words that follow a
# link are kept (a greedy `.*` would delete the rest of the line).
url_pattern = re.compile(r'https?://\S+')
# Punctuation to drop: ? | $ . \ ! # - " newline , @ ( )
punctuation_pattern = re.compile(r'[?|$.\\!#\-"\n,@()]')
# Digits, plus the '|' and ':' characters.
digit_pattern = re.compile(r'[0-9|:]')

texts_list = []
for fragment in texts.split(','):
    # URLs must go first, while their dots and slashes are still intact.
    fragment = url_pattern.sub('', fragment)
    fragment = punctuation_pattern.sub('', fragment)
    fragment = digit_pattern.sub('', fragment)
    texts_list.append(fragment)

In [14]:
#Preparing for stemming..
# Flatten the cleaned fragments into one list of space-separated tokens.
# Splitting on a single space keeps empty strings for repeated spaces.
# The loop variable is deliberately not called `row`, so it does not overwrite
# the `row` layout helper imported from bokeh.layouts.
all_words = []
for fragment in texts_list:
    all_words.extend(fragment.split(' '))

In [15]:
#Reduce words to their root by stemming, and get word count
# Every token is stemmed exactly once and the stems are tallied with
# collections.Counter; the result is converted back to a plain dict mapping
# stem -> number of occurrences.
ps = PorterStemmer()
stem_wordcount = dict(collections.Counter(ps.stem(word) for word in all_words))

In [16]:
# Rank the stems from most to least frequent, then freeze that ranking in an
# OrderedDict so later iteration follows frequency order.
ranked_stems = sorted(stem_wordcount.items(), key=lambda pair: pair[1], reverse=True)
stem_wordcount = collections.OrderedDict(ranked_stems)

# 1.2 Exploration

Here, we will explore the data. What trends are there in the top words obtained from the tweets? We will build an interactive frequency distribution graph to see how the top words are distributed.

In [17]:
#Prepare data

# Take the 101 most frequent stems and drop the first one, leaving at most the
# top 100 for plotting. Slicing (instead of `del ...[0]`) also copes with an
# empty word count.
# NOTE(review): presumably the discarded top entry is the empty string produced
# by splitting on single spaces -- confirm before relying on it.
top_words_count = collections.Counter(stem_wordcount).most_common(101)[1:]

# Parallel lists for the plot: the stem as text and its frequency as a float.
top_words = [str(entry[0]) for entry in top_words_count]
top_count = [float(entry[1]) for entry in top_words_count]


In [18]:
#Plot Frequency distribution

# Ranks are derived from the data so the x column always has the same length
# as the word and count columns.
ranks = np.arange(1, len(top_count) + 1)
source = ColumnDataSource(data=dict(
    x=ranks,
    y=top_count,
    word=top_words,
))

# Tooltip fields read the `word` and `y` columns of the data source.
hover = HoverTool(tooltips=[("(Word)", "(@word)"), ("(Freq.)", "(@y)"),])

p = figure(plot_width=900, plot_height=500, title="Frequency distribution of top 100 stemmed words",
          tools = [hover, PanTool(), BoxZoomTool(), ResetTool(), LassoSelectTool(), WheelZoomTool()])
# Refer to the source columns by name: passing literal sequences together with
# `source=` is not supported by Bokeh and would detach the bars from the
# columns that the hover tool reads.
p.vbar(x='x', top='y', width=0.5, color="firebrick", source=source)

show(p)

# Observations
We can see that the first few top words do not follow a smooth distribution, but the counts eventually stabilize. The distribution is almost uniform across the words ranked 35-100. This tells us these are quite common words, either in the English language itself, or in the region, or due to some event at that time.

# Part 2 : Analysis and Interactions

# 2.1:  Topic modeling

# a) Clustering by Topic

Here, we cluster words according to the similar inherent topic they belong to, and visualize how often that topic is being mentioned in all the tweets.
Interaction: Hover to see the topic occurrence as compared to the total number of tweets.

In [19]:
#Prepare data: Get wordcount of full words instead of just stems
# Documents for topic modelling: a shallow copy of the cleaned tweet fragments.
data_for_topic = texts_list[:]
# Tally how often each unstemmed token occurs.
full_wordcount = {}
for token in all_words:
    full_wordcount[token] = full_wordcount.get(token, 0) + 1

In [20]:
#Get names of features
# Upper bound on the vocabulary size kept by the vectorizer.
no_features = 1000
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=no_features,
)
# Learn the vocabulary and build the TF-IDF document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(data_for_topic)
# Column index -> term lookup, used later to name each topic's top terms.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [21]:
#Run NMF to model topics for the features
# Number of topics (NMF components) to extract from the TF-IDF matrix.
no_topics = 15
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
# fit() trains the estimator in place; `nmf.components_` is read by later cells.
nmf.fit(tfidf)

In [22]:
#Topics Clustering-Scatter Plot

# For every NMF topic, sum the corpus counts of its top terms and draw one
# marker at (number of tokens in the corpus, summed count for the topic).
no_top_words = 15
topic_list = []
hover = HoverTool(tooltips=[
                ("(Tweets, Topic count)", "($x, $y)"),
                ])
p = figure(plot_width=600, plot_height=550, title="Topics clustering: Topic occurence vs Total Tweets", tools = [hover, PanTool(), BoxZoomTool(), ResetTool(), LassoSelectTool(), WheelZoomTool()])

# One (marker method, legend text, colour) entry per topic index. The legend
# texts are hand-written labels, so they have to be revisited whenever the
# topic model changes. Line and fill colour are the same for every topic.
topic_styles = [
    ("circle", "Weather/Temperature", "red"),
    ("circle", "Speed, Distances", "blue"),
    ("circle", "Things you see in a city", "green"),
    ("circle", "City names", "orange"),
    ("circle", "Football", "black"),
    ("square", "Wind related", "red"),
    ("square", "Air pressure", "blue"),
    ("square", "Wind", "green"),
    ("square", "Places to visit", "orange"),
    ("square", "Rain", "black"),
    ("triangle", "Pressure & Temperature", "red"),
    ("triangle", "UK cities", "blue"),
    ("triangle", "Greetings/Feelings", "green"),
    ("triangle", "Leisure", "orange"),
    ("triangle", "Humidity/Rain", "black"),
]

# NOTE(review): the x value is the number of tokens in `all_words`, while the
# title and axis label say "Total tweets" -- confirm which one is intended.
total_words = len(all_words)
for topic_idx, topic in enumerate(nmf.components_):
    # Indices of the `no_top_words` highest-weighted terms, best first.
    top_terms = [tfidf_feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    topic_list.append(top_terms)
    # Terms missing from the raw word count contribute zero.
    topic_word_count = sum(full_wordcount.get(term, 0) for term in top_terms)
    if topic_idx >= len(topic_styles):
        # No style defined for this topic: draw nothing.
        continue
    marker, label, color = topic_styles[topic_idx]
    draw_marker = getattr(p, marker)
    draw_marker(total_words, topic_word_count, size=15, legend=label,
                line_color=color, fill_color=color, fill_alpha=0.5)

p.xaxis.axis_label = "Total tweets"
p.xaxis.axis_label_text_font_style = "italic"

p.yaxis.axis_label = "Total topic count"
p.yaxis.axis_label_text_font_style = "italic"
show(p)

# Cluster Analysis
Interesting results! A lot of talk about weather/temperature/rain/wind etc. Although UK *is* known for its bad and rainy weather, it still seems a bit unusual that quite a few, especially top topics, are about this. Maybe there were heavy rains and/or weather fluctuations at that time, and people took to Twitter to moan?

After doing manual analysis, it appears the reality isn't as amusing - Quite a few tweets came from accounts dedicated to weather reports and updates.

# b) Interactive topic exploration

Here, we manipulate the number of words chosen in a topic while modeling. We can do this by using an interactive slider that changes the number of words per topic, for every topic.

In [23]:
#Prepare figure
# Draws one bar per NMF topic; the bar height is the summed corpus count of the
# topic's 15 highest-weighted terms.
topic_list = []
p = figure(plot_width=600, plot_height=550, title="Topics clustering: Topic occurence vs Total Tweets", tools = [PanTool(), BoxZoomTool(), ResetTool(), LassoSelectTool(), WheelZoomTool()])

for topic_idx, topic in enumerate(nmf.components_):
    # Indices of the 15 highest-weighted terms, best first.
    top_terms = [tfidf_feature_names[i] for i in topic.argsort()[:-15 - 1:-1]]
    topic_list.append(top_terms)
    # Terms missing from the raw word count contribute zero.
    topic_word_count = sum(full_wordcount.get(term, 0) for term in top_terms)
    mybar = p.vbar(x=[topic_idx], width=0.2, bottom=0, top=[topic_word_count], color="firebrick")

p.xaxis.axis_label = 'Topic ID'
p.yaxis.axis_label = 'Topic Word Count'

In [24]:
#Method to take in custom number of words and redraw bar graphs
from bokeh.io import push_notebook, show
def topic_modeling(number_of_words):
    """Redraw the per-topic bar chart using the top `number_of_words` terms.

    For every NMF topic the corpus counts of its highest-weighted terms are
    summed, the bars already on the figure `p` are removed, and one fresh set
    of bars is drawn before the change is pushed to the notebook.

    Args:
        number_of_words: how many top terms per topic to include; converted
            with int() because the value arrives from the slider callback.
    """
    from bokeh.models import GlyphRenderer

    number_of_words = int(number_of_words)
    topic_counts = []
    for topic in nmf.components_:
        top_indices = topic.argsort()[:-number_of_words - 1:-1]
        # Terms missing from the raw word count contribute zero.
        topic_counts.append(
            sum(full_wordcount.get(tfidf_feature_names[i], 0) for i in top_indices)
        )

    # Drop the bars from earlier draws; otherwise every call stacks new bars on
    # top of the old ones and a lower count stays hidden behind a taller bar.
    p.renderers = [r for r in p.renderers if not isinstance(r, GlyphRenderer)]
    p.vbar(x=list(range(len(topic_counts))), width=0.2, bottom=0,
           top=topic_counts, color="firebrick")
    push_notebook(handle=bokeh_handle)

In [25]:
#Callback
# Browser-side hook: when the slider value changes and a notebook kernel is
# available, ask the kernel to execute topic_modeling(<slider value>).
callback = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
    var kernel = IPython.notebook.kernel;
    cmd = "topic_modeling(" + cb_obj.value + ")";
    kernel.execute(cmd, {}, {});
}
""")
    
#Slider to change the number of words used per topic (5 to 30, starting at 15)
slider = Slider(title="Number of words in a topic", value=15, start=5, end=30, step=1, width=200,
                callback=callback)

# Keep the notebook handle so topic_modeling() can push updates to this plot.
# NOTE(review): `bokeh_handle` is assigned again further down for the clustering
# plot; once that cell has run, topic_modeling() pushes to that later handle.
bokeh_handle = show(column(slider, p), notebook_handle=True)

# 2.2: Clustering using Bag of Words: K-Means
Here, we will shake up the words to lose their order using the bag of words approach. After that we can perform TruncatedSVD dimensionality reduction to visualize it in 2D.

Interaction: Slider to change the number of clusters.

In [26]:
#assign a color to each label
def get_colors(labels):
    """Translate cluster labels into colour names.

    Labels equal to 0 through 9 (as int or float) are mapped onto a fixed
    ten-colour palette, in input order. Any other label adds nothing to the
    result, so the output can be shorter than the input.
    """
    palette = ('red', '#CAB2D6', 'blue', 'green', 'black',
               'brown', 'gray', 'navy', 'orange', 'olive')
    colors = []
    for label in labels:
        # Compare against each palette position, just like a chain of equality tests.
        colors.extend(shade for code, shade in enumerate(palette) if label == code)
    return colors

In [27]:
# Read the 'Stream' sheet (no rows skipped here) and keep only the
# 'Tweet content' column as a plain Python list.
df2 = pd.read_excel('uk_tweets.xlsx', sheetname = 'Stream')['Tweet content'].tolist()

In [28]:
#Prepare data for pca and clustering
def create_pipeline():
    """Return the TF-IDF matrix of the tweets in `df2`, with English stop words removed."""
    return TfidfVectorizer(stop_words='english').fit_transform(df2)

#PCA Dimensionality Reduction
def do_pca(X):
    """Reduce `X` to two dimensions with a TruncatedSVD fitted on `X` itself.

    Returns the transformed data (two components per sample).
    """
    reducer = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    reducer.fit(X)
    return reducer.transform(X)

#Cluster
def clustering(number_of_clusters):
    """Build an unfitted KMeans estimator with `number_of_clusters` clusters."""
    return cluster.KMeans(n_clusters=number_of_clusters)

#Initial, default clustering
# Vectorise the tweets, project them to 2-D for display, and cluster the full
# TF-IDF matrix (not the 2-D projection) into two groups.
data_for_kmeans = create_pipeline()
data2D = do_pca(data_for_kmeans)
clf = clustering(2)
clf.fit(data_for_kmeans)
labels = clf.labels_
colors = get_colors(labels)

#Initial, default plot
# The data source carries the 2-D coordinates and a per-point colour column.
source = ColumnDataSource(data={'x': data2D[:,0], 'y': data2D[:,1], 'colors': colors})
plot = figure(width=550, height=550, title='Clustering By Tweets', tools = [PanTool(), BoxZoomTool(), ResetTool(), LassoSelectTool(), WheelZoomTool()])
plot.circle('x','y', fill_color='colors', line_color='colors', source=source)

def update_clusters(value):
    """Re-cluster the tweets into `value` groups and recolour the scatter plot.

    The TF-IDF matrix (`data_for_kmeans`) and its 2-D projection (`data2D`)
    do not depend on the number of clusters, so the module-level results of
    the initial run are reused instead of being recomputed on every slider
    move. Only KMeans is refitted.

    Args:
        value: requested number of clusters; converted with int() because it
            arrives from the slider callback.
    """
    clf = clustering(int(value))
    clf.fit(data_for_kmeans)
    colors = get_colors(clf.labels_)
    # Replacing the whole data dict keeps the three columns the same length.
    source.data = dict(x=data2D[:,0], y=data2D[:,1], colors=colors)
    push_notebook(handle=bokeh_handle)


In [32]:
# Browser-side hook: when the slider value changes and a notebook kernel is
# available, ask the kernel to execute update_clusters(<slider value>).
callback = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
    var kernel = IPython.notebook.kernel;
    cmd = "update_clusters(" + cb_obj.value + ")";
    kernel.execute(cmd, {}, {});
}
""")
    
#Slider to change clusters (2 to 10, starting at 2)
slider = Slider(title="Number of clusters", value=2, start=2, end=10, step=1, width=200,
                callback=callback)

# Keep the notebook handle so update_clusters() can push updates to this plot.
# NOTE(review): this rebinds `callback`, `slider` and `bokeh_handle`, which the
# topic-exploration cells above also assign; topic_modeling() reads the same
# global `bokeh_handle`.
bokeh_handle = show(column(slider, plot), notebook_handle=True)

# Cluster Analysis
As expected, when the number of clusters is small, the points are divided into bigger clusters. As we interact with the slider and increase the number of clusters, the same number of points gets divided among the selected clusters, with each cluster being smaller than before. There's some overlap between the clusters, highlighting the weaknesses of a basic K-Means clustering algorithm, and the difficulty of clear-cut separation when it comes to words that belong in different clusters but might be slightly related, and vice-versa.