In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing
import re

In [2]:
# import needed libraries
import gensim
import nltk
import sklearn
import sklearn.feature_extraction.text

In [3]:
# Move the working directory to data/preprocessed, located two levels above
# the current working directory, so the bare file names used for reading and
# writing below resolve there.
preprocessed_dir = os.path.join(os.getcwd(), "..", "..", "data", "preprocessed")
os.chdir(preprocessed_dir)

In [4]:
# Load the pre-processed tweets from the working directory.
# The file is decoded as ISO-8859-1.
input_csv = "2.1-sh-data-preprocessed.csv"
df = pd.read_csv(input_csv, encoding="ISO-8859-1")

In [5]:
# check the data: preview the first rows of the loaded frame
df.head()

Unnamed: 0,tweet,cleaned_tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,dead need upgrade,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,likely design also give free,2
2,@swonderlin Can not wait for #iPad 2 also. The...,wait also sale,2
3,@sxsw I hope this year's festival isn't as cra...,hope year festival year,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,great stuff tech,2


In [6]:
# Column of cleaned tweet text that the vectorizer is fitted on below.
cleaned_tweet = df.loc[:, "cleaned_tweet"]

In [7]:
# Length, in NLTK word tokens, of the longest cleaned tweet.
# NOTE(review): this value is passed to CountVectorizer as `max_features`
# further down, i.e. a per-tweet length is used as a vocabulary-size cap --
# confirm that this is intended.
tokens_per_tweet = cleaned_tweet.apply(lambda text: len(nltk.word_tokenize(text)))
num_features = max(tokens_per_tweet)

In [8]:
# num_features is the largest per-tweet token count computed in the previous cell
print("Total number of text features: ", num_features)

Total number of text features:  14


## Model Representation

In [9]:
# Term-count vectorizer over uni-, bi- and tri-grams.
#   min_df=0.02   -> ignore terms that occur in fewer than 2% of the tweets
#   max_features  -> keep at most `num_features` terms, ranked by corpus frequency
#   ngram_range   -> must be a (min_n, max_n) tuple; recent scikit-learn
#                    releases reject a list during parameter validation
tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    min_df=0.02,
    max_features=num_features,
    ngram_range=(1, 3),
)

In [10]:
# learn the vocabulary from the cleaned tweets and build the document-term count matrix
dtm_tf = tf_vectorizer.fit_transform(cleaned_tweet)

In [11]:
# (rows, columns) of the document-term matrix: one row per tweet, one column per kept term
dtm_tf.shape

(8555, 14)

In [12]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when it exists and fall back
# to the old accessor on older installs. Either way `tf_features` ends up as a
# plain list of str, exactly as before.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_features = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_features = tf_vectorizer.get_feature_names()

In [13]:
# show the vocabulary terms kept by the vectorizer
tf_features

['apple',
 'called',
 'free',
 'go',
 'line',
 'open',
 'party',
 'see',
 'social',
 'temporary',
 'time',
 'today',
 'use',
 'win']

In [14]:
# Split every cleaned tweet on whitespace and keep only the tokens that are
# part of the vectorizer vocabulary (token order and repeats are preserved).
vocabulary = set(tf_features)
processed_tweets = df["cleaned_tweet"].apply(
    lambda tweet: [token for token in tweet.split() if token in vocabulary]
)

### Bag of Words (BOW)

In [15]:
# inspired by https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
# build the gensim token <-> integer-id dictionary from the filtered token lists
dictionary = gensim.corpora.Dictionary(documents=processed_tweets)

In [16]:
# number of distinct tokens in the dictionary before filtering
len(dictionary)

14

In [17]:
# Prune the dictionary in place: drop tokens that appear in fewer than 20
# tweets, then keep at most the 1000 most frequent of the remainder.
# NOTE(review): `no_above` is left at the library default, which also removes
# very common tokens -- confirm that is wanted.
dictionary.filter_extremes(keep_n=1000, no_below=20)

In [18]:
# number of distinct tokens left after filter_extremes
len(dictionary)

14

In [19]:
# turn each tweet's token list into its bag-of-words form via dictionary.doc2bow:
# one entry per tweet, in the same order as processed_tweets
bow_corpus = list(map(dictionary.doc2bow, processed_tweets))

### Run LDA with BOW (TF)

In [20]:
# Fit the LDA topic model on the bag-of-words corpus.
# NOTE(review): 15 topics are requested although the dictionary printed above
# holds only 14 tokens -- confirm the topic count is intentional.
lda_params = dict(
    num_topics=15,
    id2word=dictionary,
    alpha='asymmetric',
    eta='auto',
    passes=20,
    iterations=500,
    random_state=123,  # fixed seed so the fitted topics are repeatable
)
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus, **lda_params)

In [21]:
# Collect the top 10 words of every topic, keyed by topic index.
# The words are read from the structured (word, weight) pairs returned by
# show_topics(formatted=False) instead of regex-parsing the formatted topic
# string: the old pattern lived in a non-raw string with an invalid "\w"
# escape (a warning on current Python) and would silently skip any token
# containing a non-word character.
topics = {
    index: [word for word, _weight in terms]
    for index, terms in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False)
}

# print out each topic
for index, words in topics.items():
    print('Topic: {} \n{}'.format(index, words))
    print("")

Topic: 0 
['go', 'see', 'use', 'apple', 'line', 'today', 'time', 'party', 'called', 'social']

Topic: 1 
['open', 'temporary', 'apple', 'win', 'free', 'line', 'social', 'go', 'today', 'party']

Topic: 2 
['free', 'called', 'party', 'go', 'see', 'line', 'apple', 'social', 'win', 'time']

Topic: 3 
['win', 'free', 'open', 'temporary', 'go', 'apple', 'party', 'time', 'see', 'use']

Topic: 4 
['party', 'go', 'time', 'open', 'line', 'apple', 'free', 'called', 'social', 'today']

Topic: 5 
['apple', 'line', 'party', 'win', 'see', 'use', 'called', 'go', 'open', 'today']

Topic: 6 
['today', 'win', 'go', 'see', 'line', 'apple', 'free', 'open', 'called', 'party']

Topic: 7 
['social', 'today', 'free', 'time', 'use', 'called', 'see', 'go', 'party', 'open']

Topic: 8 
['time', 'win', 'use', 'free', 'open', 'apple', 'social', 'go', 'see', 'line']

Topic: 9 
['use', 'called', 'open', 'today', 'social', 'apple', 'free', 'line', 'party', 'win']

Topic: 10 
['use', 'called', 'open', 'today', 'social',

In [22]:
# count in how many topics' top-word lists each word appears
word_dict = {}
for words in topics.values():
    for word in words:
        word_dict[word] = word_dict.get(word, 0) + 1

# show the words ordered from least to most frequent across topics
dict(sorted(word_dict.items(), key=lambda item: item[1]))

{'temporary': 3,
 'time': 7,
 'see': 9,
 'called': 10,
 'go': 11,
 'use': 11,
 'today': 11,
 'win': 11,
 'party': 12,
 'social': 12,
 'line': 13,
 'open': 13,
 'free': 13,
 'apple': 14}

### Evaluate Model

In [23]:
# select a random line to evaluate
# random.randint() includes BOTH endpoints, so randint(0, len(...)) could
# return len(processed_tweets) itself -- one past the last row -- and the
# lookups below would fail. randrange() excludes the stop value.
line = random.randrange(len(processed_tweets))
desc = processed_tweets[line]
print("instance number:", line)
print("")
print("entry real:", df.loc[line, "tweet"])
print("")
print("entry cleaned:", df.loc[line, "cleaned_tweet"])

instance number: 5197

entry real: RT @mention Bad news update: the pop-up Apple Store is out of iPads! Not sure if they will have more by tomorrow. #SXSW

entry cleaned: news update sure tomorrow


In [24]:
# topic distribution of the sampled tweet, printed from highest to lowest score
doc_topics = lda_model[bow_corpus[line]]
for index, score in sorted(doc_topics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nIndex: {} \nTopic: {}".format(score*100, index, topics[index]))


Score: 15.263602137565613	 
Index: 0 
Topic: ['go', 'see', 'use', 'apple', 'line', 'today', 'time', 'party', 'called', 'social']

Score: 12.131310999393463	 
Index: 1 
Topic: ['open', 'temporary', 'apple', 'win', 'free', 'line', 'social', 'go', 'today', 'party']

Score: 10.065697878599167	 
Index: 2 
Topic: ['free', 'called', 'party', 'go', 'see', 'line', 'apple', 'social', 'win', 'time']

Score: 8.601166307926178	 
Index: 3 
Topic: ['win', 'free', 'open', 'temporary', 'go', 'apple', 'party', 'time', 'see', 'use']

Score: 7.508675009012222	 
Index: 4 
Topic: ['party', 'go', 'time', 'open', 'line', 'apple', 'free', 'called', 'social', 'today']

Score: 6.662435084581375	 
Index: 5 
Topic: ['apple', 'line', 'party', 'win', 'see', 'use', 'called', 'go', 'open', 'today']

Score: 5.9876203536987305	 
Index: 6 
Topic: ['today', 'win', 'go', 'see', 'line', 'apple', 'free', 'open', 'called', 'party']

Score: 5.436932295560837	 
Index: 7 
Topic: ['social', 'today', 'free', 'time', 'use', 'calle

### Save the bag-of-words features with the labels for modelling

In [25]:
# convert the document-term matrix into a dense 2-D NumPy array
dtm_tf_arr = dtm_tf.toarray()

In [26]:
# inspect the dense count matrix
dtm_tf_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
# Drop the raw and cleaned text so only the label column is left.
# errors="ignore" keeps the cell idempotent: `df` is overwritten here, so a
# second run would otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['tweet', 'cleaned_tweet'], errors="ignore")

In [28]:
# preview the frame after the text columns were dropped
df.head()

Unnamed: 0,emotion
0,0
1,2
2,2
3,0
4,2


In [29]:
# Wrap the dense count matrix in a DataFrame. The 2-D array is passed
# directly: mapping np.ravel over its rows and rebuilding a list only
# re-created the same rows one by one. Column labels stay the positional
# integers 0..n_terms-1, so the saved CSV layout is unchanged.
df_bow = pd.DataFrame(dtm_tf_arr)

In [30]:
# preview the bag-of-words frame; columns are positional term indices
df_bow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# add the bow columns to the dataframe: label column first, then one count
# column per vocabulary term, aligned on the row index
new_df = pd.concat((df, df_bow), axis="columns")
new_df.head()

Unnamed: 0,emotion,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# save the new df for further use (written to the working directory, without the index column)
output_csv = "2.2-sh-data-preprocessed.csv"
new_df.to_csv(output_csv, index=False)