## Exercise 3: Textual Data Analysis
##### Jade Watson: 20052115

### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

datasetPath = r"C:/Users/Jade Watson/Documents/CMPE 351/Exercise3//"
print(os.listdir(datasetPath))

['.idea', 'CISC_CMPE351_W22_E3.pdf', 'corpus.pkl', 'dictionary.gensim', 'Exercise3.ipynb', 'JadeWatsonExercise3.ipynb', 'model1.gensim', 'model1.gensim.expElogbeta.npy', 'model1.gensim.id2word', 'model1.gensim.state', 'model10.gensim', 'model10.gensim.expElogbeta.npy', 'model10.gensim.id2word', 'model10.gensim.state', 'model5.gensim', 'model5.gensim.expElogbeta.npy', 'model5.gensim.id2word', 'model5.gensim.state', 'Questions.csv', 'Questions.csv.zip', 'run_word2vec.ipynb', 'w2v_model.txt']


In [2]:
# Style Dataframe
df = pd.read_csv(datasetPath + "Questions.csv")
df = df.sample(frac=1).reset_index(drop=True)
df.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,25063739,1980846.0,2014-07-31T16:00:55Z,2,R tcl tk: How do I pass a variable to a button...,<p>How would I pass the value of <code>num</co...
1,35306295,996366.0,2016-02-10T03:26:06Z,4,How to stop running shiny app by closing the b...,<p>I have deployed a app in <strong>shinyapps....
2,24127787,3723583.0,2014-06-09T19:46:05Z,0,"Installation error ""no package specified"" - tr...","<p><strong>When I tried git-hub install, using..."
3,40260119,1901071.0,2016-10-26T10:42:00Z,0,Building Sentences from a dataframe in R,<p>Im trying to generate sentences from a data...
4,16783551,2426904.0,2013-05-28T03:46:13Z,1,Downloading multiple file as parallel in R,"<p>I am trying to download 460,000 files from ..."
5,36937359,4701887.0,2016-04-29T11:37:04Z,2,How to extract the lower triangle of a Distanc...,<p>I have a distance matrix called <code>mydis...
6,44098910,5472628.0,2017-05-21T15:58:18Z,0,What is an appropriate approach to study seque...,"<p>I have a data set with customer ID, event_d..."
7,45936144,1819625.0,2017-08-29T10:00:31Z,2,Transition plot with time on x axis,<p>I have a transition matrix as following:</p...
8,24425588,2849053.0,2014-06-26T08:16:28Z,1,Attribute information gain,<p>I am an R user and I am interested in findi...
9,36957580,6275599.0,2016-04-30T16:49:21Z,1,R Sum columns by index,<p><p>I need to find a way to sum columns by t...


#### Cleaning
##### Lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue.

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize

df['Title'] =df['Title'].str.replace("[^a-zA-Z#]", " ")
stopwords_list = stopwords.words('english')
punctuations = list(set(string.punctuation))

def clean_text_initial(text):
    text = ' '.join([x.lower() for x in word_tokenize(text) if x.lower() not in stopwords_list and len(x)>1])
    text = ' '.join([x.lower() for x in word_tokenize(text) if x.lower() not in punctuations and len(x)>3])
    return text.strip()

df["clean_text"]=df.Title.apply(lambda text:clean_text_initial(str(text)))
df.head()

[nltk_data] Downloading package stopwords to C:\Users\Jade
[nltk_data]     Watson\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jade Watson\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  df['Title'] =df['Title'].str.replace("[^a-zA-Z#]", " ")


KeyboardInterrupt: 

I decided to lemmanize the titles instaed of stemming them directly because it stems the word but ensures that it does not loose its meaning. Lemmatization has a pre-defined dictionary that stores the context of words and checks the word in the dictionary.

In [None]:
from gensim.corpora.dictionary import Dictionary

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

cleaned_text_list=df.clean_text.apply(lambda clean_text:[lemmatizer.lemmatize(tokenized_text) for tokenized_text in word_tokenize(clean_text)])

gensim_dict=Dictionary(cleaned_text_list)

doc_term_matrix = [gensim_dict.doc2bow(text) for text in cleaned_text_list]

In [None]:
print(df.clean_text)

### Bigrams
##### Used to detect common phrases from a list of sentances

In [None]:
from gensim.models.phrases import Phrases, Phraser

# make list a series
clean_text_series = pd.Series(cleaned_text_list)

sent = [row.split() for row in df.clean_text]
# create the relevant phrases from the list of sentances
phrases = Phrases(sent,min_count=30,progress_per=10000)
# use phraser() to cut down memory consumption of phrases() by discarding model state not needed for bigram detection task
bigram = Phraser(phrases)
# transform the corpus based on the bigrams detected:
sentences = bigram[sent]

### Most Frequent Words
##### Used to check the effectiveness of the lemmatization and addition of bigrams

In [None]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

### Training the Model

##### i) Word2Vec(): set up parameters of the model


In [None]:
import multiprocessing
from gensim.models import Word2Vec
# count the number of cores in a computer
cores = multiprocessing.cpu_count()

In [None]:
w2v_model = Word2Vec(min_count=25,window=2,size=300,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=20,workers=cores-1)

##### ii) Build Vocabulary Table: build vocab from a sequence of sentences to initialize the model

In [None]:
# build the vocabulary table
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab table: {} mins'.format(round((time()-t)/60,2)))

##### iii) Train the model

In [None]:
t = time()
# where total_examples is the count of sentences
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# make model more memory-efficient
w2v_model.init_sims(replace=True)

In [None]:
learned_words = list(w2v_model.wv.vocab)
print(learned_words)

In [None]:
# save model
w2v_model.wv.save_word2vec_format('w2v_model.txt')

### Exploring the Model

In [None]:
w2v_model.wv.most_similar(positive=['plots'])

In [None]:
w2v_model.wv.similarity('plot','answer')

### Topic Modelling

In [None]:
from gensim import corpora

dictionary = corpora.Dictionary(sentences)
corpus = [dictionary.doc2bow(text) for text in sentences]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [None]:
import gensim

NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

In [None]:
# 10 topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

### Part 5 Summary

According to our notes, calculating the perplexity is how one chooses the number of topics. To calculate perplexity the inverse log-likelihood of the dictionary above is taken. We wish to have models with lower perplexity because it suggests less uncertainties about the unobserved document. I tested this theory by varying the number of topics. I first tested with 5 topics. It is evident that topic 1 contains words like function, using, loop, and error. Topics 1-4 are printed above as well. Next I tested with 10 topics. Topic 6 contains words like data, table, extract, and remove. The weights in front of each words reflects how important the keyword is to the topic. So the higher the weight the more it contributes to the topic. In the case above, the preferred number of topics is 5. Another approach to finding the optimal number of topics is to build many LDA models with different topic number and pick one that gives the highest coherence value. To do this, you can use the CoherenceModel(). I did not do this because the runtime to build each LDA model is long.