In [None]:
import ast
import pandas as pd
import numpy as np
import time
import os, sys, glob
import re
from ast import literal_eval
from datetime import datetime
import tensorflow as tf
import matplotlib.pyplot as plt
% matplotlib inline
from IPython.display import display

pd.set_option('display.max_colwidth', -1)
from collections import Counter
from gensim.corpora.dictionary import Dictionary
import gensim
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import warnings
warnings.filterwarnings('ignore')

# Read data from Google Cloud Storage

In [None]:
#read files
def read_gs(path, id_name):
    """Load the first CSV file matching a glob pattern into a DataFrame.

    Parameters
    ----------
    path : str
        Glob pattern passed to ``tf.gfile.Glob``.
    id_name : str
        New name for the ``original_id`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed file; ``original_id`` is read as ``str`` and renamed
        to ``id_name``.

    Raises
    ------
    IndexError
        If no file matches ``path``.
    """
    # Only the first match is read; further matches are ignored.
    dev_file = tf.gfile.Glob(path)[0]
    # The context manager closes the handle once parsing has finished.
    with tf.gfile.Open(dev_file, mode='rb') as handle:
        df = pd.read_csv(handle,
                         escapechar='\\',
                         error_bad_lines=False,
                         header=0,
                         dtype={'original_id': str})
    return df.rename(columns={'original_id': id_name})

# Folder holding the shop export. The trailing slash is stripped before the
# glob suffix is appended so the pattern has no empty '//' path segment.
path = 'gs://directory/'
shop_dir = path.rstrip('/') + '/*.csv'
shopDf = read_gs(shop_dir, "shop_id")

# Data Preprocessing

In [None]:
# Drop shops whose Thai tags are the placeholder string "['null']".
# .copy() makes an independent frame so the column assignments below do not
# write into a slice of the loaded data (pandas SettingWithCopyWarning).
shopDf = shopDf[shopDf['tags_th'] != "['null']"].copy()
# Fall back to the English value where the Thai one is empty / missing.
shopDf['tags_th'] = np.where(shopDf['tags_th'] == '[]', shopDf['tags_en'], shopDf['tags_th'])
shopDf['title_th'] = np.where(shopDf['title_th'].isnull(), shopDf['title_en'], shopDf['title_th'])

tags_en = shopDf['tags_en']
tags_th = shopDf['tags_th']

def literal(i):
    """Parse a stringified list of single-quoted tags into a list of str.

    Input of the form ``['a','b']`` is split on the ``','`` separators
    (whitespace around the comma is tolerated), so tags may contain
    apostrophes or double quotes. Tag text is taken verbatim; escape
    sequences are not interpreted. Anything not of that form, such as
    ``[]``, is handed to ``ast.literal_eval``.

    Parameters
    ----------
    i : str
        Text representation of a tag list.

    Returns
    -------
    list
        The tags in their original order.
    """
    text = i.strip()
    # len >= 4 guarantees the opening "['" and closing "']" do not overlap.
    if len(text) >= 4 and text.startswith("['") and text.endswith("']"):
        return re.split(r"'\s*,\s*'", text[2:-2])
    return literal_eval(text)

# Turn every stringified tag list into a real Python list, one per shop.
tags = tags_th.map(literal)
print(tags.shape[0])

In [None]:
def doc_lens(doc):
    """Print mean / min / max document length and return all lengths.

    Parameters
    ----------
    doc : iterable of sized
        Documents; ``len`` is taken of each element.

    Returns
    -------
    numpy.ndarray
        One length per document, in input order.
    """
    document_lengths = np.array([len(entry) for entry in doc])

    average = np.mean(document_lengths)
    print("The average number of words in a document is: {}.".format(average))
    shortest = document_lengths.min()
    print("The minimum number of words in a document is: {}.".format(shortest))
    longest = document_lengths.max()
    print("The maximum number of words in a document is: {}.".format(longest))

    return document_lengths

In [None]:
# Length statistics on the raw tag lists (before tokenization).
shorten_length = 30
document_lengths = doc_lens(tags)
over_limit = document_lengths > shorten_length
print("There are {} documents with over {} words.".format(over_limit.sum(), shorten_length))
# Lengths of the documents at or below the cut-off.
shorter_documents = document_lengths[~over_limit]

### Remove the trailing "s" from words that end with s
#### Stemming and lemmatization are not applied because they would affect shop names

In [None]:
def filter_word(documents):
    """Lower-case every word and strip one trailing "s", in place.

    A final "s" is removed only when the character before it is neither
    "s" nor whitespace, so "cats" becomes "cat" while "glass" and a lone
    "s" are left alone.

    Parameters
    ----------
    documents : iterable of list of str
        Word lists; each list is modified in place.

    Returns
    -------
    The same ``documents`` object that was passed in.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # Compiled once here instead of once per word.
    trailing_s = re.compile(r"([^s\s])s$")
    for words in documents:
        for pos, item in enumerate(words):
            item = item.lower()
            if trailing_s.search(item) is not None:
                item = item[:-1]
            words[pos] = item

    return documents

# Normalise the tag lists in place: lower-case and drop a trailing "s".
tags = filter_word(tags)

### Tokenize the text using nltk's word tokenize

In [None]:
import nltk
nltk.download('punkt')

# One token list per shop. Each tag is run through nltk's word tokenizer
# (which splits off punctuation other than periods) and every resulting
# token is lower-cased.
documents = [
    [token.lower() for tag_text in tag_list for token in nltk.word_tokenize(tag_text)]
    for tag_list in tags
]

In [None]:
# Recompute the length statistics, this time on the tokenized documents.
document_lengths = doc_lens(documents)

In [None]:
import seaborn as sns

# Histogram of tokens per document, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 6))
ax.set(xlabel="Number of words")
ax.set_title("Distribution of number of words", fontsize=16)
sns.distplot(document_lengths, ax=ax, bins=50);

In [None]:
# Count the tokenized documents that are longer than the cut-off.
exclude_len = 30
n_long = (document_lengths > exclude_len).sum()
print("There are {} documents with over {} words.".format(n_long, exclude_len))

In [None]:
import seaborn as sns

# NOTE(review): `shorter_documents` was computed from the tag lists before
# tokenization, not from the tokenized `document_lengths` -- confirm that
# this is the intended input for this figure.
fig, ax = plt.subplots(figsize=(15, 6))
ax.set(xlabel="Number of words")
ax.set_title("Distribution of number of words", fontsize=16)
sns.distplot(shorter_documents, ax=ax);

# Show two-word document
The shorter documents will probably be harder to classify since we'll have fewer words to cling to. LDA, for example, tries to find topics in documents, but if the documents are very short, it may struggle to find a topic at all, e.g. in a two-word document.

In [None]:
# Show every tokenized document that holds at most two tokens.
[doc for doc in documents if len(doc) <= 2]

## Plot total number of words

In [None]:
# One flat list with every token occurrence. A comprehension is used because
# no `flatten` helper is defined or imported anywhere in this notebook.
allword = [token for doc in documents for token in doc]
print(len(allword))
print(len(set(allword)))

word = 300  # how many words to take from each end of the frequency ranking
word_counts = Counter(allword)  # counted once, reused for both rankings
most_common = word_counts.most_common(word)
# Slice up to the end of the ranking so the least frequent word is included.
least_common = word_counts.most_common()[-word:]
x = [token for token, _ in most_common]
y = [count for _, count in most_common]

fig, ax = plt.subplots(figsize=(15,6))
ax.set_title("Distribution of word frequency", fontsize=16)
ax.set_xlabel("word")
# Keyword arguments: newer seaborn no longer accepts x / y positionally.
sns.barplot(x=x, y=y, ax=ax)

In [None]:
# Peek at the five most frequent (word, count) pairs.
most_common[:5]

# Based on the chart above, keep only words that occur at least 3 times

In [None]:
from collections import defaultdict

# Corpus-wide occurrence count of every token.
frequency = defaultdict(int)
for text in documents:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur at least 3 times overall.
documents1 = [[token for token in text if frequency[token] >= 3 ] for text in documents]
# Flattened with a comprehension; no `flatten` helper exists in this notebook.
flat_doc1 = [token for text in documents1 for token in text]
print ('Lenghts of total word {}'.format(len(flat_doc1)))
print ('Lengths unique word {}'.format(len(set(flat_doc1))))

# Remove words shorter than 3 characters

In [None]:
# Two-character words that are exempt from the minimum-length filter.
keep_short = {"ยา", "ชา"}

# Keep words of at least 3 characters, plus the exempt short words.
# (No nltk call is needed here, so the import / download was dropped.)
documents_cha = [
    [word for word in tag if len(word) >= 3 or word in keep_short]
    for tag in documents
]

In [None]:
from collections import defaultdict

# Occurrence count of every token that survived the length filter.
frequency = defaultdict(int)
for text in documents_cha:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur at least 3 times overall.
documents_freq = [[token for token in text if frequency[token] >= 3] for text in documents_cha]

# Flattened with a comprehension; no `flatten` helper exists in this notebook.
flat_doc1 = [token for text in documents_freq for token in text]
print ('Lenghts of total word {}'.format(len(flat_doc1)))
print ('Lengths unique word {}'.format(len(set(flat_doc1))))

Remove unwanted words contained in the documents: the names of Thai provinces, districts, and roads, in both Thai and English.

In [None]:
# Load the saved name arrays and merge them into one exclusion list.
province = np.load('province.npy')
print(len(province))
district, road = np.load('district.npy'), np.load('road.npy')
exclude_word = [*province, *district, *road]

## Create Dictionary and Corpus needed for LDA -Topic Modeling

Dictionary is a unique id for each word in the document.

Corpus is a mapping of (word_id, word_frequency).

Remove the excluded words from the dictionary.

In [None]:
#Create Dictionary
dictionary = Dictionary.from_documents(documents_freq)
# Drop words that appear in fewer than 3 documents or in more than 90% of them.
dictionary.filter_extremes(no_below=3, no_above=0.9)
dictionary.compactify()
# Remove the excluded words; a set keeps each membership test O(1).
exclude_set = set(exclude_word)
del_ids = [token_id for token_id, token in dictionary.items() if token in exclude_set]
dictionary.filter_tokens(bad_ids=del_ids)
dictionary.compactify()
# Bag-of-words per document, built from the tokenized `documents`.
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Visualize the cleansed words frequencies

In [None]:
# One row per dictionary token (index = token) with its document frequency,
# sorted from most to least frequent.
cleansed_words_df = (
    pd.DataFrame.from_dict(dictionary.token2id, orient='index')
    .rename(columns={0: 'id'})
    .assign(count=lambda frame: [dictionary.dfs.get(token_id) for token_id in frame.id])
    .drop(['id'], axis=1)
    .sort_values('count', ascending=False)
)

In [None]:
def word_frequency_barplot(df, nr_top_words=50):
    """Bar plot of the first ``nr_top_words`` values of ``df['count']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a column named ``count``; the index supplies the tick
        labels. Rows are plotted in the order given.
    nr_top_words : int, default 50
        Maximum number of rows to plot. Fewer bars are drawn when ``df``
        has fewer rows.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the plot.
    """
    # Slice first so positions, heights and labels always have equal length.
    top = df['count'].iloc[:nr_top_words]
    positions = list(range(len(top)))

    fig, ax = plt.subplots(1,1,figsize=(20,5))
    # Keyword arguments: newer seaborn no longer accepts x / y positionally.
    sns.barplot(x=positions, y=top.values, palette='hls', ax=ax)

    ax.set_xticks(positions)
    ax.set_xticklabels(top.index, fontsize=14, rotation=90)
    return ax

In [None]:
# Preview the first rows of the frame (sorted by 'count', descending).
cleansed_words_df.head()

In [None]:
import warnings

# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

ax = word_frequency_barplot(cleansed_words_df)
ax.set_title(
    "Document Frequencies (Number of documents a word appears in)",
    fontsize=16,
);

## Tf-idf score
Tf-idf reflects how important a word is to a document in a collection or corpus.

The higher the Tf-idf score (weight), the rarer the term and vice versa.

In [None]:
tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
low_value = 0.1
# Ids of tokens whose tf-idf weight is below `low_value` in at least one
# document. Collected in a set so each id appears once.
low_value_words = sorted({
    token_id
    for bow in corpus
    for token_id, value in tfidf_model[bow]
    if value < low_value
})

# Drop those low-weight tokens from the dictionary and rebuild the corpus.
# NOTE: this mutates `dictionary` and rebinds `corpus`, so re-running the
# cell filters the already-filtered vocabulary again.
dictionary.filter_tokens(bad_ids=low_value_words)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Building LDA Model
In addition to the corpus and dictionary, you need to provide the number of topics as well.
An LDA model is built with a fixed number of topics (for example, 5), where each topic is a combination of keywords and each keyword contributes a certain weight to the topic.

The weights reflect how important a keyword is to that topic.

# Find the optimal number of topics for LDA
Build many LDA models with different values of number of topics and pick the one that gives the highest coherence value.

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : Smallest number of topics to try
    step : Increment between successive numbers of topics
    random_state : Seed passed to every LdaModel; None (the default) leaves
        training unseeded, so results differ between runs

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                random_state=random_state)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Plot coherence score to choose the best number of topics

In [None]:
start=2
limit=20
step=3
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, 
                                                        texts=documents, start=start, limit=limit, step=step)
print ('Plotting graph')
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("coherence score")
# One-element tuple (note the comma): a bare string would be iterated
# character by character and label the line "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

# Choose the number of topics that has the best coherence score

In [None]:
import gensim
from gensim.test.utils import datapath

# 'lda.model' is written by a later cell, so on a fresh run it may not exist
# yet. Load it only when it is already on disk.
if os.path.exists("lda.model"):
    optimal_model = gensim.models.ldamodel.LdaModel.load("lda.model")
else:
    optimal_model = None
    print("lda.model not found; run the cell that saves the model first.")

In [None]:
# Position in model_list, whose models were trained with
# num_topics = start, start + step, ...
# NOTE(review): index 2 is hard-coded, presumably chosen from the coherence
# plot -- confirm it still matches the best score after a re-run.
model = model_list[2]
#save model
model.save('lda.model')
# num_topics=-1 returns every topic; the default returns at most 10, which
# would truncate later per-topic loops for models with more topics.
model_topics = model.show_topics(num_topics=-1, formatted=True)
print (len(model_topics))
model_topics

# Compute Model Perplexity and Coherence Score
The higher the coherence score, the better the model.

In [None]:
# Compute Perplexity
# log_perplexity returns the per-word likelihood bound (higher is better),
# not the perplexity itself; perplexity = 2 ** (-bound), lower is better.
per_word_bound = model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('\nPerplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Visualize the topics keywords
A good topic model should have non-overlapping clusters.

In [None]:
import pyLDAvis.gensim
# Prepare the visualisation data from the chosen model, corpus and dictionary.
lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary)
# Render the prepared visualisation in the notebook output.
pyLDAvis.display(lda_display)

# Look into LDA result 

In [None]:
# Run inference once per document and keep its most probable
# (topic_id, probability) pair. Each model[bow] call runs inference again,
# so repeating it per column is slow and may pair a topic id with a
# probability taken from a different run.
best_topics = [max(model[bow], key=lambda pair: pair[1]) for bow in corpus]

shopDf['group'] = [topic_id for topic_id, _ in best_topics]
shopDf['prob'] = [prob for _, prob in best_topics]
# 99 marks shops whose best topic has probability <= 0.6.
shopDf['tags_other'] = [topic_id if prob > 0.6 else 99 for topic_id, prob in best_topics]

In [None]:
def result(tag_group, column, shopDf):
    """Select the shops whose ``column`` value equals ``tag_group``.

    Prints the per-column non-null counts of the selection and returns it.

    Parameters
    ----------
    tag_group : value compared against ``shopDf[column]``
    column : name of the column to filter on
    shopDf : pandas.DataFrame with ``merchant_id``, ``title_th``, ``tags_th``

    Returns
    -------
    pandas.DataFrame
        Matching rows, restricted to the three columns above.
    """
    is_match = shopDf[column] == tag_group
    selected = shopDf.loc[is_match, ['merchant_id', 'title_th', 'tags_th']]
    print('Number of result is...\n{}'.format(selected.count()))

    return selected

In [None]:
# Shops whose 'tags_other' value is 0; preview the first rows.
res = result(0, 'tags_other', shopDf)
res.head()

# Select the keywords of each topic: either the words above a probability threshold or the top n words

In [None]:
def title_exact_match(data):
    """Return the first category with a keyword that occurs in ``data`` as a whole word.

    Categories and their keywords come from the global ``SHOP`` mapping
    (category -> list of keywords). Matching is case-insensitive.

    Parameters
    ----------
    data : str
        Text to search.

    Returns
    -------
    The matching key of ``SHOP``, or 99 when nothing matches.
    """
    text = data.lower()
    for cate, values in SHOP.items():
        for val in values:
            # re.escape makes the keyword match literally, so characters
            # such as "+" or "(" cannot break or change the pattern.
            if re.search(r'\b{0}\b'.format(re.escape(val.lower())), text):
                return cate
    return 99

def title_some_match(data):
    """Return the first category with a keyword that is a substring of ``data``.

    Looks through the global ``SHOP`` mapping (category -> list of keywords)
    in order, comparing case-insensitively; 99 is returned when no keyword
    is found.
    """
    return next(
        (
            cate
            for cate, values in SHOP.items()
            if any(val.lower() in data.lower() for val in values)
        ),
        99,
    )

def percent_trash():
    """Return the share of shops that no keyword in ``SHOP`` can categorise.

    Works on the globals ``shopDf`` and ``total_doc``. ``shopDf['group']`` is
    overwritten: first with the whole-word match of ``tags_th``, then, for
    rows still in bin 99, with the substring match.

    Returns
    -------
    Number of non-null ``merchant_id`` values left in bin 99, divided by
    ``total_doc``.
    """
    shopDf['group'] = shopDf['tags_th'].map(title_exact_match)
    # Run the substring fallback only on rows the exact match left in bin 99.
    unmatched = shopDf['group'] == 99
    shopDf.loc[unmatched, 'group'] = shopDf.loc[unmatched, 'tags_th'].map(title_some_match)
    extra_bin = shopDf.loc[shopDf['group'] == 99, 'merchant_id'].count()
    prop_trash = extra_bin / total_doc

    return prop_trash

def plot_word_trash(word_range, trash):
    """Draw ``trash`` against ``word_range`` as a blue line on the current axes."""
    axes = plt.gca()
    axes.plot(word_range, trash, color='b')
    axes.set_xlabel('Top n')
    axes.set_ylabel('Proportion of trash to total number of document')

### top n word

In [None]:
# For n = 1..9, use the top-n words of every topic as that topic's keywords
# and record the share of shops that match none of them.
total_doc = shopDf['merchant_id'].count()
range_word = range(1, 10)
trash = []
for topword in range_word:
    keyword = [
        [term for term, _ in model.show_topic(t, topword)]
        for t in range(len(model_topics))
    ]
    # Categories are numbered from 1 in topic order.
    SHOP = dict(enumerate(keyword, start=1))
    print('\nTop {} word...'.format(topword))
    print(SHOP)

    trash.append(percent_trash())

plot_word_trash(range_word, trash)

# top n threshold

In [None]:
trash = []
threshold = [0.04, 0.03, 0.02]
for i in threshold:
    # Start a fresh keyword list for each threshold; otherwise the lists
    # from earlier thresholds stay in SHOP as extra categories.
    keyword = []
    for t in range(len(model_topics)):
        wordlist = (model.show_topic(t, topn=20))
        # Keep the words of this topic whose probability exceeds the threshold.
        keyword.append([x for x, y in wordlist if y > i])
    SHOP = dict(enumerate(keyword, start=1))
    print ('\nTop {} threshold...'.format(i))
    print (SHOP)

    prop_trash = percent_trash()
    trash.append(prop_trash)
plot_word_trash(threshold, trash)