In [None]:
import numpy as np
import pandas as pd
import nltk
# import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Fetch the NLTK resources used further down: 'punkt' backs nltk.word_tokenize
# and 'stopwords' provides the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Load the tab-separated review dump into a DataFrame.
# Malformed rows are skipped with a warning instead of aborting the load.
# `on_bad_lines` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in 2.0; 'warn' keeps the old skip-and-report behaviour.
df = pd.read_csv('~/NLP/Data/watch_reviews.tsv', sep='\t', on_bad_lines='warn')

In [None]:
# Peek at the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# (rows, columns) of the raw load, before any cleaning.
df.shape

In [None]:
# Discard reviews whose text is missing.
df = df.dropna(subset=['review_body'])

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
df = df.reset_index(drop=True)

In [None]:
# Shape after cleaning, for comparison with the value shown right after loading.
df.shape

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Training corpus: the first 1000 review texts as a plain Python list.
# `.loc` slices by label and includes the end label, so 0..999 is 1000 rows;
# that equals "the first 1000" because the index was reset to 0..n-1 above.
data = df['review_body'].loc[:999].tolist()

In [None]:
# Start from NLTK's English stop-word list, then add a few extra tokens.
# ("n't" was considered as well but is deliberately left out.)
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([
    "'s",
    "'m",
    "br",     # residue of the HTML line-break tag
    "watch",  # presumably too frequent in these reviews to be informative
])

print(f"We use {len(stopwords)} stop-words from nltk library.")
print(stopwords[:10])

In [None]:
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 

# English Snowball stemmer, used by tokenization_and_stemming below.
stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    """Tokenize a document, drop unwanted tokens and stem what is left.

    Tokens come from nltk.word_tokenize and are lower-cased. A token is kept
    only if it is not in the module-level `stopwords` list and consists
    entirely of alphabetic characters (so numbers, punctuation and mixed
    tokens such as "n't" are dropped). Survivors are stemmed with the
    module-level `stemmer`.

    Args:
        text: the document to process, as a string.

    Returns:
        list of stemmed tokens, in their original order.
    """
    lowered = (raw.lower() for raw in nltk.word_tokenize(text))
    kept = [tok for tok in lowered if tok not in stopwords and tok.isalpha()]
    return list(map(stemmer.stem, kept))

In [None]:
# Smoke-test the tokenizer on the first review.
tokenization_and_stemming(data[0])

In [None]:
# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF vectorizer. Parameters:
#   max_df        : ignore terms found in more than this share of documents
#   min_df        : ignore terms found in fewer than this share of documents
#   max_features  : keep at most this many terms
#   use_idf       : when False, only term frequency is computed
#   stop_words    : scikit-learn's built-in English stop-word list
#   tokenizer     : callable that turns a document into a list of tokens
#   token_pattern : None, because the custom tokenizer replaces the default
#                   regex; leaving the default in place makes scikit-learn
#                   warn that the pattern is unused
#   ngram_range   : (min_n, max_n), e.g. (1, 3) keeps 1-, 2- and 3-grams
# NOTE(review): the tokenizer returns stemmed tokens while the built-in stop
# list is unstemmed, so scikit-learn may warn that the two are inconsistent.
tfidf_model = TfidfVectorizer(max_df=0.99, max_features=1000,
                              min_df=0.01, stop_words='english',
                              use_idf=True, tokenizer=tokenization_and_stemming,
                              token_pattern=None, ngram_range=(1,1))

# Learn the vocabulary and IDF weights from the reviews and vectorize them.
tfidf_matrix = tfidf_model.fit_transform(data)

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
      " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")

In [None]:
# Bare repr of the TF-IDF matrix (no conversion to a dense array).
tfidf_matrix

In [None]:
# Dense view of the TF-IDF matrix via toarray().
tfidf_matrix.toarray() 

In [None]:
# Dense view of the TF-IDF matrix via todense(); its type is compared with toarray() below.
tfidf_matrix.todense()

In [None]:
# Type of the object returned by toarray().
print(type(tfidf_matrix.toarray()))

In [None]:
# Type of the object returned by todense(), for comparison with toarray().
print(type(tfidf_matrix.todense()))

In [None]:
# K-means
# Drawback: the clusters come out unbalanced -- some are large, others are small.

In [None]:
# k-means clustering
from sklearn.cluster import KMeans

# Number of clusters to fit.
num_clusters = 5

# K-means starts from random centroids, so the seed is pinned to make the
# cluster assignments reproducible across runs. n_init is spelled out because
# its default differs between scikit-learn releases.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
km.fit(tfidf_matrix)

# Cluster label of every review, in input order.
clusters = km.labels_.tolist()

In [None]:
# Pair each of the first 1000 reviews with the cluster label assigned to it.
product = {
    'review': df['review_body'].iloc[:1000],
    'cluster': clusters,
}
frame = pd.DataFrame(product, columns=['review', 'cluster'])

In [None]:
# First ten reviews with their cluster labels.
frame.head(10)

In [None]:
# Cluster sizes: how many reviews carry each cluster label.
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

In [None]:
# Centroids of the fitted k-means clusters.
km.cluster_centers_

In [None]:
# Shape of the centroid array.
km.cluster_centers_.shape

In [None]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
# Five topics. The fit is randomized, so the seed is pinned to keep the
# topics reproducible across runs.
lda = LatentDirichletAllocation(n_components=5, random_state=42)

In [None]:
# Fit LDA on the TF-IDF matrix and keep the document-topic matrix it returns.
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape, lda_output, sep='\n')

In [None]:
# Topic-word matrix learned by LDA.
topic_word = lda.components_
print(topic_word.shape, topic_word, sep='\n')

In [None]:
# Column labels: one name per LDA topic.
topic_names = [f"Topic{idx}" for idx in range(lda.n_components)]
print(topic_names)

In [None]:
# Row labels: one name per document in the corpus.
doc_names = [f"Doc{idx}" for idx in range(len(data))]

In [None]:
# Document-topic table: one row per document, one column per topic, with the
# weights rounded to two decimals for display.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# Dominant topic of each document. It is taken from the unrounded weights:
# after rounding, near-equal weights can tie, and argmax would then return the
# lower-numbered topic even when a later one is actually larger.
topic = np.argmax(lda_output, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

In [None]:
# Number of documents assigned to each dominant topic.
df_document_topic['topic'].value_counts().to_frame()

In [None]:
# Topic-word weight table: rows are topics, columns are the vectorizer's terms.
df_topic_words = pd.DataFrame(
    lda.components_,
    index=topic_names,
    columns=tfidf_model.get_feature_names_out(),
)

df_topic_words.head()

In [None]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Despite its name, this function prints nothing; it only returns the terms.

    Args:
        tfidf_model: fitted vectorizer; only get_feature_names_out() is called.
        lda_model: fitted topic model; only components_ is read, one row of
            term weights per topic.
        n_words: number of top terms to keep per topic.

    Returns:
        list with one array of terms per topic, ordered from the highest
        weight to the lowest.
    """
    vocabulary = np.array(tfidf_model.get_feature_names_out())
    # argsort is ascending, so reverse it and keep the first n_words indices.
    return [
        vocabulary.take(weights.argsort()[::-1][:n_words])
        for weights in lda_model.components_
    ]

# Top 15 keywords of every topic, laid out as a topic x rank table.
# Note: this rebinds df_topic_words, which earlier held the topic-word weights.
topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda, n_words=15)

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = [f'Word {rank}' for rank in range(len(df_topic_words.columns))]
df_topic_words.index = [f'Topic {num}' for num in range(len(df_topic_words.index))]
df_topic_words