## Get the original data

In [6]:
import pandas as pd

# We obtained these three types through analysis at the beginning, and we will help validate the model later
key_name = ['movi', 'game', 'phone']
# read the raw data
data = pd.read_csv('data/reviews.csv')
# preprocess
data['context'] = data.apply(lambda x : str(x['review_title']) + str(x['review_body']),axis=1)
for import_name in key_name:
    data[f'is_{import_name}'] = data.apply(lambda x : import_name in str(x['review_title']) and import_name in str(x['review_body']),axis=1)
data_text = data[['context']]
data_text['index'] = data_text.index
documents = data_text

print(len(documents))
print(documents[:5])

50000
                                             context  index
0  "The Hunger Games" is a Well-Constructed "Chim...      0
1  GREAT!!!!!this game is the best game I have ev...      1
2  Satisfied CustomersIt's comfortable, it's ligh...      2
3  The Greatest!!!!THIS GAME IS REALLY GREAT YOU ...      3
4  Love the movie!Great Adam Sandler movie, a cla...      4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


## Preprocessing the text, including restoring part of speech and removing stop words

In [7]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

# Preprocessing the text, including restoring part of speech and removing stop words
processed_docs = documents['context'].map(preprocess)
processed_docs[:10]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tengyue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [hunger, game, construct, chimera, movi, larg,...
1    [great, game, best, game, play, kindl, awesom,...
2    [satisfi, customersit, comfort, light, machin,...
3    [greatest, game, great, buy, play, anakin, obi...
4    [love, movi, great, adam, sandler, movi, class...
5    [strong, film, polit, versus, justiceveri, int...
6    [upset, stomachi, understand, product, upset, ...
7    [star, laff, kid, think, movi, suck, ive, see,...
8    [great, littl, kidsmi, granddaught, love, play...
9    [question, answer, great, continu, samara, loo...
Name: context, dtype: object

In [8]:
# Generating word matrix
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

# Design dictionary threshold
dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=100000)
# Generate a corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# Embedding all words
data['bow_vector'] = data['context'].apply(lambda x : dictionary.doc2bow(preprocess(x)))

0 abernathi
1 acquaint
2 add
3 administr
4 adventur
5 afraid
6 agreeabl
7 alic
8 aliv
9 alli
10 amanida


## train the model

In [9]:
# Built-in function, input text to get topic predictions
def test_sign(text):
    score_hi = 0
    topic_hi = -1
    for index, score in sorted(lda_model[text], key=lambda tup: -1*tup[1]):
        if score > score_hi:
            topic_hi = index
            score_hi = score
    return topic_hi

In [10]:
# The optimal number of classification was obtained by model evaluation
score_h = 0
best_type = 2
for i in range(3,20):
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=i, id2word=dictionary, passes=2, workers=2)
    data['topic'] = data['bow_vector'].apply(lambda x : test_sign(x))
    game_topic = data[data['is_game'] == True]['topic'].value_counts()
    movie_topic = data[data['is_movi'] == True]['topic'].value_counts()
    phone_topic = data[data['is_phone'] == True]['topic'].value_counts()
    main_game_topic = game_topic.idxmax()
    main_movie_topic = movie_topic.idxmax()
    main_phone_topic = phone_topic.idxmax()
    if main_game_topic != main_movie_topic and main_movie_topic != main_phone_topic:
        score_tp = (game_topic.max() / game_topic.sum()) + (main_movie_topic.max() / main_movie_topic.sum()) + (main_phone_topic.max() / main_phone_topic.sum())
        if score_tp > score_h:
            score_h = score_tp
            best_type = i

# show the best performance and the number of the types
print(score_h)
print(best_type)

In [11]:
# train the model with the best types number
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=best_type, id2word=dictionary, passes=2, workers=2)

## predict and show the topic

In [12]:
# predict the topic
data['topic'] = data['bow_vector'].apply(lambda x : test_sign(x))
# show each topic's key words
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.051*"game" + 0.028*"movi" + 0.021*"like" + 0.020*"play" + 0.019*"great" + 0.017*"love" + 0.017*"fun" + 0.016*"good" + 0.013*"time" + 0.009*"app"
Topic: 1 
Words: 0.009*"film" + 0.006*"like" + 0.005*"match" + 0.005*"movi" + 0.005*"song" + 0.005*"album" + 0.005*"time" + 0.005*"good" + 0.005*"best" + 0.004*"man"
Topic: 2 
Words: 0.028*"quot" + 0.010*"game" + 0.008*"like" + 0.006*"new" + 0.006*"time" + 0.005*"play" + 0.004*"charact" + 0.004*"world" + 0.004*"way" + 0.004*"level"
Topic: 3 
Words: 0.015*"work" + 0.014*"use" + 0.011*"great" + 0.009*"like" + 0.009*"good" + 0.008*"phone" + 0.008*"product" + 0.007*"case" + 0.007*"buy" + 0.007*"need"
Topic: 4 
Words: 0.015*"like" + 0.013*"season" + 0.012*"good" + 0.012*"great" + 0.011*"love" + 0.009*"product" + 0.008*"episod" + 0.008*"dog" + 0.007*"dvd" + 0.007*"buy"
Topic: 5 
Words: 0.025*"film" + 0.021*"movi" + 0.008*"stori" + 0.007*"watch" + 0.007*"charact" + 0.007*"like" + 0.006*"time" + 0.006*"great" + 0.006*"love" + 0.005*

## merge the same product_id

In [13]:
# check the original prediction
data['topic'].value_counts()

0    15891
5    12256
3     9093
4     7859
1     3415
2     1486
Name: topic, dtype: int64

In [14]:
from tqdm import tqdm
pro_id = data['product_id'].unique()
bar = tqdm(total=len(pro_id))
for i in pro_id:
    temp_df = data[data['product_id'] == i]
    r_ture = temp_df['topic'].value_counts().idxmax()
    data[data['product_id'] == i]['topic'] = r_ture
    bar.update(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
100%|█████████▉| 29643/29660 [02:32<00:00, 189.99it/s]

In [15]:
# check the prediction after merge
data['topic'].value_counts()

0    15891
5    12256
3     9093
4     7859
1     3415
2     1486
Name: topic, dtype: int64

In [16]:
# sort and save
data = data.sort_values('review_id')
save_df = data[['review_id', 'topic']]
save_df.columns = ['review_id', 'product_category']
save_df.to_csv('task1a.csv', index=None)