# Products

In [1]:
# import libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns

import plotly.express as px
import re


# Source Code
products_raw = pd.read_csv("ProductRaw.csv",error_bad_lines=False)
reviews_raw = pd.read_csv('ReviewRaw.csv',error_bad_lines=False)
#--------------

# PREPROCESSING

# Drop duplicate if data have
products = products_raw.drop_duplicates()

# Droping duplicate values
reviews = reviews_raw.drop_duplicates()

# Remove feature 'full_name' and 'created_time'
reviews.drop(['full_name','created_time'],axis=1,inplace=True)

# Combine products and reviews together
reviews = reviews[reviews.product_id.isin(products.item_id)]

# Reset index reviews
reviews.reset_index(drop=True,inplace=True)

# Save to file
reviews.to_csv('Review_new.csv')
products.to_csv('Product_new.csv')

# Load new csv file after preprocessing

new_products = pd.read_csv('Product_new.csv',lineterminator='\n')
new_reviews = pd.read_csv('Review_new.csv',lineterminator='\n')

# Exploding the data

# pd.options.display.float_format = '{:,.2f}'.format

# Brand
# brands = new_products.groupby('brand')['item_id'].count().sort_values(ascending=False)

# group rating in product's dataset
# new_products.groupby(['rating'])['item_id'].count().head(5)

# Product rating by customer's review
# avg_rating_customer = new_reviews.groupby(by='product_id').mean()['rating'].to_frame().reset_index()
# avg_rating_customer.rename({'rating':'avg_rating'},axis=1,inplace=True)

# Top 20 customer make review
# top_rating_customers = new_reviews.groupby('customer_id').count()['product_id'].sort_values(ascending=False)[:20]





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2]:

# Contentbased solution

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from underthesea import word_tokenize, pos_tag, sent_tokenize
import warnings
warnings.filterwarnings('ignore')

new_products = new_products[new_products['name'].notnull()]
new_products['name_description'] = new_products['name'] + new_products['description']
products = new_products[new_products['name_description'].notnull()]
products['name_description_pre'] = products['name_description']\
    .apply(lambda x: word_tokenize(x, format='text'))
products = products.reset_index()

# Stop words
STOP_WORD_FILE = 'vietnamese-stopwords.txt'

with open(STOP_WORD_FILE,'r',encoding='utf-8') as file:
    stop_words = file.read()

stop_words = stop_words.split('\n')

# TF-IDF
tf = TfidfVectorizer(analyzer='word',min_df=0,stop_words=stop_words)

tfidf_matrix = tf.fit_transform(products.name_description_pre)

In [3]:

# Solution 2: Gensim

from gensim import corpora, models, similarities
import jieba
import re

# Preprocess

products['name_description_pre'].replace(["[,-/–]","[().\d]","^\['|'\]$"],"",regex=True,inplace=True)

# Tokenize(split) the sentences into words
intro_products = [[text for text in x.split()] for x in products.name_description_pre]

# Remove ' ' elements in text
intro_products_re = [[t.lower() for t in text if not t in [" ","±","","?",":","…","•","[","]","...","≥","%","”","“","&","*","\ |\?|\.|\!|\/|\;|\:"]] for text in intro_products]

# Obtain the number of features based on dictionary: Use corpora.Dictionary
dictionary = corpora.Dictionary(intro_products_re)

# List of features in dictionary
dic_token_2id = dictionary.token2id

# Number of features (word) in dictionary
feature_cnt = len(dic_token_2id)

# Obtain corpus based on dictionary (dense matrix)
corpus = [dictionary.doc2bow(text) for text in intro_products_re]
# Use TF-IDF model to process corpus, obtaining index
tfidf = models.TfidfModel(corpus)
# Couting similarity in sparse matrix
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)

2021-12-14 09:49:26.061 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2021-12-14 09:49:27.805 INFO    gensim.corpora.dictionary: built Dictionary(36940 unique tokens: ['airpod', 'apple', 'bao_gồm', 'bluetooth', 'bluetooth_inpods']...) from 4370 documents (total 1529910 corpus positions)
2021-12-14 09:49:27.806 INFO    gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary(36940 unique tokens: ['airpod', 'apple', 'bao_gồm', 'bluetooth', 'bluetooth_inpods']...) from 4370 documents (total 1529910 corpus positions)", 'datetime': '2021-12-14T09:49:27.806288', 'gensim': '4.1.2', 'python': '3.6.8 (tags/v3.6.8:3c6b436a57, Dec 24 2018, 00:16:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2021-12-14 09:49:28.828 INFO    gensim.models.tfidfmodel: collecting document frequencies
2021-12-14 09:49:28.829 INFO    gensim.models.tfidfmodel: PROGRESS: processing document #0
2021-12-14 09:49:29.165 INFO

In [4]:
products.to_csv('final_product.csv')

In [5]:
import pickle
file_name_dictionary = 'Dictionary.sav'
pickle.dump(dictionary, open(file_name_dictionary, 'wb'))

In [6]:
file_name_tfidf = 'TfidfModel.sav'
pickle.dump(tfidf, open(file_name_tfidf, 'wb'))

In [7]:
file_name_index = 'Index.sav'
pickle.dump(index, open(file_name_index, 'wb'))