# Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud
import time
import re

input_path = 'inputs\\data\\amazon'
output_path = 'outputs\\data\\amazon'

# Data overview

In [2]:
# Load data 
df = pd.read_csv(f'{input_path}\\amazon.csv')

In [None]:
productTable = df[['product_id', 'product_name', 'about_product', 'discounted_price', 'actual_price']]

In [229]:
len(df)

1465

Those needed to be cleaned:
- Drop duplicates
- Money sign drop 
- Negative prices?
- Clean product name
- clean category
- clean about product
- clean user id and user name
- review content and review title
- drop image link and product link (these used for website)

In [4]:
df['rating'].unique()

array(['4.2', '4.0', '3.9', '4.1', '4.3', '4.4', '4.5', '3.7', '3.3',
       '3.6', '3.4', '3.8', '3.5', '4.6', '3.2', '5.0', '4.7', '3.0',
       '2.8', '4', '3.1', '4.8', '2.3', '|', '2', '3', '2.6', '2.9'],
      dtype=object)

In [5]:
df['rating_count'].unique()

array(['24,269', '43,994', '7,928', ..., '468', '8,031', '6,987'],
      dtype=object)

In [6]:
df_clean = df.copy()

In [7]:
def print_end():
    print('-------------------------------------------------------------')

In [8]:
def drop_duplicates(df):
    print('Drop duplicates processing...')
    start = time.time()
    num_duplicates = len(df[df.duplicated(keep=False)])
    if num_duplicates > 0:
        print('Found {:d} duplicate rows'.format(num_duplicates))
        # Drop duplicates
        df = df.drop_duplicates()
    else:
        print('Found no duplicate.')
    end = time.time()
    print('End drop duplicates. Finished in {0:.3f}s.'.format(end - start))
    print_end()

In [9]:
def number_taker(string):

    # if already number
    if not isinstance(string, str):
        return string
    
    # take numbers
    numbers = re.findall(r'\d+\.\d+|\d+', string)
    numbers = list(map(str, numbers))
    # print(numbers)
    
    if len(numbers)>=1:
        # map numbers
        result = float(''.join(map(str, numbers)))
        return result
    
    else:  
        return None
    
print(number_taker('$334,2.99'))

3342.99


In [10]:
def numeric_processing(df):
    print('Numeric processing...')

    start=time.time()
    df['discounted_price'] = df['discounted_price'].apply(number_taker)
    df['actual_price'] = df['actual_price'].apply(number_taker)
    df['discount_percentage'] = df['discount_percentage'].apply(number_taker)/100
    df['rating'] = df['rating'].apply(number_taker)
    df['rating_count'] = df['rating_count'].apply(number_taker)
    end=time.time()

    print('Processing price successfully. Finished in {:.3f}s'.format(end-start))
    print_end()

In [11]:
# drop duplicates
drop_duplicates(df_clean)

Drop duplicates processing...
Found no duplicate.
End drop duplicates. Finished in 0.023s.
-------------------------------------------------------------


In [12]:
numeric_processing(df_clean)

Numeric processing...
Processing price successfully. Finished in 0.022s
-------------------------------------------------------------


In [13]:
df_clean.loc[df_clean['rating'].isna()]

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
1279,B08L12N5H1,Eureka Forbes car Vac 100 Watts Powerful Sucti...,"Home&Kitchen|Kitchen&HomeAppliances|Vacuum,Cle...",2099.0,2499.0,0.16,,992.0,No Installation is provided for this product|1...,"AGTDSNT2FKVYEPDPXAA673AIS44A,AER2XFSWNN4LAUCJ5...","Divya,Dr Nefario,Deekshith,Preeti,Prasanth R,P...","R2KKTKM4M9RDVJ,R1O692MZOBTE79,R2WRSEWL56SOS4,R...","Decent product,doesn't pick up sand,Ok ok,Must...","Does the job well,doesn't work on sand. though...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Eureka-Forbes-Vacuum-Cle...


In [14]:
# changing rating for nan
df_clean['rating'] = df_clean['rating'].fillna(4.0)

In [15]:
# Show dfset information
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_id           1465 non-null   object 
 1   product_name         1465 non-null   object 
 2   category             1465 non-null   object 
 3   discounted_price     1465 non-null   float64
 4   actual_price         1465 non-null   float64
 5   discount_percentage  1465 non-null   float64
 6   rating               1465 non-null   float64
 7   rating_count         1463 non-null   float64
 8   about_product        1465 non-null   object 
 9   user_id              1465 non-null   object 
 10  user_name            1465 non-null   object 
 11  review_id            1465 non-null   object 
 12  review_title         1465 non-null   object 
 13  review_content       1465 non-null   object 
 14  img_link             1465 non-null   object 
 15  product_link         1465 non-null   o

In [16]:
# Summarize the training set
df_clean.describe()

Unnamed: 0,discounted_price,actual_price,discount_percentage,rating,rating_count
count,1465.0,1465.0,1465.0,1465.0,1463.0
mean,3125.310874,5444.990635,0.476915,4.096519,18295.541353
std,6944.304394,10874.826864,0.216359,0.291585,42753.864952
min,39.0,39.0,0.0,2.0,2.0
25%,325.0,800.0,0.32,4.0,1186.0
50%,799.0,1650.0,0.5,4.1,5179.0
75%,1999.0,4295.0,0.63,4.3,17336.5
max,77990.0,139900.0,0.94,5.0,426973.0


In [17]:
def fix_strings(string):
    result = re.split(r'(?=[A-Z&])', string)
    result = ' '.join(result)
    return result

In [18]:
def split_category(df):
    print('Splitting category...')

    start=time.time()
    catsplit = df['category'].str.split('|', expand=True)
    catsplit = catsplit.rename(columns={0:'category_1', 1:'category_2'})

    # adding only category_1 and category_2
    if 'category_1' not in df.columns:
        df['category_1'] = catsplit['category_1']
        df['category_1'] = df['category_1'].apply(fix_strings)
    if 'category_2' not in df.columns:
        df['category_2'] = catsplit['category_2']
        df['category_2'] = df['category_2'].apply(fix_strings)
    if 'category' in df.columns:
        df.drop(columns='category', inplace=True)

    end=time.time()
    print('End split category. Finished in {0:.3f}s.'.format(end - start))
    print_end()

In [19]:
def price_difference(df):
    print('Calculating price differnce...')

    start = time.time()
    if 'price_difference' not in df.columns:
        df['price_difference'] = df['actual_price'] - df['discounted_price']

    end=time.time()
    print('End calculating price difference. Finished in {0:.3f}s.'.format(end - start))
    print_end()

In [20]:
split_category(df_clean)

Splitting category...
End split category. Finished in 0.025s.
-------------------------------------------------------------


In [21]:
def remove_id_space(df):
    df['product_id'].str.strip()
    df['user_id'].str.strip()

In [22]:
# remove space
df_clean['product_id'].str.strip()

0       B07JW9H4J1
1       B098NS6PVG
2       B096MSW6CT
3       B08HDJ86NZ
4       B08CF3B7N1
           ...    
1460    B08L7J3T31
1461    B01M6453MB
1462    B009P2LIL4
1463    B00J5DYCCA
1464    B01486F4G6
Name: product_id, Length: 1465, dtype: object

In [23]:
def rating_score(df):
    print('Rating score processing...')

    start=time.time()
    def score_cat(score):
        if score < 2.0 : return 'Poor'
        elif score < 3.0 : return 'Below Average'
        elif score < 4.0 : return 'Average'
        elif score < 5.0 : return 'Above Average'
        elif score == 5.0 : return 'Excellent'

    if 'rating_score' not in df.columns:
        df['rating_score'] = df['rating'].apply(score_cat)
        df['rating_score'] = df['rating_score'].astype('category')
        df['rating_score'] = df['rating_score'].cat.reorder_categories(['Below Average', 'Average', 'Above Average', 'Excellent'], ordered=True)

    end=time.time()
    print('End split category. Finished in {0:.3f}s.'.format(end - start))
    print_end()
        

In [24]:
rating_score(df_clean)

Rating score processing...
End split category. Finished in 0.004s.
-------------------------------------------------------------


In [25]:
price_difference(df_clean)

Calculating price differnce...
End calculating price difference. Finished in 0.000s.
-------------------------------------------------------------


In [26]:
def merge_dataframe(data_list, on=None, suffixes=('_x', '_y')):
    if on==None:
        raise AttributeError('Must be specify the merging key!')

    df_merge = data_list[0]
    for df in data_list[1:]:
        df_merge = df_merge.merge(df, on=on, how='left', suffixes=suffixes)
        
    return df_merge

In [27]:
def user_info(df):
    # split user_id
    reviewer_id_split = df['user_id'].str.split(',', expand=False)
    reviewer_id_exp = reviewer_id_split.explode()
    reviewer_id_clean = reviewer_id_exp.reset_index(drop=True)

    # split user_name
    reviewer_name_split = df['user_name'].str.split(',', expand=False)
    review_name_exp = reviewer_name_split.explode()
    reviewer_name_clean = review_name_exp.reset_index(drop=True)

    reviewer_id_clean = pd.DataFrame(reviewer_id_clean)
    reviewer_name_clean = pd.DataFrame(reviewer_name_clean)

    return pd.merge(reviewer_id_clean, reviewer_name_clean, left_index=True, right_index=True)

In [28]:
def seperate_user(df):
	df_new = df.copy()
	df_new['user_id'] = df_new['user_id'].str.split(',')
	df_new['user_name'] = df_new['user_name'].str.split(',')
	return df_new.explode(['user_id','user_name'])
	

In [29]:
def count_commas(string):
    return string.count(',')

In [30]:
df_clean['is_same'] = df_clean['user_id'].apply(count_commas).astype(int) - df_clean['user_name'].apply(count_commas).astype(int)
df_clean = df_clean.loc[df_clean.is_same!=-1]
df_clean.drop(columns=['is_same'], inplace=True)
df_clean = seperate_user(df_clean)

In [32]:
productTable = df_clean[['product_id', 'product_name', 'about_product', 'category_1', 'category_2', 'discounted_price', 'actual_price', 
                         'discount_percentage', 'rating', 'rating_count', 'img_link', 'product_link']]

In [51]:
productTable.drop_duplicates(subset=['product_name'], keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  productTable.drop_duplicates(subset=['product_name'], keep='first', inplace=True)


In [53]:
productTable.to_csv('outputs\\data\\amazon\\amazon-product.csv', index=False)

In [259]:
user_count = df_clean.user_id.value_counts().reset_index()
user_count.loc[user_count['count']!=1]

Unnamed: 0,user_id,count
0,AE55KTFVNXYFD5FPYWP2OUPEYNPQ,11
1,AG5DWPD54QGSLWJ6QUFERLPNAX4Q,11
2,AEBWA5I4QFCA3P3OBEPMELBGN4GQ,10
3,AHMGAC6QM62UXNEOCZIHLHSXPP2Q,10
4,AFHROSCGIXUPV3FYQ7H5QOD46Q7Q,10
...,...,...
1290,AGYTCTSUZJJZTK2XVADTQI5MYUFQ,2
1291,AELBYFRFAGLMXQQJKVDUWO7QX2VQ,2
1292,AGKNFVSMZCSEFHPASWFBOIYKRZJA,2
1293,AERBQW23ELEQZRWXWOW5EFQ2AA7Q,2


In [260]:
data_clean = pd.read_csv(f'{output_path}\\Sales_Amazon_Cleaned_final.csv')

In [261]:
data_clean

Unnamed: 0.1,Unnamed: 0,user_id,product_id,user_name,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,category_1,category_2,rating_score,difference_price,neg,neu,pos,compound
0,0,AG3D6O4STAQKAY2UVGEUV46KN35Q,B07JW9H4J1,Manav,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
1,1,AG3D6O4STAQKAY2UVGEUV46KN35Q,B07JW9H4J1,Manav,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24270.0,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
2,2,AG3D6O4STAQKAY2UVGEUV46KN35Q,B07JW9H4J1,Manav,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
3,3,AG3D6O4STAQKAY2UVGEUV46KN35Q,B07JW9H4J1,Adarsh gupta,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
4,4,AG3D6O4STAQKAY2UVGEUV46KN35Q,B07JW9H4J1,Adarsh gupta,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24270.0,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151804,151804,AGEFL3AY7YXEFZA4ZJU3LP7K7OJQ,B01486F4G6,PARDEEP,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
151805,151805,AGEFL3AY7YXEFZA4ZJU3LP7K7OJQ,B01486F4G6,Anindya Pramanik,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
151806,151806,AGEFL3AY7YXEFZA4ZJU3LP7K7OJQ,B01486F4G6,Vikas Singh,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
151807,151807,AGEFL3AY7YXEFZA4ZJU3LP7K7OJQ,B01486F4G6,Harshada Pimple,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915


In [262]:
product_ids1 = df_clean.product_id.unique()

In [263]:
len(product_ids1)

1342

In [264]:
feature_product = data_clean[['product_id', 'neg', 'neu', 'pos', 'compound']]

In [265]:
feature_product.drop_duplicates(inplace=True)
len(feature_product)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_product.drop_duplicates(inplace=True)


1351

In [266]:
product_ids2 = feature_product.product_id.unique()

In [267]:
for id in product_ids2:
    if id not in product_ids1:
        print(id)
        

B08Y1TFSP6
B08Y1SJVV5
B0B9XN9S3W
B0981XSZJ7
B08Y5KXR6Z
B084N18QZY
B07T9FV9YP
B097R2V1W8
B07MKMFKPG


In [269]:
df_clean = merge_dataframe([df_clean, feature_product], on='product_id')

In [270]:
df_clean

Unnamed: 0,product_id,product_name,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,...,img_link,product_link,category_1,category_2,rating_score,price_difference,neg,neu,pos,compound
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AG3D6O4STAQKAY2UVGEUV46KN35Q,Manav,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
1,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AHMY5CWJMMK5BJRBBSNLYT3ONILA,Adarsh gupta,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
2,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AHCTC6ULH4XB6YHDY6PCH2R772LQ,Sundeep,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
3,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AGYHHIERNXKA6P5T7CZLXKVPT7IQ,S.Sayeed Ahmed,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
4,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,399.0,1099.0,0.64,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q,jaspreet singh,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Computers & Accessories,Accessories & Peripherals,Above Average,700.0,0.032,0.744,0.224,0.9033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11402,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,"Brand-Borosil, Specification â€“ 23V ~ 5Hz;1 W...",AHXCDNSXAESERITAFELQABFVNLCA,PARDEEP,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Borosil-Jumbo-1000-Watt-...,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
11403,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,"Brand-Borosil, Specification â€“ 23V ~ 5Hz;1 W...",AGRZD6CHLCUNOLMMIMIHUCG7PIFA,Anindya Pramanik,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Borosil-Jumbo-1000-Watt-...,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
11404,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,"Brand-Borosil, Specification â€“ 23V ~ 5Hz;1 W...",AFQZVGSOSOJHKFQQMCEI4725QEKQ,Vikas Singh,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Borosil-Jumbo-1000-Watt-...,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915
11405,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,2863.0,3690.0,0.22,4.3,6987.0,"Brand-Borosil, Specification â€“ 23V ~ 5Hz;1 W...",AEALVGXXIP46OZVXKRUXSDWZJMEA,Harshada Pimple,...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Borosil-Jumbo-1000-Watt-...,Home & Kitchen,Kitchen & Home Appliances,Above Average,827.0,0.007,0.847,0.146,0.9915


In [272]:
df_clean.columns

Index(['product_id', 'product_name', 'discounted_price', 'actual_price',
       'discount_percentage', 'rating', 'rating_count', 'about_product',
       'user_id', 'user_name', 'review_id', 'review_title', 'review_content',
       'img_link', 'product_link', 'category_1', 'category_2', 'rating_score',
       'price_difference', 'neg', 'neu', 'pos', 'compound'],
      dtype='object')

In [277]:
df_clean.loc[df_clean.product_id=='B07JW9H4J1']

Unnamed: 0,product_id,product_name,about_product,category_1,category_2,discounted_price,actual_price,discount_percentage,price_difference,rating,...,user_name,review_id,review_title,review_content,neg,neu,pos,compound,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Manav,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Adarsh gupta,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
2,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Sundeep,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
3,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,S.Sayeed Ahmed,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
4,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,jaspreet singh,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
5,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Khaja moin,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
6,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Anand,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
7,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,S.ARUMUGAM,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
2834,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Manav,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/I/51UsScvHQN...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
2835,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,700.0,4.2,...,Adarsh gupta,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,0.032,0.744,0.224,0.9033,https://m.media-amazon.com/images/I/51UsScvHQN...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...


In [273]:
column_position = ['product_id','product_name','about_product','category_1','category_2','discounted_price','actual_price','discount_percentage', 
                   'price_difference','rating', 'rating_count', 'rating_score', 'neg','neu','pos','compound',
                   'user_id','user_name', 'img_link','product_link']
df_clean = df_clean[column_position]

In [141]:
# take only user with a number of rating at least rating_threshold
rating_threshold = 10 #only user with 10 ratings above will be fed to recommendation system
train_df = 

8886

In [9]:
product = df[['product_id', 'about_product']]

In [17]:
product = product.drop_duplicates()