In [408]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import cv2
import spacy
import string
from sklearn.metrics.pairwise import cosine_similarity

### Load data

In [409]:
link = 'data.xlsx'

In [410]:
data = pd.read_excel(link)
data

Unnamed: 0,name,ratings,price,corpus,Brand,color,Storage,RAM,Capacity,System,Processor,Size,Resolution
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,redmi,black,128.0,6.0,5000.0,android,mediatek,16.94,2400×1080
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,oppo,green,128.0,6.0,4000.0,android,mediatek,16.51,2340×1080
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,redmi,white,64.0,4.0,5000.0,android,other,16.33,2400×1080
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,oneplus,blue,256.0,12.0,4500.0,android,qualcomm,16.33,2400×1080
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,apple,blue,128.0,,,ios,apple,13.72,2340×1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,motorola,grey,128.0,6.0,5000.0,android,mediatek,16.64,2460×1080
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,samsung,other,128.0,8.0,4500.0,android,other,16.51,2400×1080
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,redmi,black,64.0,4.0,5020.0,android,mediatek,16.59,2340×1080
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,oneplus,black,128.0,8.0,4500.0,android,other,16.64,2400×1080


### Pre-processing data

In [411]:
# convert 'none' to NaN
data = data.replace('none', pd.NA)

In [412]:
# category Brand, color, System, Processor
# get dummy Brand
dummy = pd.get_dummies(data['Brand'], prefix='Brand')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Brand', axis=1)

# get dummy color
dummy = pd.get_dummies(data['color'], prefix='color')
data = pd.concat([data, dummy], axis=1)
data = data.drop('color', axis=1)

# get dummy system
dummy = pd.get_dummies(data['System'], prefix='System')
data = pd.concat([data, dummy], axis=1)
data = data.drop('System', axis=1)

# get dummy processor
dummy = pd.get_dummies(data['Processor'], prefix='Processor')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Processor', axis=1)

data = data*1
data

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Resolution,Brand_apple,...,color_yellow,System_android,System_ios,Processor_apple,Processor_intel,Processor_mediatek,Processor_other,Processor_qualcomm,Processor_samsung,Processor_unisoc
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,2400×1080,0,...,0,1,0,0,0,1,0,0,0,0
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,2340×1080,0,...,0,1,0,0,0,1,0,0,0,0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,2400×1080,0,...,0,1,0,0,0,0,0,1,0,0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,,,13.72,2340×1080,1,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,128.0,6.0,5000.0,16.64,2460×1080,0,...,0,1,0,0,0,1,0,0,0,0
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,128.0,8.0,4500.0,16.51,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,64.0,4.0,5020.0,16.59,2340×1080,0,...,0,1,0,0,0,1,0,0,0,0
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,128.0,8.0,4500.0,16.64,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0


In [413]:
# convert nan to 0
data = data.fillna(0)
data

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Resolution,Brand_apple,...,color_yellow,System_android,System_ios,Processor_apple,Processor_intel,Processor_mediatek,Processor_other,Processor_qualcomm,Processor_samsung,Processor_unisoc
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,2400×1080,0,...,0,1,0,0,0,1,0,0,0,0
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,2340×1080,0,...,0,1,0,0,0,1,0,0,0,0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,2400×1080,0,...,0,1,0,0,0,0,0,1,0,0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,0.0,0.0,13.72,2340×1080,1,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,128.0,6.0,5000.0,16.64,2460×1080,0,...,0,1,0,0,0,1,0,0,0,0
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,128.0,8.0,4500.0,16.51,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,64.0,4.0,5020.0,16.59,2340×1080,0,...,0,1,0,0,0,1,0,0,0,0
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,128.0,8.0,4500.0,16.64,2400×1080,0,...,0,1,0,0,0,0,1,0,0,0


In [414]:
data.describe()

Unnamed: 0,ratings,price,Storage,RAM,Capacity,Size,Brand_apple,Brand_greenberri,Brand_infinix,Brand_mi,...,color_yellow,System_android,System_ios,Processor_apple,Processor_intel,Processor_mediatek,Processor_other,Processor_qualcomm,Processor_samsung,Processor_unisoc
count,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,...,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0
mean,4.297763,16776.915819,130.343727,4.827192,4083.848312,16.038861,0.151281,0.000407,0.060187,0.02074,...,0.00488,0.824319,0.150468,0.151688,0.001627,0.293615,0.211468,0.230582,0.0549,0.030907
std,0.215715,8401.93975,118.402556,3.221055,2564.715608,1.940041,0.358395,0.020166,0.237881,0.142542,...,0.069701,0.380626,0.357602,0.358791,0.040307,0.45551,0.408433,0.421291,0.227832,0.173101
min,2.9,73.0,0.002,0.0,0.0,3.81,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.2,10499.0,64.0,3.0,4000.0,16.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.3,15300.0,128.0,4.0,5000.0,16.51,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.4,22990.0,128.0,8.0,5000.0,16.76,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,5.0,89999.0,1024.0,16.0,50000.0,50.8,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [415]:
# chia khoảng giá trị low, middle, high
data['price_1'] = pd.cut(data['price'], bins=[0, 10499, 15300, 22990, 89999], labels=['low', 'low-middle', 'high-middle', 'high'])
data['Storage_1'] = pd.cut(data['Storage'], bins=[0, 64, 128, 1024], labels=['low', 'middle', 'high'])
data['RAM_1'] = pd.cut(data['RAM'], bins=[0, 4, 8, 16], labels=['low', 'middle', 'high'])
data['Capacity_1'] = pd.cut(data['Capacity'], bins=[0, 4000, 5000, 50000], labels=['low', 'middle', 'high'])
data['Size_1'] = pd.cut(data['Size'], bins=[0, 16, 16.7, 50.8], labels=['low', 'middle', 'high'])
data

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Resolution,Brand_apple,...,Processor_mediatek,Processor_other,Processor_qualcomm,Processor_samsung,Processor_unisoc,price_1,Storage_1,RAM_1,Capacity_1,Size_1
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,2400×1080,0,...,1,0,0,0,0,high,middle,middle,middle,high
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,2340×1080,0,...,1,0,0,0,0,high-middle,middle,middle,low,middle
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,2400×1080,0,...,0,1,0,0,0,low-middle,low,low,middle,middle
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,2400×1080,0,...,0,0,1,0,0,high-middle,high,high,middle,middle
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,0.0,0.0,13.72,2340×1080,1,...,0,0,0,0,0,low,middle,,,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,128.0,6.0,5000.0,16.64,2460×1080,0,...,1,0,0,0,0,high-middle,middle,middle,middle,middle
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,128.0,8.0,4500.0,16.51,2400×1080,0,...,0,1,0,0,0,high,middle,middle,middle,middle
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,64.0,4.0,5020.0,16.59,2340×1080,0,...,1,0,0,0,0,low-middle,low,low,high,middle
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,128.0,8.0,4500.0,16.64,2400×1080,0,...,0,1,0,0,0,high,middle,middle,middle,middle


In [416]:
dummy = pd.get_dummies(data['price_1'], prefix='price_1')
data = pd.concat([data, dummy], axis=1)
data = data.drop('price_1', axis=1)

dummy = pd.get_dummies(data['Storage_1'], prefix='Storage_1')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Storage_1', axis=1)

dummy = pd.get_dummies(data['RAM_1'], prefix='RAM_1')
data = pd.concat([data, dummy], axis=1)
data = data.drop('RAM_1', axis=1)

dummy = pd.get_dummies(data['Capacity_1'], prefix='Capacity_1')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Capacity_1', axis=1)

dummy = pd.get_dummies(data['Size_1'], prefix='Size_1')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Size_1', axis=1)

In [417]:
data = data*1
data

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Resolution,Brand_apple,...,Storage_1_high,RAM_1_low,RAM_1_middle,RAM_1_high,Capacity_1_low,Capacity_1_middle,Capacity_1_high,Size_1_low,Size_1_middle,Size_1_high
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,2400×1080,0,...,0,0,1,0,0,1,0,0,0,1
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,2340×1080,0,...,0,0,1,0,1,0,0,0,1,0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,2400×1080,0,...,0,1,0,0,0,1,0,0,1,0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,2400×1080,0,...,1,0,0,1,0,1,0,0,1,0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,0.0,0.0,13.72,2340×1080,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,128.0,6.0,5000.0,16.64,2460×1080,0,...,0,0,1,0,0,1,0,0,1,0
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,128.0,8.0,4500.0,16.51,2400×1080,0,...,0,0,1,0,0,1,0,0,1,0
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,64.0,4.0,5020.0,16.59,2340×1080,0,...,0,1,0,0,0,0,1,0,1,0
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,128.0,8.0,4500.0,16.64,2400×1080,0,...,0,0,1,0,0,1,0,0,1,0


In [418]:
# tách dữ liệu cột Resolution thành 2 cột
data[['Resolution_1', 'Resolution_2']] = data['Resolution'].str.split('×', expand=True)
data['Resolution_1'] = data['Resolution_1'].astype(float)
data['Resolution_2'] = data['Resolution_2'].astype(float)
data['Resolution_3'] = data['Resolution_1'] * data['Resolution_2']
data.drop('Resolution', axis=1, inplace=True)
data

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Brand_apple,Brand_greenberri,...,RAM_1_high,Capacity_1_low,Capacity_1_middle,Capacity_1_high,Size_1_low,Size_1_middle,Size_1_high,Resolution_1,Resolution_2,Resolution_3
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,0,0,...,0,0,1,0,0,0,1,2400.0,1080.0,2592000.0
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,0,0,...,0,1,0,0,0,1,0,2340.0,1080.0,2527200.0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,0,0,...,0,0,1,0,0,1,0,2400.0,1080.0,2592000.0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,0,0,...,1,0,1,0,0,1,0,2400.0,1080.0,2592000.0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,0.0,0.0,13.72,1,0,...,0,0,0,0,1,0,0,2340.0,1080.0,2527200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,"MOTOROLA g72 (Meteorite Grey, 128 GB)",4.1,15999,Storage128 GBRAM6 GBExpandable Storage1TB Sys...,128.0,6.0,5000.0,16.64,0,0,...,0,0,1,0,0,1,0,2460.0,1080.0,2656800.0
2455,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,Storage128 GBRAM8 SystemAndroid 10Processor S...,128.0,8.0,4500.0,16.51,0,0,...,0,0,1,0,0,1,0,2400.0,1080.0,2592000.0
2456,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,64.0,4.0,5020.0,16.59,0,0,...,0,0,0,1,0,1,0,2340.0,1080.0,2527200.0
2457,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,Storage128 GBRAM8 SystemAndroid 11Processor S...,128.0,8.0,4500.0,16.64,0,0,...,0,0,1,0,0,1,0,2400.0,1080.0,2592000.0


In [419]:
data.describe()

Unnamed: 0,ratings,price,Storage,RAM,Capacity,Size,Brand_apple,Brand_greenberri,Brand_infinix,Brand_mi,...,RAM_1_high,Capacity_1_low,Capacity_1_middle,Capacity_1_high,Size_1_low,Size_1_middle,Size_1_high,Resolution_1,Resolution_2,Resolution_3
count,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,...,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2459.0,2295.0,2295.0,2295.0
mean,4.297763,16776.915819,130.343727,4.827192,4083.848312,16.038861,0.151281,0.000407,0.060187,0.02074,...,0.052054,0.169581,0.604311,0.093941,0.263115,0.390809,0.346076,1936.468845,1128.808279,2132955.0
std,0.215715,8401.93975,118.402556,3.221055,2564.715608,1.940041,0.358395,0.020166,0.237881,0.142542,...,0.22218,0.375341,0.489098,0.291805,0.440414,0.488031,0.475814,633.955106,433.011609,859873.5
min,2.9,73.0,0.002,0.0,0.0,3.81,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,128.0,16384.0
25%,4.2,10499.0,64.0,3.0,4000.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1560.0,750.0,1152000.0
50%,4.3,15300.0,128.0,4.0,5000.0,16.51,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2340.0,1080.0,2527200.0
75%,4.4,22990.0,128.0,8.0,5000.0,16.76,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2400.0,1125.0,2592000.0
max,5.0,89999.0,1024.0,16.0,50000.0,50.8,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3216.0,3088.0,4631040.0


In [420]:
data['Resolution_3'] = pd.cut(data['Resolution_3'], bins=[0, 1.152000e+06, 2.592000e+06, 4.631040e+06], labels=['low', 'middle', 'high'])
dummy = pd.get_dummies(data['Resolution_3'], prefix='Resolution_3')
data = pd.concat([data, dummy], axis=1)
data = data.drop('Resolution_3', axis=1)
data = data*1

In [421]:
data.head()

Unnamed: 0,name,ratings,price,corpus,Storage,RAM,Capacity,Size,Brand_apple,Brand_greenberri,...,Capacity_1_middle,Capacity_1_high,Size_1_low,Size_1_middle,Size_1_high,Resolution_1,Resolution_2,Resolution_3_low,Resolution_3_middle,Resolution_3_high
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,Storage128 GBRAM6 SystemAndroid 12Processor T...,128.0,6.0,5000.0,16.94,0,0,...,1,0,0,0,1,2400.0,1080.0,0,1,0
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,Storage128 GBRAM6 GBExpandable Storage256GB S...,128.0,6.0,4000.0,16.51,0,0,...,0,0,0,1,0,2340.0,1080.0,0,1,0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,64.0,4.0,5000.0,16.33,0,0,...,1,0,0,1,0,2400.0,1080.0,0,1,0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,Storage256 GBRAM12 SystemAndroid Q 11Processo...,256.0,12.0,4500.0,16.33,0,0,...,1,0,0,1,0,2400.0,1080.0,0,1,0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,Storage128 SystemiOS 15Processor TypeA15 Bion...,128.0,0.0,0.0,13.72,1,0,...,0,0,1,0,0,2340.0,1080.0,0,1,0


In [422]:
data = pd.read_csv('mobile_recommendation_system_dataset.csv')
data['price'] = data['price'].str.replace('₹', '')
data['price'] = data['price'].str.replace(',', '')
data.dropna(inplace=True)
data['ratings'].dropna(inplace=True)
data['price'] = pd.to_numeric(data['price'], errors='coerce') 
data['price_1'] = pd.cut(data['price'], bins=[0, 10499, 15300, 22990, 89999], labels=['low price', 'low middle price', 'high middle price', 'high price'])
data['ratings'] = pd.to_numeric(data['ratings'], errors='coerce')  
data['ratings_1'] = pd.cut(data['ratings'], bins=[0, 3.5, 4.5, 5], labels=['low rating', 'middle rating', 'high rating'])
data['corpus'] = data['corpus'].astype(str) + ' ' + data['price_1'].astype(str) + ' ' + data['ratings_1'].astype(str)
data.drop(['price_1', 'ratings_1'], axis=1, inplace=True)
data

Unnamed: 0,name,ratings,price,imgURL,corpus
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,23999,https://rukminim2.flixcart.com/image/312/312/x...,Storage128 GBRAM6 SystemAndroid 12Processor T...
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,20999,https://rukminim2.flixcart.com/image/312/312/k...,Storage128 GBRAM6 GBExpandable Storage256GB S...
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,13149,https://rukminim2.flixcart.com/image/312/312/x...,Storage64 GBRAM4 SystemAndroid 11Processor Sp...
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,21999,https://rukminim2.flixcart.com/image/312/312/x...,Storage256 GBRAM12 SystemAndroid Q 11Processo...
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,3537,https://rukminim2.flixcart.com/image/312/312/k...,Storage128 SystemiOS 15Processor TypeA15 Bion...
...,...,...,...,...,...
2540,"SAMSUNG Galaxy S20 FE 5G (Cloud Navy, 128 GB)",4.2,27440,https://rukminim2.flixcart.com/image/312/312/x...,Storage128 GBRAM8 SystemAndroid 10Processor S...
2541,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,11999,https://rukminim2.flixcart.com/image/312/312/k...,Storage64 GBRAM4 GBExpandable Storage512GB Sy...
2542,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,30203,https://rukminim2.flixcart.com/image/312/312/x...,Storage128 GBRAM8 SystemAndroid 11Processor S...
2544,"SAMSUNG Galaxy S22 Ultra 5G (Phantom Black, 25...",4.3,20463,https://rukminim2.flixcart.com/image/312/312/x...,Storage256 GBRAM12 SystemAndroid 12Processor ...


In [423]:
data['corpus'][0]

'Storage128 GBRAM6  SystemAndroid 12Processor TypeMediatek Dimensity 1080Processor Speed2.6 50MP 8MP 2MP 50MP 16MP 5G Capacity5000 Display Size16.94 cm (6.67 inch)Resolution2400 x 1080 PixelsResolution TypeFull HD+ AMOLED DisplayGPUARM Mali-G68 MC4Display TypeFull HD+ AMOLED DisplayOther Display FeaturesRefresh Rate: 120 Hz, Contrast: 5,000,000:1, 1920Hz PWM Dimming, Brightness Level: 16000:1, Peak Brightness: 900 nits high price middle rating'

In [467]:
def text_proc(text):
    nlp = spacy.load('en_core_web_sm')
    text = ''.join([char for char in text if char not in string.punctuation])
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc]
    garbage = ['(',')' , ' ',',',':','\n']
    tokens = [token for token in tokens if not nlp.vocab[token].is_stop and token not in garbage]

    return ' '.join(tokens)

def preprocess_data(data, is_train=True):
    if is_train:
        data = data.reset_index(drop=True)
        data['price'] = data['price'].str.replace('₹', '')
        data['price'] = data['price'].str.replace(',', '')
        data.dropna(inplace=True)
        data['ratings'].dropna(inplace=True)
        data['price'] = pd.to_numeric(data['price'], errors='coerce') 
        data['price_1'] = pd.cut(data['price'], bins=[0, 10499, 15300, 22990, 89999], labels=['low price', 'low middle price', 'high middle price', 'high price'])
        data['ratings'] = pd.to_numeric(data['ratings'], errors='coerce')  
        data['ratings_1'] = pd.cut(data['ratings'], bins=[0, 3.5, 4.5, 5], labels=['low rating', 'middle rating', 'high rating'])
        data['corpus'] = data['corpus'].astype(str) + ' ' + data['price_1'].astype(str) + ' ' + data['ratings_1'].astype(str)
        data.drop(['price_1', 'ratings_1'], axis=1, inplace=True)
        data['corpus'] = data['corpus'].apply(text_proc)
    else:
        data['corpus'] = data['corpus'].apply(text_proc)
    
    return data

def compute_features(data, vectorizer, kmeans, is_train=True):
    # Sử dụng TF-IDF hoặc word embeddings cho text
    if is_train:
        tfidf_matrix = vectorizer.fit_transform(data['corpus'])
        features = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
        features['cluster'] = kmeans.fit_predict(features).astype(float)
        labels = features['cluster']
        data['cluster'] = labels
        
    else:
        tfidf_matrix = vectorizer.transform(data['corpus'])
        features = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
        features['cluster'] = kmeans.predict(features).astype(float)
        return features
        
    return features, labels, data

def build_model():
    # Sử dụng một mô hình machine learning để học từ đặc trưng
    model = RandomForestClassifier()
    return model

def make_recommendations(data, model, vectorizer, kmeans, user_input):
    # chuyển user_input thành dataframe
    user_input = pd.DataFrame([user_input], columns=['corpus'])
    user_input = preprocess_data(user_input, is_train=False)
    features = compute_features(user_input, vectorizer, kmeans, is_train=False)
    recommendations = model.predict(features)
    
    # Lọc ra thông tin của các sản phẩm được recommend
    recommended_products = data.loc[data['cluster'].isin(recommendations)]
    
    # Sắp xếp và lấy top 10 sản phẩm (giả sử dựa trên cột 'rating')
    top_10_products = recommended_products.sort_values(by='ratings', ascending=False).head(10)
    
    return top_10_products

In [425]:
# Chạy toàn bộ quy trình
data = pd.read_csv('mobile_recommendation_system_dataset.csv')
processed_data = preprocess_data(data)

In [462]:
tfidf = TfidfVectorizer()
kmeans = KMeans(n_clusters=10)
features, labels, data = compute_features(processed_data.dropna(), vectorizer=tfidf, kmeans=kmeans)
data.dropna(inplace=True)
model = build_model()

In [463]:
model.fit(features, labels)

In [473]:
# recommend
user_input = 'low rating low price'
recommendations = make_recommendations(data, model, vectorizer=tfidf, kmeans=kmeans, user_input=user_input)
recommendations

Unnamed: 0,name,ratings,price,imgURL,corpus,cluster
1151,"APPLE iPhone 12 Pro Max (Pacific Blue, 256 GB)",5.0,26273,https://rukminim2.flixcart.com/image/312/312/k...,storage256 systemio 14processor typea14 bionic...,9.0
489,"SAMSUNG Galaxy A12 (Blue, 128 GB)",4.8,14999,https://rukminim2.flixcart.com/image/312/312/k...,storage128 gbram4 gbexpandable storage1 tb sys...,9.0
1191,"APPLE iPhone 14 Pro Max (Deep Purple, 512 GB)",4.7,26927,https://rukminim2.flixcart.com/image/312/312/x...,storage512 systemio 16processor typea16 bionic...,9.0
952,"APPLE iPhone 13 Pro (Graphite, 256 GB)",4.7,1172,https://rukminim2.flixcart.com/image/312/312/k...,storage256 systemio 15processor typea15 bionic...,9.0
2014,"APPLE iPhone 11 Pro Max (Space Grey, 512 GB)",4.7,19728,https://rukminim2.flixcart.com/image/312/312/k...,storage512 systemio 13processor typea13 bionic...,9.0
895,"APPLE iPhone 14 Pro (Silver, 512 GB)",4.7,19927,https://rukminim2.flixcart.com/image/312/312/x...,storage512 systemio 16processor typea16 bionic...,9.0
950,"APPLE iPhone 13 Pro (Alpine Green, 1 TB)",4.7,13927,https://rukminim2.flixcart.com/image/312/312/l...,storage1024 systemio 15processor typea15 bioni...,9.0
1883,"APPLE iPhone 14 Pro (Space Black, 256 GB)",4.7,73,https://rukminim2.flixcart.com/image/312/312/x...,storage256 systemio 16processor typea16 bionic...,9.0
1487,"APPLE iPhone 12 (Green, 64 GB)",4.6,11537,https://rukminim2.flixcart.com/image/312/312/k...,storage64 systemio 14processor typea14 bionic ...,9.0
393,"APPLE iPhone 11 Pro Max (Space Grey, 64 GB)",4.6,30163,https://rukminim2.flixcart.com/image/312/312/k...,storage64 systemio 13processor typea13 bionic ...,9.0
