## **Task 4: Analyze data**

<hr/>

In [1]:
# Import libraries
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

### *1. Read preprocessing data from mongoDB Local*

In [2]:
client_Tlocal = MongoClient('mongodb://localhost:27017/')
db_Tlocal = client_Tlocal ['Task2_Database']
collection_Tlocal = db_Tlocal['Shopee_Full_Preprocessing_Data']
num_documents_Tlocal = collection_Tlocal.count_documents({})
print("Number of documents in the collection:", num_documents_Tlocal)

Number of documents in the collection: 549791


In [3]:
# Read preprocessing data from Thinh's MongoDB Local
def read_preprocessing_data(collection, num_documents):
    data_list = []
    total_documents = num_documents
    with tqdm(total=total_documents, desc='Getting Documents') as pbar:
        for document in collection.find({"crawl_id": {"$gte": 0, "$lt": total_documents}}):
            del document['_id']
            data_list.append(document)
            pbar.update(1)    

    df = pd.DataFrame(data_list)
    return df

In [4]:
df = read_preprocessing_data(collection_Tlocal, num_documents_Tlocal)

Getting Documents:   0%|          | 0/549791 [00:00<?, ?it/s]

Getting Documents: 100%|██████████| 549791/549791 [01:11<00:00, 7677.78it/s] 


In [5]:
df

Unnamed: 0,itemid,shopid,name,stock,sold,historical_sold,liked_count,cmt_count,item_status,price,...,global_sold_count,flash_sale_stock,crawl_time,crawl_id,color,size,rating_star,rating_count,rcount_with_image,rcount_with_context
0,19393753758,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",159,161,163,41,40,normal,285000.0,...,163,159,2023-11-02 12:00:00,0,"ĐEN, TIÊU","[M, L, XL, XXL]",4.925000,"[40, 0, 0, 0, 3, 37]",16,21
1,4263018116,40342563,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,240,3098,39098,5112,12119,normal,185000.0,...,39098,240,2023-11-02 12:00:00,1,Ghi,"[M, L, XL, XXL]",4.845551,"[12120, 119, 86, 265, 616, 11034]",5409,6928
2,18484638101,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",222,439,462,77,136,normal,185000.0,...,462,222,2023-11-02 12:00:00,2,"ĐEN, TRẮNG","[M, L, XL, XXL]",4.897059,"[136, 2, 1, 0, 3, 130]",58,74
3,23081308025,40342563,"Áo Khoác Nam AVIANO Cao Cấp Chống Nước,Chống G...",216,387,397,107,118,normal,229000.0,...,397,216,2023-11-02 12:00:00,3,ĐEN,"[M, L, XL, XXL]",4.872881,"[118, 1, 2, 1, 3, 111]",45,59
4,18040585153,40342563,Bộ Quần Áo Cho Bé Thiết Kế Kẻ Ô Cao Cấp Thời T...,317,26,227,35,87,normal,99000.0,...,227,0,2023-11-02 12:00:00,4,Nâu,"[2, 3, 4, 5, 6]",4.875000,"[88, 1, 0, 2, 3, 82]",45,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549786,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,70,0,3,0,1,normal,21000.0,...,3,0,2023-12-05 23:20:53,549786,XANH,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,0
549787,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,75,0,14,1,1,normal,25000.0,...,14,0,2023-12-05 23:20:53,549787,XANH,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,0
549788,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,60,0,22,2,1,normal,50000.0,...,22,0,2023-12-05 23:20:53,549788,ĐEN,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,1
549789,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,79,0,11,4,4,normal,70000.0,...,11,0,2023-12-05 23:20:53,549789,"ĐEN, XÁM",[No size],5.000000,"[4, 0, 0, 0, 0, 4]",0,2


### *2. Read raw data from mongoDB Atlas*

In [6]:
# MONGODB_URI = "mongodb+srv://MasterMind:MasterMind@productpage.nj9srv4.mongodb.net/"
# client_SAtlas = MongoClient(MONGODB_URI, server_api=ServerApi('1'))

# # Send a ping to confirm a successful connection
# try:
#     client_SAtlas.admin.command('ping')
#     print("Pinged your deployment. You successfully connected to MongoDB!")
# except Exception as e:
#     print(e)
# db_SAtlas = client_SAtlas.get_database('product_pages')
# records = db_SAtlas.items

#### *2.1. Preprocessing data in Son MongoDB Atlas*

In [7]:
# cursor = records.find({})

In [8]:
# data_list = []
# for document in cursor:
#             del document['_id']
#             data_list.append(document)
            
# raw_df = pd.DataFrame(data_list)


In [9]:
# json_content = pd.DataFrame(raw_df['items'].apply(pd.Series))
# item_infor = pd.DataFrame(json_content['item'].apply(pd.Series))
# description_df = pd.DataFrame(item_infor[['item_id','shop_id', 'title', 'description']])
# description_df.to_csv('sample.csv',index=False,header=True,encoding="utf-8-sig")

### *3. Analyze keyword from title of products*

In [10]:
raw_df = df.copy()
raw_df

Unnamed: 0,itemid,shopid,name,stock,sold,historical_sold,liked_count,cmt_count,item_status,price,...,global_sold_count,flash_sale_stock,crawl_time,crawl_id,color,size,rating_star,rating_count,rcount_with_image,rcount_with_context
0,19393753758,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",159,161,163,41,40,normal,285000.0,...,163,159,2023-11-02 12:00:00,0,"ĐEN, TIÊU","[M, L, XL, XXL]",4.925000,"[40, 0, 0, 0, 3, 37]",16,21
1,4263018116,40342563,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,240,3098,39098,5112,12119,normal,185000.0,...,39098,240,2023-11-02 12:00:00,1,Ghi,"[M, L, XL, XXL]",4.845551,"[12120, 119, 86, 265, 616, 11034]",5409,6928
2,18484638101,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",222,439,462,77,136,normal,185000.0,...,462,222,2023-11-02 12:00:00,2,"ĐEN, TRẮNG","[M, L, XL, XXL]",4.897059,"[136, 2, 1, 0, 3, 130]",58,74
3,23081308025,40342563,"Áo Khoác Nam AVIANO Cao Cấp Chống Nước,Chống G...",216,387,397,107,118,normal,229000.0,...,397,216,2023-11-02 12:00:00,3,ĐEN,"[M, L, XL, XXL]",4.872881,"[118, 1, 2, 1, 3, 111]",45,59
4,18040585153,40342563,Bộ Quần Áo Cho Bé Thiết Kế Kẻ Ô Cao Cấp Thời T...,317,26,227,35,87,normal,99000.0,...,227,0,2023-11-02 12:00:00,4,Nâu,"[2, 3, 4, 5, 6]",4.875000,"[88, 1, 0, 2, 3, 82]",45,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549786,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,70,0,3,0,1,normal,21000.0,...,3,0,2023-12-05 23:20:53,549786,XANH,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,0
549787,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,75,0,14,1,1,normal,25000.0,...,14,0,2023-12-05 23:20:53,549787,XANH,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,0
549788,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,60,0,22,2,1,normal,50000.0,...,22,0,2023-12-05 23:20:53,549788,ĐEN,[No size],5.000000,"[1, 0, 0, 0, 0, 1]",0,1
549789,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,79,0,11,4,4,normal,70000.0,...,11,0,2023-12-05 23:20:53,549789,"ĐEN, XÁM",[No size],5.000000,"[4, 0, 0, 0, 0, 4]",0,2


#### 3.1. Filtering top 50 products from top shop mall and normal shop:

In [11]:
top_shopee_mall_id = [40342563, 225909574, 68613764, 24710134, 127217331, 60297616, 111639450,\
                      317477677, 201774917, 59596762, 168678363, 92937520, 68988783, 263713672,\
                      38038824, 277366270, 70677296, 1620236, 17893078, 31522834, 16649961,\
                      296132807, 257412160]
normal_shop_id = [999122826, 952153869, 47401874, 80968732,744060250,127049276,324062232,205125399,\
                  427693365, 360972139, 168195778, 913496573, 299977840, 183199642, 416018485,\
                  264542552, 78461361, 329002439, 108136164, 75123404]

In [12]:
raw_df = raw_df[['itemid','shopid','name','sold','price','rating_star']]
raw_df 

Unnamed: 0,itemid,shopid,name,sold,price,rating_star
0,19393753758,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",161,285000.0,4.925000
1,4263018116,40342563,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,3098,185000.0,4.845551
2,18484638101,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",439,185000.0,4.897059
3,23081308025,40342563,"Áo Khoác Nam AVIANO Cao Cấp Chống Nước,Chống G...",387,229000.0,4.872881
4,18040585153,40342563,Bộ Quần Áo Cho Bé Thiết Kế Kẻ Ô Cao Cấp Thời T...,26,99000.0,4.875000
...,...,...,...,...,...,...
549786,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,0,21000.0,5.000000
549787,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,0,25000.0,5.000000
549788,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,0,50000.0,5.000000
549789,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,0,70000.0,5.000000


In [13]:
# Remove duplicated data and keep the last value
analyzed_df = raw_df.drop_duplicates(subset='itemid', keep='last')
analyzed_df.reset_index(drop=True, inplace=True)
analyzed_df

Unnamed: 0,itemid,shopid,name,sold,price,rating_star
0,16985661061,317477677,Quần Jean LEVENTS Crayon/ Blue,5,413000.0,4.962264
1,6944158731,92937520,Áo sơ mi dài tay nam Owen Regularfit- AR20728DT,45,598000.0,4.881356
2,15494176620,92937520,OWEN - Quần Boxer nam màu Xanh - QLBR221094,1,120000.0,0.000000
3,14498861181,92937520,OWEN - Áo sơ mi Nam dài tay Slim Fit màu Navy ...,0,1200000.0,0.000000
4,23680547799,296132807,Quần Đùi Nam SAIGONESE Short Thun Phối Line Th...,4,119000.0,5.000000
...,...,...,...,...,...,...
9307,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,0,21000.0,5.000000
9308,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,0,25000.0,5.000000
9309,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,0,50000.0,5.000000
9310,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,0,70000.0,5.000000


In [14]:
# SM: Shop Mall
# NM: Normal Shop
top_50_items_SM_df = pd.DataFrame(columns=['shopid','name'])
top_50_items_NM_df = pd.DataFrame(columns=['shopid','name'])

In [15]:
# Get 50 products from each Shop Mall (SM)
for i in tqdm(range(0,len(top_shopee_mall_id))):
    shopid = top_shopee_mall_id[i]
    all_items = analyzed_df[analyzed_df['shopid'] == str(shopid)][['name','sold']].sort_values(by='sold',ascending=False)
    try:
        top_50_SM_items = list(all_items['name'].iloc[:50])
    except:
        top_50_SM_items = list(all_items['name'])
    shopid_list = [shopid] * len(top_50_SM_items)
    result = pd.DataFrame({'shopid': shopid_list, 'name': top_50_SM_items})
    top_50_items_SM_df = pd.concat([top_50_items_SM_df, result], ignore_index=True)

 48%|████▊     | 11/23 [00:00<00:00, 108.89it/s]

100%|██████████| 23/23 [00:00<00:00, 110.12it/s]


In [16]:
top_50_items_SM_df

Unnamed: 0,shopid,name
0,40342563,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...
1,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na..."
2,40342563,COMBO 5 đôi tất nam Trơn VICERO Kháng khuẩn
3,40342563,Áo Nỉ Nam Thời Trang Trẻ Trung Chất Vải Co Dãn...
4,40342563,Bộ Thể Thao Nam 3 Sọc Mùa Đông Da Cá Chéo Thời...
...,...,...
1133,257412160,"Áo thun nam cổ tròn, T-shirt nam đen in AREMI,..."
1134,257412160,Áo polo nam tay ngắn cổ trụ AREMI thiết kế độc...
1135,257412160,Áo thun nam tay ngắn cổ tròn thiết kế AREMI ch...
1136,257412160,Áo polo nam cổ trụ tay ngắn AREMI thiết kế đơn...


In [17]:
# Get 50 products from each Normal Shop (NM)
for i in tqdm(range(0,len(normal_shop_id))):
    shopid = normal_shop_id[i]
    all_items = analyzed_df[analyzed_df['shopid'] == str(shopid)][['name','sold']].sort_values(by='sold',ascending=False)
    try:
        top_50_NM_items = list(all_items['name'].iloc[:50])
    except:
        top_50_NM_items = list(all_items['name'])
    shopid_list = [shopid] * len(top_50_NM_items)
    result = pd.DataFrame({'shopid': shopid_list, 'name': top_50_NM_items})
    top_50_items_NM_df = pd.concat([top_50_items_NM_df, result], ignore_index=True)

100%|██████████| 20/20 [00:00<00:00, 103.53it/s]


In [18]:
top_50_items_NM_df

Unnamed: 0,shopid,name
0,999122826,Hàng Có Sẵn! Hàng Có Sẵn!Phong cách dân tộc mù...
1,999122826,Hàng Có Sẵn! Hàng Có Sẵn!Shaoye Quần Váy Dài D...
2,999122826,Hàng sẵn sàng!Nam giới kích thước lớn phong cá...
3,999122826,Hàng Có Sẵn!Quần Giả Váy Lưng Thun Ống Rộng Th...
4,999122826,Hàng Có Sẵn!Set Đồ Thể Thao Phối Áo Ba Lỗ Vải ...
...,...,...
905,75123404,Vớ Thể Thao Cổ Trung Dành Cho Nam Thương Hiệu ...
906,75123404,Vớ Trẻ Em Cổ Ngắn 2-3 Tuổi Thương Hiệu ELLE EB...
907,75123404,Set 4 Đôi Vớ Công Sở Cổ Trung Dành Cho Nam Thư...
908,75123404,Vớ Công Sở Cổ Trung Dành Cho Nam Thương Hiệu E...


#### 3.2. Tokenize k-word in each products:

In [19]:
def tokenize_k_word(documents, k, stopwords):
    preprocessed_documents = [doc.lower() for doc in documents]
    vectorizer = CountVectorizer(ngram_range= (k,k), stop_words= stopwords)
    word_counts = vectorizer.fit_transform(preprocessed_documents)
    words = vectorizer.get_feature_names_out()
    word_occurrences = word_counts.toarray().sum(axis=0)
    word_count_dict = dict(zip(words, word_occurrences))
    Word = []
    Count = []
    for word, count in word_count_dict.items():
        Word.append(word)
        Count.append(count)
    return Word, Count

#### TOP SHOP MALL

##### **k = 1**

In [20]:
# documents = list(top_50_items_SM_df['name'])
# stopwords = ['nam', 'áo', 'quần', 'vải']
# # stopwords = None
# Word, Count = tokenize_k_word(documents, 1, stopwords)
# infor_1w_df = pd.DataFrame({'Word': Word, 'Count': Count})
# infor_best_1w_df = infor_1w_df.sort_values(by='Count',ascending=False)[:50]
# infor_best_1w_df.reset_index(drop=True, inplace=True)
# infor_best_1w_df

Ban đầu, khi chạy dòng lệnh trên thì nhóm em thấy có rất nhiều từ mang tính thời trang (Tức là những từ gần như là phải có trong mọi sản phẩm), thông tin này không hề có giá trị nên nhóm em đã đưa nó vào nhóm stopwords, tức là những từ ngữ không ý nghĩa 

##### **k = 2**

In [21]:
# # documents = list(top_50_items_SM_df['name'])
# documents = list(analyzed_df['name'])
# # stopwords = ['nam', 'áo', 'quần', 'vải']
# stopwords = None
# Word, Count = tokenize_k_word(documents, 2, stopwords)
# infor_2w_df = pd.DataFrame({'Word': Word, 'Count': Count})
# infor_best_2w_df = infor_2w_df.sort_values(by='Count',ascending=False)[:50]
# infor_best_2w_df.reset_index(drop=True, inplace=True)
# infor_best_2w_df

In [22]:
# documents = list(top_50_items_SM_df['name'])
# # stopwords = ['nam', 'áo', 'quần', 'vải']
# stopwords = None
# Word, Count = tokenize_k_word(documents, 3, stopwords)
# infor_3w_df = pd.DataFrame({'Word': Word, 'Count': Count})
# infor_best_3w_df = infor_3w_df.sort_values(by='Count',ascending=False)[:50]
# infor_best_3w_df.reset_index(drop=True, inplace=True)
# infor_best_3w_df

# ...

#### NORMAL SHOP

In [23]:
# documents = list(top_50_items_NM_df['name'])
# # stopwords = ['nam', 'áo', 'quần', 'vải']
# stopwords = None
# Word, Count = tokenize_k_word(documents, 1, stopwords)
# infor_1w_df = pd.DataFrame({'Word': Word, 'Count': Count})
# infor_best_1w_df = infor_1w_df.sort_values(by='Count',ascending=False)[:50]
# infor_best_1w_df.reset_index(drop=True, inplace=True)
# infor_best_1w_df

In [24]:
# documents = list(top_50_items_NM_df['name'])
# # stopwords = ['nam', 'áo', 'quần', 'vải']
# stopwords = None
# Word, Count = tokenize_k_word(documents, 2, stopwords)
# infor_2w_df = pd.DataFrame({'Word': Word, 'Count': Count})
# infor_best_2w_df = infor_2w_df.sort_values(by='Count',ascending=False)[:50]
# infor_best_2w_df.reset_index(drop=True, inplace=True)
# infor_best_2w_df

### 3.3 Form dataframe top popular keyword

In [25]:
analyzed_df

Unnamed: 0,itemid,shopid,name,sold,price,rating_star
0,16985661061,317477677,Quần Jean LEVENTS Crayon/ Blue,5,413000.0,4.962264
1,6944158731,92937520,Áo sơ mi dài tay nam Owen Regularfit- AR20728DT,45,598000.0,4.881356
2,15494176620,92937520,OWEN - Quần Boxer nam màu Xanh - QLBR221094,1,120000.0,0.000000
3,14498861181,92937520,OWEN - Áo sơ mi Nam dài tay Slim Fit màu Navy ...,0,1200000.0,0.000000
4,23680547799,296132807,Quần Đùi Nam SAIGONESE Short Thun Phối Line Th...,4,119000.0,5.000000
...,...,...,...,...,...,...
9307,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,0,21000.0,5.000000
9308,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,0,25000.0,5.000000
9309,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,0,50000.0,5.000000
9310,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,0,70000.0,5.000000


In [26]:
analyzed_df.loc[:, 'approximate_revenue'] = analyzed_df['sold'] * analyzed_df['price']
analyzed_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analyzed_df.loc[:, 'approximate_revenue'] = analyzed_df['sold'] * analyzed_df['price']


Unnamed: 0,itemid,shopid,name,sold,price,rating_star,approximate_revenue
0,16985661061,317477677,Quần Jean LEVENTS Crayon/ Blue,5,413000.0,4.962264,2065000.0
1,6944158731,92937520,Áo sơ mi dài tay nam Owen Regularfit- AR20728DT,45,598000.0,4.881356,26910000.0
2,15494176620,92937520,OWEN - Quần Boxer nam màu Xanh - QLBR221094,1,120000.0,0.000000,120000.0
3,14498861181,92937520,OWEN - Áo sơ mi Nam dài tay Slim Fit màu Navy ...,0,1200000.0,0.000000,0.0
4,23680547799,296132807,Quần Đùi Nam SAIGONESE Short Thun Phối Line Th...,4,119000.0,5.000000,476000.0
...,...,...,...,...,...,...,...
9307,1258757931,75123404,Vớ Cổ Trung Dành Cho Trẻ Em 12-14 Tuổi Thương ...,0,21000.0,5.000000,0.0
9308,1258817062,75123404,Vớ Trẻ Em Cổ Trung 3-5 Tuổi EBK170,0,25000.0,5.000000,0.0
9309,1267439667,75123404,Vớ Không Cổ Dành Cho Nam Thương Hiệu ELLE EH L...,0,50000.0,5.000000,0.0
9310,1267555891,75123404,Vớ Thể Thao Cổ Trung Dệt Xù Dành Cho Nam Thươn...,0,70000.0,5.000000,0.0


In [27]:
test_df = analyzed_df[analyzed_df['shopid'] == '40342563']
search_string = 'cao cấp'
# Hiển thị các hàng có chứa chuỗi cụ thể
t = test_df.sort_values(by='sold',ascending=False).reset_index(drop=True)
t.reset_index(inplace=True)
t

Unnamed: 0,index,itemid,shopid,name,sold,price,rating_star,approximate_revenue
0,0,4263018116,40342563,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,6595,185000.0,4.846812,1.220075e+09
1,1,18484638101,40342563,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",3912,195000.0,4.847720,7.628400e+08
2,2,1679232406,40342563,COMBO 5 đôi tất nam Trơn VICERO Kháng khuẩn,2731,60000.0,4.791030,1.638600e+08
3,3,4649597158,40342563,Áo Nỉ Nam Thời Trang Trẻ Trung Chất Vải Co Dãn...,2704,129000.0,4.882612,3.488160e+08
4,4,19749576955,40342563,Bộ Thể Thao Nam 3 Sọc Mùa Đông Da Cá Chéo Thời...,1833,275000.0,4.874379,5.040750e+08
...,...,...,...,...,...,...,...,...
186,186,16425906633,40342563,Áo Polo AVIANO Tay Ngắn 2 Màu Xanh Trắng Bo Cổ...,0,139000.0,4.910995,0.000000e+00
187,187,13681855901,40342563,Áo Polo Ngắn Tay 5 Màu Thời Trang Cao Cấp Thiế...,0,99000.0,4.979522,0.000000e+00
188,188,18588438649,40342563,Áo Polo Nam AVIANO Chất Liệu Co Giãn Thấm Hút ...,0,139000.0,5.000000,0.000000e+00
189,189,12073629517,40342563,"Bộ Đồ Nam AVIANO, Set Đồ Nam Polo Tay Ngắn Mix...",0,199000.0,4.924906,0.000000e+00


In [28]:
documents = list(analyzed_df['name']) 
stopwords = None
Word, Count = tokenize_k_word(documents, 1, stopwords)
infor_1w_df = pd.DataFrame({'Word': Word, 'Count': Count})
infor_best_1w_df = infor_1w_df.sort_values(by='Count',ascending=False)[:900]
infor_best_1w_df.reset_index(drop=True, inplace=True)
infor_best_1w_df

Unnamed: 0,Word,Count
0,nam,6529
1,áo,5535
2,quần,2824
3,tay,2588
4,hàng,2382
...,...,...
895,nguyên,10
896,2101,10
897,ngoại,10
898,ma,10


In [29]:
top_shopee_mall_id = list(map(str, top_shopee_mall_id))
normal_shop_id = list(map(str, normal_shop_id))
list_total_sold = []
list_rating_average = []
list_arevenue = []
list_n_shop_mall = []
list_sm_ra = []
list_sm_sold = []
list_sm_arevenue = []
list_n_normal_shop = []
list_ns_ra = []
list_ns_sold = []
list_ns_arevenue = []
for keyword in infor_best_1w_df['Word']: 
    # Kiểm tra xem các giá trị trong cột 'Column' có chứa chuỗi cụ thể hay không
    contains_search_string = analyzed_df['name'].str.lower().str.contains(keyword)
    total_sold = analyzed_df[contains_search_string]['sold'].sum()
    list_total_sold.append(total_sold)
    rating_average = analyzed_df[contains_search_string]['rating_star'].mean()
    list_rating_average.append(rating_average)
    arevenue = analyzed_df[contains_search_string]['approximate_revenue'].sum()
    list_arevenue.append(arevenue)
    # SM
    n_shop_mall = analyzed_df[contains_search_string]['shopid'].isin(top_shopee_mall_id).sum()
    list_n_shop_mall.append(n_shop_mall)
    sm_ra = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['rating_star'].mean()
    list_sm_ra.append(sm_ra)
    sm_sold = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['sold'].sum()
    list_sm_sold.append(sm_sold)
    sm_arevenue = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['approximate_revenue'].sum()
    list_sm_arevenue.append(sm_arevenue)
    # NS
    n_normal_shop = analyzed_df[contains_search_string]['shopid'].isin(normal_shop_id).sum()
    list_n_normal_shop.append(n_normal_shop)
    ns_ra = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['rating_star'].mean()
    list_ns_ra.append(ns_ra)
    ns_sold = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['sold'].sum()
    list_ns_sold.append(ns_sold)
    ns_arevenue = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['approximate_revenue'].sum()
    list_ns_arevenue.append(ns_arevenue)

In [30]:
infor_1w_df = infor_best_1w_df.copy()
infor_1w_df['Total Sold'] = list_total_sold
infor_1w_df['Approximate Revenue'] = list_arevenue
infor_1w_df['Rating Average'] = list_rating_average
infor_1w_df['Shopee Mall'] = list_n_shop_mall
infor_1w_df['SM Rating Average'] = list_sm_ra
infor_1w_df['SM Approximate Revenue'] = list_sm_arevenue
infor_1w_df['SM Sold'] = list_sm_sold
infor_1w_df['Normal Shop'] = list_n_normal_shop
infor_1w_df['NS Rating Average'] = list_ns_ra
infor_1w_df['NS Approximate Revenue'] = list_ns_arevenue
infor_1w_df['NS Sold'] = list_ns_sold
infor_1w_df['Count'] = infor_1w_df['Shopee Mall'] + infor_1w_df['Normal Shop']

In [31]:
infor_1w_df = infor_1w_df[infor_1w_df['Count'] != 0]
len(infor_1w_df)

900

In [32]:
stopwords_1w = ["áo","quần","hàng","trang","thời","cao","chất","có","cho","sơ", "mi", "sẵn", "dài", "cấp", "cách", \
                "phong","dáng", "khoác", "quốc", "hàn", "thể", "thao", "short", "co", "hình", "trung","ng","ch","th","hà",\
                     "ca", "to", "anh", "or", "cá", "ong", "bo", "in", "ro", "ngắn","li","ba", "24","ông","su","20", "10",\
                         "hong", "kho", "ma", "02", "các", "ve", "run", "hiệu", "hương", "thương", "độ", "động", "ẩn", "mềm",\
                        "gu", "hán", "kháng", "lót", "thoải", "mái","khuẩn", "phố", "phối", "ống", "đô","01","gió", "khoá", "liệu",\
                            "me", "tiết"] 

In [33]:
infor_1w_df = infor_1w_df.drop(infor_1w_df[infor_1w_df['Word'].isin(stopwords_1w)].index)
infor_1w_df.reset_index(drop=True, inplace=True)

In [34]:
infor_1w_df.sort_values(by='Count',ascending=False)[:20]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
0,nam,5936,214364,36173710000.0,2.94218,2639,4.486537,35778760000.0,212723,3297,1.70604,394954087.0,1641
1,tay,2551,51965,9305763000.0,2.827932,1291,4.418483,9198045000.0,51572,1260,1.198249,107718529.0,393
31,thu,1842,78120,14255780000.0,2.608748,744,4.624849,14176700000.0,77605,1098,1.242647,79076934.0,515
2,màu,1797,52031,8831446000.0,3.146235,976,4.249898,8758880000.0,51677,821,1.834206,72565764.0,354
3,thun,1422,71352,12167360000.0,3.199549,709,4.634148,12153020000.0,71162,713,1.772999,14338344.0,190
4,cổ,1374,52693,6104029000.0,3.035987,687,4.584183,5992547000.0,52178,687,1.487792,111481998.0,515
5,vải,1316,58831,8971833000.0,3.58661,686,4.494914,8878598000.0,58353,630,2.597568,93234760.0,478
12,fit,1147,8799,2862538000.0,3.889511,995,4.287464,2853248000.0,8766,152,1.284488,9289509.0,33
6,owen,970,3441,1806392000.0,4.14201,970,4.14201,1806392000.0,3441,0,,0.0,0
7,cotton,933,45184,8910313000.0,3.37055,598,4.618639,8902878000.0,45139,335,1.142618,7435409.0,45


In [35]:
# infor_1w_df.to_csv('Infor_1w.csv',index=False,header=True,encoding="utf-8-sig")

In [36]:
# documents = list(top_50_items_SM_df['name'])
documents = list(analyzed_df['name']) 
stopwords = None
Word, Count = tokenize_k_word(documents, 2, stopwords)
infor_2w_df = pd.DataFrame({'Word': Word, 'Count': Count})
infor_best_2w_df = infor_2w_df.sort_values(by='Count',ascending=False)[:1000]
infor_best_2w_df.reset_index(drop=True, inplace=True)
infor_best_2w_df

Unnamed: 0,Word,Count
0,thời trang,2149
1,sơ mi,1560
2,áo sơ,1520
3,cao cấp,1427
4,có sẵn,1186
...,...,...
995,coolmate cw,18
996,hở vai,18
997,nữ đen,18
998,mềm co,18


In [37]:
top_shopee_mall_id = list(map(str, top_shopee_mall_id))
normal_shop_id = list(map(str, normal_shop_id))
list_total_sold = []
list_rating_average = []
list_arevenue = []
list_n_shop_mall = []
list_sm_ra = []
list_sm_sold = []
list_sm_arevenue = []
list_n_normal_shop = []
list_ns_ra = []
list_ns_sold = []
list_ns_arevenue = []
for keyword in infor_best_2w_df['Word']: 
    # Kiểm tra xem các giá trị trong cột 'Column' có chứa chuỗi cụ thể hay không
    contains_search_string = analyzed_df['name'].str.lower().str.contains(keyword)
    total_sold = analyzed_df[contains_search_string]['sold'].sum()
    list_total_sold.append(total_sold)
    rating_average = analyzed_df[contains_search_string]['rating_star'].mean()
    list_rating_average.append(rating_average)
    arevenue = analyzed_df[contains_search_string]['approximate_revenue'].sum()
    list_arevenue.append(arevenue)
    # SM
    n_shop_mall = analyzed_df[contains_search_string]['shopid'].isin(top_shopee_mall_id).sum()
    list_n_shop_mall.append(n_shop_mall)
    sm_ra = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['rating_star'].mean()
    list_sm_ra.append(sm_ra)
    sm_sold = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['sold'].sum()
    list_sm_sold.append(sm_sold)
    sm_arevenue = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(top_shopee_mall_id)]['approximate_revenue'].sum()
    list_sm_arevenue.append(sm_arevenue)
    # NS
    n_normal_shop = analyzed_df[contains_search_string]['shopid'].isin(normal_shop_id).sum()
    list_n_normal_shop.append(n_normal_shop)
    ns_ra = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['rating_star'].mean()
    list_ns_ra.append(ns_ra)
    ns_sold = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['sold'].sum()
    list_ns_sold.append(ns_sold)
    ns_arevenue = analyzed_df[contains_search_string & analyzed_df['shopid'].isin(normal_shop_id)]['approximate_revenue'].sum()
    list_ns_arevenue.append(ns_arevenue)
    

In [38]:
# infor_popular_best_2w_df = infor_best_2w_df.copy()
# infor_popular_best_2w_df['Sold'] = list_sold
# infor_popular_best_2w_df['Rating_Average'] = list_rating_average
# infor_popular_best_2w_df['Shopee_Mall'] = list_shop_mall
# infor_popular_best_2w_df['Normal_Shop'] = list_normal_shop
# infor_popular_best_2w_df.sort_values(by='Sold',ascending=False)
infor_2w_df = infor_best_2w_df.copy()
infor_2w_df['Total Sold'] = list_total_sold
infor_2w_df['Approximate Revenue'] = list_arevenue
infor_2w_df['Rating Average'] = list_rating_average
infor_2w_df['Shopee Mall'] = list_n_shop_mall
infor_2w_df['SM Rating Average'] = list_sm_ra
infor_2w_df['SM Approximate Revenue'] = list_sm_arevenue
infor_2w_df['SM Sold'] = list_sm_sold
infor_2w_df['Normal Shop'] = list_n_normal_shop
infor_2w_df['NS Rating Average'] = list_ns_ra
infor_2w_df['NS Approximate Revenue'] = list_ns_arevenue
infor_2w_df['NS Sold'] = list_ns_sold
infor_2w_df['Count'] = infor_2w_df['Shopee Mall'] + infor_2w_df['Normal Shop']




In [39]:
infor_2w_df = infor_2w_df[infor_2w_df['Count'] != 0]
len(infor_2w_df)

963

In [40]:
stopwords_2w = ["áo sơ", 'hàng có', 'mi nam', 'trang mùa','ảnh 100','cấp chuẩn','owen áo','qc cao','dành cho','nam tay','mi dài','sẵn quần','nam owen',\
                'thun nam', 'polo nam', 'cách hàn','tay nam','nam cổ','thao nam','trong vòng','vòng 24','khoác nam','đùi nam','hè cho','cổ bẻ','hàng sẵn',\
                'mi ngắn','xanh mã','trang xuân','short nam','lót nam','trang hàn',"nam thể"]


In [41]:
infor_2w_df = infor_2w_df.drop(infor_2w_df[infor_2w_df['Word'].isin(stopwords_2w)].index)
infor_2w_df.reset_index(drop=True, inplace=True)
infor_2w_df[:20]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
0,thời trang,2121,32098,4290857000.0,1.511215,523,4.599648,4230485000.0,31823,1598,0.50042,60371632.0,275
1,sơ mi,1549,6440,2270745000.0,3.089836,768,4.369255,2199263000.0,6211,781,1.831713,71482194.0,229
2,cao cấp,1416,35579,6607730000.0,2.832964,500,4.518186,6410606000.0,34815,916,1.913083,197124724.0,764
3,có sẵn,761,89,16804000.0,0.24344,0,,0.0,0,761,0.24344,16803995.0,89
4,phong cách,1094,9081,1444885000.0,1.334832,252,4.394619,1423875000.0,8980,842,0.419077,21010391.0,101
5,áo thun,1080,55001,10409890000.0,2.830609,588,4.609981,10401260000.0,54970,492,0.704042,8624463.0,31
6,cho nam,1085,492,76994480.0,0.569095,10,4.889855,34555630.0,244,1075,0.528902,42438853.0,248
7,áo khoác,805,13071,3569927000.0,2.311608,268,4.191883,3438588000.0,12589,537,1.373221,131338160.0,482
8,thể thao,781,71693,10637200000.0,3.21889,427,4.730099,10609090000.0,71566,354,1.396048,28111396.0,127
9,hàn quốc,788,12067,1590278000.0,2.463674,317,4.547781,1563235000.0,11961,471,1.060995,27043097.0,106


In [42]:
infor_2w_df.sort_values(by='Total Sold',ascending=False)[:20]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
8,thể thao,781,71693,10637200000.0,3.21889,427,4.730099,10609090000.0,71566,354,1.396048,28111396.0,127
5,áo thun,1080,55001,10409890000.0,2.830609,588,4.609981,10401260000.0,54970,492,0.704042,8624463.0,31
19,thương hiệu,540,47412,9494860000.0,4.117166,380,4.683213,9490071000.0,47328,160,2.772804,4788400.0,84
33,hiệu coolmate,322,43113,9031805000.0,4.655202,322,4.655202,9031805000.0,43113,0,,0.0,0
11,co giãn,701,42800,6280394000.0,4.008098,423,4.583273,6265684000.0,42601,278,3.132922,14710043.0,199
2,cao cấp,1416,35579,6607730000.0,2.832964,500,4.518186,6410606000.0,34815,916,1.913083,197124724.0,764
38,thoải mái,281,33451,5007692000.0,4.309984,230,4.580698,5005186000.0,33423,51,3.089119,2505894.0,28
0,thời trang,2121,32098,4290857000.0,1.511215,523,4.599648,4230485000.0,31823,1598,0.50042,60371632.0,275
16,áo polo,548,31715,6513495000.0,3.904995,442,4.557154,6512171000.0,31709,106,1.185613,1324616.0,6
24,trẻ trung,450,29774,5744629000.0,3.522422,315,4.453061,5737637000.0,29755,135,1.350931,6991571.0,19


In [43]:
infor_2w_df.sort_values(by='Count',ascending=False)[:20]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
0,thời trang,2121,32098,4290857000.0,1.511215,523,4.599648,4230485000.0,31823,1598,0.50042,60371632.0,275
1,sơ mi,1549,6440,2270745000.0,3.089836,768,4.369255,2199263000.0,6211,781,1.831713,71482194.0,229
2,cao cấp,1416,35579,6607730000.0,2.832964,500,4.518186,6410606000.0,34815,916,1.913083,197124724.0,764
4,phong cách,1094,9081,1444885000.0,1.334832,252,4.394619,1423875000.0,8980,842,0.419077,21010391.0,101
6,cho nam,1085,492,76994480.0,0.569095,10,4.889855,34555630.0,244,1075,0.528902,42438853.0,248
5,áo thun,1080,55001,10409890000.0,2.830609,588,4.609981,10401260000.0,54970,492,0.704042,8624463.0,31
7,áo khoác,805,13071,3569927000.0,2.311608,268,4.191883,3438588000.0,12589,537,1.373221,131338160.0,482
9,hàn quốc,788,12067,1590278000.0,2.463674,317,4.547781,1563235000.0,11961,471,1.060995,27043097.0,106
8,thể thao,781,71693,10637200000.0,3.21889,427,4.730099,10609090000.0,71566,354,1.396048,28111396.0,127
3,có sẵn,761,89,16804000.0,0.24344,0,,0.0,0,761,0.24344,16803995.0,89


In [44]:
infor_2w_df.to_csv('Infor_2w.csv',index=False,header=True,encoding="utf-8-sig")

In [45]:
infor_2w_df[:20]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
0,thời trang,2121,32098,4290857000.0,1.511215,523,4.599648,4230485000.0,31823,1598,0.50042,60371632.0,275
1,sơ mi,1549,6440,2270745000.0,3.089836,768,4.369255,2199263000.0,6211,781,1.831713,71482194.0,229
2,cao cấp,1416,35579,6607730000.0,2.832964,500,4.518186,6410606000.0,34815,916,1.913083,197124724.0,764
3,có sẵn,761,89,16804000.0,0.24344,0,,0.0,0,761,0.24344,16803995.0,89
4,phong cách,1094,9081,1444885000.0,1.334832,252,4.394619,1423875000.0,8980,842,0.419077,21010391.0,101
5,áo thun,1080,55001,10409890000.0,2.830609,588,4.609981,10401260000.0,54970,492,0.704042,8624463.0,31
6,cho nam,1085,492,76994480.0,0.569095,10,4.889855,34555630.0,244,1075,0.528902,42438853.0,248
7,áo khoác,805,13071,3569927000.0,2.311608,268,4.191883,3438588000.0,12589,537,1.373221,131338160.0,482
8,thể thao,781,71693,10637200000.0,3.21889,427,4.730099,10609090000.0,71566,354,1.396048,28111396.0,127
9,hàn quốc,788,12067,1590278000.0,2.463674,317,4.547781,1563235000.0,11961,471,1.060995,27043097.0,106


In [46]:
# infor_2w_df.to_csv('Infor_2w.csv',index=False,header=True,encoding="utf-8-sig")

In [47]:
infor_2w_df.sort_values(by='Total Sold',ascending=False)[:50]

Unnamed: 0,Word,Count,Total Sold,Approximate Revenue,Rating Average,Shopee Mall,SM Rating Average,SM Approximate Revenue,SM Sold,Normal Shop,NS Rating Average,NS Approximate Revenue,NS Sold
8,thể thao,781,71693,10637200000.0,3.21889,427,4.730099,10609090000.0,71566,354,1.396048,28111396.0,127
5,áo thun,1080,55001,10409890000.0,2.830609,588,4.609981,10401260000.0,54970,492,0.704042,8624463.0,31
19,thương hiệu,540,47412,9494860000.0,4.117166,380,4.683213,9490071000.0,47328,160,2.772804,4788400.0,84
33,hiệu coolmate,322,43113,9031805000.0,4.655202,322,4.655202,9031805000.0,43113,0,,0.0,0
11,co giãn,701,42800,6280394000.0,4.008098,423,4.583273,6265684000.0,42601,278,3.132922,14710043.0,199
2,cao cấp,1416,35579,6607730000.0,2.832964,500,4.518186,6410606000.0,34815,916,1.913083,197124724.0,764
38,thoải mái,281,33451,5007692000.0,4.309984,230,4.580698,5005186000.0,33423,51,3.089119,2505894.0,28
0,thời trang,2121,32098,4290857000.0,1.511215,523,4.599648,4230485000.0,31823,1598,0.50042,60371632.0,275
16,áo polo,548,31715,6513495000.0,3.904995,442,4.557154,6512171000.0,31709,106,1.185613,1324616.0,6
24,trẻ trung,450,29774,5744629000.0,3.522422,315,4.453061,5737637000.0,29755,135,1.350931,6991571.0,19


In [48]:
shop_infor_df = pd.read_csv("Malls_Information.csv",encoding="utf-8-sig")
shop_infor_df = shop_infor_df[['shopid','name']]
shop_infor_df

Unnamed: 0,shopid,name
0,40342563,Aviano Menswear
1,225909574,POLOMANOR
2,68613764,TSLA Store Quần áo legging nam
3,24710134,Coolmate - Official Store
4,127217331,5S FASHION OFFICIAL
5,60297616,ROUGH
6,111639450,Pattern
7,317477677,Levents .vn
8,201774917,Guzado Official
9,59596762,Thời Trang MANDO


In [49]:
len(shop_infor_df)

23

In [50]:
# infor_2w_df.sort_values(by='Total Sold',ascending=False)[:50]
# # Nhớ remove 1 vài dòng bị trùng ý nghĩa hoặc không cần thiết trong top 50:
# # Thương hiệu
top_20_infor_1w_df = infor_1w_df.sort_values(by='Count',ascending=False)[:20]

In [51]:
top_1word = list(top_20_infor_1w_df['Word'])

In [52]:
list_shop_name = []
list_item_name = []
list_keyword = []
list_sold = []
list_rating = []
list_arevenue = []

In [53]:
for keyword in top_1word:
    for i in range(0,len(shop_infor_df)):
        temp_df = analyzed_df[analyzed_df['shopid'] == str(shop_infor_df['shopid'][i])]
        contains_search_string = temp_df['name'].str.lower().str.contains(keyword)
        
        for item_name in temp_df[contains_search_string]['name']:
            list_shop_name.append(shop_infor_df['name'][i])
            list_item_name.append(item_name)
            list_keyword.append(keyword)
            list_sold.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0])
            list_rating.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['rating_star'].values[0])
            arevenue = temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0] * \
            temp_df[contains_search_string & (temp_df['name'] == item_name)]['price'].values[0]
            list_arevenue.append(arevenue)

In [54]:
SM_1w_df = pd.DataFrame({'shop_name': list_shop_name, 'item_name':list_item_name, 'keyword': list_keyword,\
                               'sold': list_sold, 'rating': list_rating, 'approximate revenue': list_arevenue})

In [55]:
SM_1w_df

Unnamed: 0,shop_name,item_name,keyword,sold,rating,approximate revenue
0,Aviano Menswear,Bộ Thể Thao Nam 4 Màu Trẻ Trung Nặng Động VICERO,nam,62,4.906764,1.047800e+07
1,Aviano Menswear,Áo Khoác Nam Chất Liệu Cao Cấp Kiểu Dáng Trẻ T...,nam,5,4.848684,6.450000e+05
2,Aviano Menswear,Áo Khoác Thể Thao Nam 3 Màu Trẻ Trung Cao Cấp ...,nam,84,4.839286,1.083600e+07
3,Aviano Menswear,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",nam,3912,4.847720,7.628400e+08
4,Aviano Menswear,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,nam,6595,4.846812,1.220075e+09
...,...,...,...,...,...,...
13136,HARAS VietNam - Official Store,Balo nam nữ thời trang Hàn quốc chống thấm nướ...,ấm,0,0.000000,0.000000e+00
13137,AREMI,Áo thun polo có cổ tay dài nam AREMI chất liệu...,ấm,0,0.000000,0.000000e+00
13138,AREMI,Áo thun nam tay raglan AREMI t-shirt trơn cổ t...,ấm,9,5.000000,1.341000e+06
13139,AREMI,Áo thun nam cổ tròn tay ngắn AREMI vải cotton ...,ấm,27,4.910979,4.833000e+06


In [56]:
# SM_1w_df.to_csv('SM_1w.csv',index=False,header=True,encoding="utf-8-sig")

In [57]:
# infor_2w_df.sort_values(by='Total Sold',ascending=False)[:50]
# # Nhớ remove 1 vài dòng bị trùng ý nghĩa hoặc không cần thiết trong top 50:
# # Thương hiệu
top_20_infor_2w_df = infor_2w_df.sort_values(by='Count',ascending=False)[:20]

In [58]:
top_2word = list(top_20_infor_2w_df['Word'])

In [59]:
list_shop_name = []
list_item_name = []
list_keyword = []
list_sold = []
list_rating = []
list_arevenue = []

In [60]:
for keyword in top_2word:
    for i in range(0,len(shop_infor_df)):
        temp_df = analyzed_df[analyzed_df['shopid'] == str(shop_infor_df['shopid'][i])]
        contains_search_string = temp_df['name'].str.lower().str.contains(keyword)
        
        for item_name in temp_df[contains_search_string]['name']:
            list_shop_name.append(shop_infor_df['name'][i])
            list_item_name.append(item_name)
            list_keyword.append(keyword)
            list_sold.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0])
            list_rating.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['rating_star'].values[0])
            arevenue = temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0] * \
            temp_df[contains_search_string & (temp_df['name'] == item_name)]['price'].values[0]
            list_arevenue.append(arevenue)

In [61]:
SM_2w_df = pd.DataFrame({'shop_name': list_shop_name, 'item_name':list_item_name, 'keyword': list_keyword,\
                               'sold': list_sold, 'rating': list_rating, 'approximate revenue': list_arevenue})

In [62]:
SM_2w_df = SM_2w_df.drop(SM_2w_df[SM_2w_df['shop_name'] == 'HARAS VietNam - Official Store'].index)
SM_2w_df.reset_index(drop=True, inplace=True)

In [63]:
# SM_2w_df.to_csv('SM_2w.csv',index=False,header=True,encoding="utf-8-sig")

In [64]:
list_sm_name = []
list_sm_total_sold = []
list_sm_rating = []
list_sm_arevenue = []
for i in range(0,len(shop_infor_df)):
    temp_df = analyzed_df[analyzed_df['shopid'] == str(shop_infor_df['shopid'][i])]
    sm_name = shop_infor_df['name'][i]
    list_sm_name.append(sm_name)
    total_sold = temp_df.groupby(by=['shopid'])['sold'].sum().values[0]
    list_sm_total_sold.append(total_sold)
    rating_average = temp_df.groupby(by = ['shopid'])['rating_star'].mean().values[0]
    list_sm_rating.append(rating_average)
    arevenue = temp_df.groupby(by = ['shopid'])['approximate_revenue'].sum().values[0]
    list_sm_arevenue.append(arevenue)
    


In [65]:
SM_Infor_df = pd.DataFrame({'SM_Name': list_sm_name, 'SM_Total_Sold': list_sm_total_sold, \
                            'SM_Rating_Average': list_sm_rating, 'SM_Approximate_Revenue': list_sm_arevenue})
SM_Infor_df

Unnamed: 0,SM_Name,SM_Total_Sold,SM_Rating_Average,SM_Approximate_Revenue
0,Aviano Menswear,32564,4.868854,5567569000.0
1,POLOMANOR,15959,4.843272,4110210000.0
2,TSLA Store Quần áo legging nam,2361,4.670614,367954800.0
3,Coolmate - Official Store,72927,4.707948,13560440000.0
4,5S FASHION OFFICIAL,14083,4.789382,2848612000.0
5,ROUGH,9333,4.558269,1196077000.0
6,Pattern,5745,4.780964,1010936000.0
7,Levents .vn,22354,4.607361,8519942000.0
8,Guzado Official,36727,4.770072,3922139000.0
9,Thời Trang MANDO,1564,4.76173,301989000.0


In [66]:
# SM_Infor_df.to_csv('SM_Infor.csv',index=False,header=True,encoding="utf-8-sig")

In [67]:
def tokenize_range_word(documents, i, j, stopwords):
    preprocessed_documents = [doc.lower() for doc in documents]
    vectorizer = CountVectorizer(ngram_range= (i,j), stop_words= stopwords)
    word_counts = vectorizer.fit_transform(preprocessed_documents)
    words = vectorizer.get_feature_names_out()
    word_occurrences = word_counts.toarray().sum(axis=0)
    word_count_dict = dict(zip(words, word_occurrences))
    Word = []
    Count = []
    for word, count in word_count_dict.items():
        Word.append(word)
        Count.append(count)
    return Word, Count

In [68]:
SM_Unique_Word_df = pd.DataFrame({"Shop_Name": [],'Word': [],'Count': []})
SM_Unique_Word_df

Unnamed: 0,Shop_Name,Word,Count


In [69]:
for i in range(0,len(shop_infor_df)):
        temp_df = analyzed_df[analyzed_df['shopid'] == str(shop_infor_df['shopid'][i])]
        documents = list(temp_df['name']) 
        stopwords = None
        Word, Count = tokenize_range_word(documents, 1,2, stopwords)
        Shop_Name = len(Word)* [shop_infor_df['name'][i]]
        infor_1_2w_df = pd.DataFrame({'Shop_Name': Shop_Name ,'Word': Word, 'Count': Count})
        infor_best_1_2w_df = infor_1_2w_df.sort_values(by='Count',ascending=False)[:50]
        infor_best_1_2w_df.reset_index(drop=True, inplace=True)
        SM_Unique_Word_df = pd.concat([SM_Unique_Word_df, infor_best_1_2w_df])

In [70]:
stopwords_unique = ['trang','thời','chất','thao','thể','nam aviano','cấp','cao','thiết','đồ','thao nam','có','bộ quần','hôi',\
                    'cá', 'sấu cotton','tính thanh', 'cmc nam','nam cổ', 'interlock xuất','xịn nam', 'nam cổ','nam phối',\
                   'phối','bẻ','bóng', 'chạy', 'giữ', 'rổ','lót', 'nhiệt','lông', 'lót lông','khuẩn','đá','nam tsla','khí',\
                    'co','giãn','nước','mái','chống','lót nam','chất','liệu','lịch','thanh','dễ','kế basic','short',\
                    'sơ', 'áo sơ', 'tăm', 'màu pattern', 'pattern the', 'khoác','da lộn','levents basic', 'polo levents', 'vớ levents',\
                    'levents my','động', 'cực', 'vận', 'kháng','mi', 'mando chất', 'unisex thời', 'mi nam','gym vải',\
                    'chuẩn', 'lạnh thoáng','tròn', 'giãn chuẩn','sấu','đùi','mi', 'mi ngắn', 'ghi','tay body','owen dáng',\
                        'tay body', 'owen dáng', 'tay regular','tây', 'polo ngắn','mịn', 'trung','sợi', 'đãi', 'trung feaer',\
                            'sản', 'nam có','ưu','đãi áo', 'mại', 'hàng','dặn','thỏ','cách','đứng', 'khoác nam','liểu', 'nắng',\
                                'siêu','sinh','lớp phối','đeo','chéo', 'trọng', 'sở','kim', 'đựng','bò dáng','cấp thời', 'hình thời',\
                                    'nam thời', 'everest nhiều','nộ', 'giản', 'nằm','chống nhãn','quốc', 'công', 'xù không', 'loại',\
                            'trang haras', 'và túi', 'du', 'trang hàn', 'xách', 'lịch thời', 'cấp haras', 'ba', 'lô','nữ màu', 'nữ đen',\
                                'tay lỡ', 'saigonese quần','saigonese cotton', 'đùi nam','nam tay','chiều', 'trụ', 'cotton co', 'liệu cá']

In [71]:
SM_Unique_Word_df.reset_index(drop=True, inplace=True)
SM_Unique_Word_df = SM_Unique_Word_df.drop(SM_Unique_Word_df[SM_Unique_Word_df['Word'].isin(stopwords_unique)].index)
SM_Unique_Word_df.reset_index(drop=True, inplace=True)
SM_Unique_Word_df

Unnamed: 0,Shop_Name,Word,Count
0,Aviano Menswear,nam,225.0
1,Aviano Menswear,áo,141.0
2,Aviano Menswear,aviano,118.0
3,Aviano Menswear,quần,95.0
4,Aviano Menswear,thời trang,90.0
...,...,...,...
813,AREMI,ngắn cổ,20.0
814,AREMI,trẻ,19.0
815,AREMI,trẻ trung,19.0
816,AREMI,có cổ,18.0


In [72]:
list_shop_name = []
list_item_name = []
list_keyword = []
list_sold = []
list_rating = []
list_arevenue = []        

In [73]:
for i in range(0,len(shop_infor_df)):
    temp_df = analyzed_df[analyzed_df['shopid'] == str(shop_infor_df['shopid'][i])]
    for keyword in SM_Unique_Word_df[SM_Unique_Word_df['Shop_Name'] == shop_infor_df['name'][i]]['Word']:
        contains_search_string = temp_df['name'].str.lower().str.contains(keyword)
        
        for item_name in temp_df[contains_search_string]['name']:
            list_shop_name.append(shop_infor_df['name'][i])
            list_item_name.append(item_name)
            list_keyword.append(keyword)
            list_sold.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0])
            list_rating.append(temp_df[contains_search_string & (temp_df['name'] == item_name)]['rating_star'].values[0])
            arevenue = temp_df[contains_search_string & (temp_df['name'] == item_name)]['sold'].values[0] * \
            temp_df[contains_search_string & (temp_df['name'] == item_name)]['price'].values[0]
            list_arevenue.append(arevenue)

In [74]:
All_SM_Unique_Word_df = pd.DataFrame({'shop_name': list_shop_name, 'item_name':list_item_name, 'keyword': list_keyword,\
                               'sold': list_sold, 'rating': list_rating, 'approximate revenue': list_arevenue})

In [75]:
All_SM_Unique_Word_df

Unnamed: 0,shop_name,item_name,keyword,sold,rating,approximate revenue
0,Aviano Menswear,Bộ Thể Thao Nam 4 Màu Trẻ Trung Nặng Động VICERO,nam,62,4.906764,1.047800e+07
1,Aviano Menswear,Áo Khoác Nam Chất Liệu Cao Cấp Kiểu Dáng Trẻ T...,nam,5,4.848684,6.450000e+05
2,Aviano Menswear,Áo Khoác Thể Thao Nam 3 Màu Trẻ Trung Cao Cấp ...,nam,84,4.839286,1.083600e+07
3,Aviano Menswear,"Bộ Đồ Nam AVIANO 4 Màu Dài Tay, Bộ Thể Thao Na...",nam,3912,4.847720,7.628400e+08
4,Aviano Menswear,Bộ Thể Thao Nam Chất Nỉ 4 Màu Trẻ Trung Năng Đ...,nam,6595,4.846812,1.220075e+09
...,...,...,...,...,...,...
46615,AREMI,"Áo thun nam cổ tròn, T-shirt nam đen in AREMI,...",thun nam,8,4.939759,1.432000e+06
46616,AREMI,Áo polo nam thun nam có cổ AREMI chất liệu cá ...,thun nam,37,4.870968,7.363000e+06
46617,AREMI,Áo thun nam tay ngắn cổ tròn AREMI chất liệu c...,thun nam,28,4.636364,5.012000e+06
46618,AREMI,Áo polo nam thun nam tay ngắn có cổ AREMI chất...,thun nam,16,5.000000,2.864000e+06


In [76]:
# All_SM_Unique_Word_df.to_csv('SM_Unique_Word.csv',index=False,header=True,encoding="utf-8-sig")