## smart platform testing

## lib

In [31]:
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

### data preprocessing

In [32]:
def get_csv_files(folder_path):
    csv_files=[]
    for files in os.listdir(folder_path):
        if files.endswith('.csv'):
            csv_files.append(os.path.join(folder_path,files))
    return csv_files

csv_files= get_csv_files('../amazon-products-dataset/versions/2')

In [33]:
# store the data in dat frame
dataframes= []
for path in csv_files:
    df = pd.read_csv(path)
    dataframes.append(df)
    
combined_df= pd.concat(dataframes,ignore_index=True)


### basic info

In [34]:
combined_df.head()

Unnamed: 0.1,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,Unnamed: 0
0,Puma Women's Glam WNS Sport Sandal,women's shoes,Fashion Sandals,https://m.media-amazon.com/images/I/51cVFnTFCS...,https://www.amazon.in/Puma-Womens-Glam-Lotus-S...,4.5,2,"₹1,829","₹2,999",
1,Luxyfeel Fashion Sandal for Girls and Women | ...,women's shoes,Fashion Sandals,https://m.media-amazon.com/images/I/51IBSgyvAK...,https://www.amazon.in/Luxyfeel-Fashion-Sandal-...,3.8,44,₹549,"₹1,499",
2,STRASSE PARIS Amazing Design Women's & Girls T...,women's shoes,Fashion Sandals,https://m.media-amazon.com/images/I/616TZINkHa...,https://www.amazon.in/STRASSE-PARIS-Transparen...,4.5,2,₹636,"₹1,899",
3,Mochi Women White Wedge Heel Slip-on UK/5 EU/3...,women's shoes,Fashion Sandals,https://m.media-amazon.com/images/I/51nDP-iEI6...,https://www.amazon.in/Mochi-Women-White-Wedge-...,5.0,1,"₹1,194","₹1,990",
4,Catwalk Women's Fashion UK EU US 3882,women's shoes,Fashion Sandals,https://m.media-amazon.com/images/I/712tmgiNOn...,https://www.amazon.in/Catwalk-Womens-Fashion-S...,3.7,114,₹899,"₹2,195",


In [35]:
combined_df = combined_df.drop(combined_df.columns[-1], axis=1)
combined_df.drop(['name','image','link'],axis=1,inplace=True)

In [36]:
combined_df.shape

(1103170, 6)

In [37]:
combined_df=combined_df.dropna()

In [38]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 681360 entries, 0 to 1103169
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   main_category   681360 non-null  object
 1   sub_category    681360 non-null  object
 2   ratings         681360 non-null  object
 3   no_of_ratings   681360 non-null  object
 4   discount_price  681360 non-null  object
 5   actual_price    681360 non-null  object
dtypes: object(6)
memory usage: 36.4+ MB


In [39]:
combined_df.columns = (combined_df.columns.str.strip().str.lower().str.replace(" ","_"))

In [40]:
combined_df= combined_df[
    pd.to_numeric(combined_df['ratings'],errors= 'coerce').between(0,5)
]
combined_df['ratings'] = combined_df['ratings'].astype(float)

In [41]:
combined_df['no_of_ratings']= (
    combined_df['no_of_ratings'].astype(str)
    .str.replace(',','')
    .where(lambda x:x.str.fullmatch(r'\d+'))
    .astype(float)
)

In [42]:
price_pattern = r'^₹\d{1,3}(,\d{3})*$'
cols = ['discount_price','actual_price']

masked = (
    combined_df[cols]
    .astype(str)
    .apply(lambda x:x.str.match(price_pattern))
    .all(axis=1)
)

combined_df = combined_df[masked]

combined_df[cols] = (
    combined_df[cols]
    .replace({'₹':'',',':''},regex=True)
    .astype(float)
)

In [43]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 650204 entries, 0 to 1102940
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   main_category   650204 non-null  object 
 1   sub_category    650204 non-null  object 
 2   ratings         650204 non-null  float64
 3   no_of_ratings   650108 non-null  float64
 4   discount_price  650204 non-null  float64
 5   actual_price    650204 non-null  float64
dtypes: float64(4), object(2)
memory usage: 34.7+ MB


In [44]:
combined_df['main_category'].unique()

array(["women's shoes", 'car & motorbike', 'appliances',
       'grocery & gourmet foods', "kids' fashion", 'bags & luggage',
       'toys & baby products', "women's clothing", 'tv, audio & cameras',
       "men's shoes", 'sports & fitness', 'beauty & health',
       'accessories', "men's clothing", 'home & kitchen', 'music',
       'pet supplies', 'stores', 'industrial supplies',
       'home, kitchen, pets'], dtype=object)

In [45]:
combined_df['sub_category'].unique()

array(['Fashion Sandals', 'All Car & Motorbike Products',
       'Washing Machines', 'Refrigerators', 'All Grocery & Gourmet Foods',
       "Kids' Watches", 'Suitcases & Trolley Bags',
       'International Toy Store', 'Clothing', 'Camera Accessories',
       'Formal Shoes', 'Travel Accessories', 'Yoga', 'Value Bazaar',
       'Make-up', 'Personal Care Appliances', 'Coffee, Tea & Beverages',
       'Running', 'Diet & Nutrition', 'Car Electronics',
       'Gold & Diamond Jewellery', 'Beauty & Grooming',
       'Baby Bath, Skin & Grooming', 'Innerwear',
       'Health & Personal Care', 'Ballerinas', "Kids' Shoes",
       'Home Storage', 'Home Audio & Theater', 'Watches',
       'Home Entertainment Systems', 'Security Cameras', 'Travel Duffles',
       'Musical Instruments & Professional Audio', 'Kitchen & Dining',
       'Sewing & Craft Supplies', 'Strollers & Prams', 'Rucksacks',
       'All Pet Supplies', 'Toys & Games', 'Air Conditioners',
       'All Appliances', 'All Electronics', '

In [None]:
encoder = OneHotEncoder(
    sparse_output= False,
    handle_unknown = 'ignore'
)

encoded_array = encoder.fit_transform(combined_df[['main_category']])

encoded_df =  pd.DataFrame(
    encoded_array,
    columns = encoder.get_feature_names_out(['main_category']),
    index= combined_df.index
)
combined_df = combined_df.join(encoded_df)


Unnamed: 0,main_category,sub_category,ratings,no_of_ratings,discount_price,actual_price,main_category_accessories,main_category_appliances,main_category_bags & luggage,main_category_beauty & health,main_category_car & motorbike,main_category_grocery & gourmet foods,main_category_home & kitchen,"main_category_home, kitchen, pets",main_category_industrial supplies,main_category_kids' fashion,main_category_men's clothing,main_category_men's shoes,main_category_music,main_category_pet supplies,main_category_sports & fitness,main_category_stores,main_category_toys & baby products,"main_category_tv, audio & cameras",main_category_women's clothing,main_category_women's shoes
0,women's shoes,Fashion Sandals,4.5,2.0,1829.0,2999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,women's shoes,Fashion Sandals,3.8,44.0,549.0,1499.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,women's shoes,Fashion Sandals,4.5,2.0,636.0,1899.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,women's shoes,Fashion Sandals,5.0,1.0,1194.0,1990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,women's shoes,Fashion Sandals,3.7,114.0,899.0,2195.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102846,sports & fitness,Badminton,3.7,6.0,4698.0,6719.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1102850,sports & fitness,Badminton,4.8,23.0,7548.0,10789.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1102877,sports & fitness,Badminton,3.5,12.0,2448.0,3499.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1102880,sports & fitness,Badminton,4.2,8.0,2698.0,3859.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
combined_df.reset_index(inplace=True,drop= True)


In [48]:
from sklearn.model_selection import KFold
import numpy as np
combined_df['sub_category_te'] = np.nan
mean_rating = combined_df['ratings'].mean()

kf = KFold(n_splits = 5, shuffle= True, random_state =42)

for train_idx, val_idx in kf.split(combined_df):
    train_fold = combined_df.iloc[train_idx]
    val_fold = combined_df.iloc[val_idx]

    fold_mean = (
        train_fold.groupby('sub_category')['ratings'].mean()
    )

    combined_df.loc[val_idx,'sub_category_te'] = (
        combined_df.loc[val_idx,'sub_category']
        .map(fold_mean)
        .fillna(mean_rating)
    )

In [50]:
combined_df.drop(['main_category','sub_category'],axis=1,inplace=True)


In [51]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650204 entries, 0 to 650203
Data columns (total 25 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   ratings                                650204 non-null  float64
 1   no_of_ratings                          650108 non-null  float64
 2   discount_price                         650204 non-null  float64
 3   actual_price                           650204 non-null  float64
 4   main_category_accessories              650204 non-null  float64
 5   main_category_appliances               650204 non-null  float64
 6   main_category_bags & luggage           650204 non-null  float64
 7   main_category_beauty & health          650204 non-null  float64
 8   main_category_car & motorbike          650204 non-null  float64
 9   main_category_grocery & gourmet foods  650204 non-null  float64
 10  main_category_home & kitchen           650204 non-null  

In [52]:
combined_df.head()

Unnamed: 0,ratings,no_of_ratings,discount_price,actual_price,main_category_accessories,main_category_appliances,main_category_bags & luggage,main_category_beauty & health,main_category_car & motorbike,main_category_grocery & gourmet foods,main_category_home & kitchen,"main_category_home, kitchen, pets",main_category_industrial supplies,main_category_kids' fashion,main_category_men's clothing,main_category_men's shoes,main_category_music,main_category_pet supplies,main_category_sports & fitness,main_category_stores,main_category_toys & baby products,"main_category_tv, audio & cameras",main_category_women's clothing,main_category_women's shoes,sub_category_te
0,4.5,2.0,1829.0,2999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.548458
1,3.8,44.0,549.0,1499.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.565466
2,4.5,2.0,636.0,1899.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.577666
3,5.0,1.0,1194.0,1990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.587744
4,3.7,114.0,899.0,2195.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.565466
