In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

# Location of the parsed test CSV on GCS that this notebook splits in two.
test_data_path = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test.csv"
# Column used as the class label when stratifying the split.
label_column = "Product_ID"

In [2]:
# Read the test CSV from the configured path; the bare `df` on the last
# line renders the (truncated) frame as the cell output.
df = pd.read_csv(test_data_path)
df

Unnamed: 0.1,Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,304602,1004912,P00129442,F,51-55,20,A,0,1,5,8.0,
1,391884,1000302,P00106042,M,18-25,4,B,4+,0,5,,
2,231657,1005730,P00112142,M,18-25,0,A,3,0,1,2.0,14.0
3,143922,1004186,P00170342,M,26-35,7,B,1,1,1,15.0,
4,151292,1005376,P00051642,M,26-35,17,B,1,1,8,,
...,...,...,...,...,...,...,...,...,...,...,...,...
548563,259178,1003921,P00022542,M,26-35,14,C,2,1,5,,
548564,365838,1002252,P00319042,F,18-25,1,B,1,0,5,,
548565,131932,1002257,P00001742,F,18-25,4,B,2,0,8,14.0,
548566,121087,1000720,P00253042,M,18-25,0,B,0,0,8,,


In [3]:
# Drop products that occur in exactly one row: a stratified 50/50 split
# cannot place a single-row product in both halves. Those rows are split
# separately further down.

# Number of rows per product.
label_numerosities = df[label_column].value_counts()
# Product ids that appear more than once.
multiple_products = label_numerosities.loc[label_numerosities.gt(1)].index
# Rows belonging to those products, re-indexed from 0 so that positional
# indices returned by the splitter line up with `.iloc`.
clean_df = df.loc[df[label_column].isin(multiple_products)].reset_index(drop=True)

In [4]:
# Every column except the label is treated as a feature.
feature_cols = [col for col in clean_df.columns if col != label_column]
label_cols = [label_column]

x = clean_df[feature_cols].values
y = clean_df[label_cols].values

# A single stratified 50/50 shuffle split with a fixed seed. n_splits=1
# means the generator yields exactly one (eval, test) pair of positional
# index arrays, so it is consumed with next() instead of a loop.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
eval_index, test_index = next(splitter.split(x, y))

print((eval_index.shape, test_index.shape))
df_eval_clean = clean_df.iloc[eval_index]
df_test_clean = clean_df.iloc[test_index]

((274237,), (274238,))


### sanity checks

In [5]:
# Side-by-side per-product row counts in the two halves; each product
# should appear roughly the same number of times in both columns.
pd.DataFrame({
    split_name: split_df[label_column].value_counts()
    for split_name, split_df in (("eval", df_eval_clean), ("test", df_test_clean))
})

Unnamed: 0,eval,test
P00000142,577,577
P00000242,186,186
P00000342,122,122
P00000442,46,46
P00000542,76,76
...,...,...
P0099442,96,97
P0099642,8,8
P0099742,59,60
P0099842,45,46


In [6]:
# Histogram of (test count - eval count) per product: how many products
# differ by each amount between the two halves.
df_test_clean[label_column].value_counts().sub(
    df_eval_clean[label_column].value_counts()
).value_counts()

 0    1796
 1     849
-1     848
Name: Product_ID, dtype: int64

### handling products with a single instance

In [7]:
# Rows whose product was excluded from the stratified split (i.e. not in
# `multiple_products`), re-indexed from 0.
single_label_df = df.loc[~df[label_column].isin(multiple_products)].reset_index(drop=True)

# Cut the frame at its midpoint: the first half goes to test, the second
# half to eval. With an odd row count, eval receives the extra row.
single_label_midpoint = single_label_df.shape[0] // 2
single_label_eval_df = single_label_df.iloc[single_label_midpoint:]
single_label_test_df = single_label_df.iloc[:single_label_midpoint]

In [8]:
# Final splits: the stratified half followed by that split's share of the
# single-instance rows, stacked row-wise. Original index values are kept.
df_eval = pd.concat([df_eval_clean, single_label_eval_df], axis=0)
df_test = pd.concat([df_test_clean, single_label_test_df], axis=0)

### saving results

In [9]:
# Persist each split to its own CSV on GCS.
# Fix: testset.csv was previously written from `df_eval`, so both files held
# the eval split and the test split was never saved.
# NOTE(review): to_csv writes the index as an extra unnamed column by default;
# pass index=False if downstream readers do not expect that column.
df_eval.to_csv("gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/evalset.csv")
df_test.to_csv("gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/test/testset.csv")