# Imports

In [19]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math
from scipy import stats
from scipy.stats import shapiro
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import mannwhitneyu

In [4]:
# Read the item price table and the three sample tables from CSV files
# located in the current working directory.
item_prices, sample_a, sample_b, sample_c = map(
    pd.read_csv,
    ("item_prices.csv", "sample_a.csv", "sample_b.csv", "sample_c.csv"),
)

# 0 introduction

In [17]:
# Data-quality overview of the four loaded tables: fully duplicated rows,
# missing cells and shapes. Each section keeps its own reporting order so the
# printed text is unchanged.
_frames = {
    'item_prices': item_prices,
    'sample_a': sample_a,
    'sample_b': sample_b,
    'sample_c': sample_c,
}
_separator = '-' * 40

for _name in ('item_prices', 'sample_b', 'sample_a', 'sample_c'):
    print(f'Total duplicates {_name}: {_frames[_name].duplicated().sum().sum()}')
print(_separator)
for _name in ('item_prices', 'sample_b', 'sample_c', 'sample_a'):
    print(f'Total missing values {_name}: {_frames[_name].isnull().sum().sum()}')
print(_separator)
for _name in ('item_prices', 'sample_b', 'sample_c', 'sample_a'):
    print(f'Shape {_name}: {_frames[_name].shape}')

Total duplicates item_prices: 0


Total duplicates sample_b: 0
Total duplicates sample_a: 0
Total duplicates sample_c: 0
----------------------------------------
Total missing values item_prices: 0
Total missing values sample_b: 0
Total missing values sample_c: 0
Total missing values sample_a: 0
----------------------------------------
Shape item_prices: (1000, 2)
Shape sample_b: (1198438, 3)
Shape sample_c: (1205510, 3)
Shape sample_a: (1188912, 3)


## 0.1 item_prices

- item_id - идентификатор товара;
- item_price - цена товара.

In [23]:
# Print the column dtypes, non-null counts and memory usage of the price table.
item_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   item_id     1000 non-null   int64
 1   item_price  1000 non-null   int64
dtypes: int64(2)
memory usage: 15.8 KB


In [32]:
# Count the rows per item_id; a count above 1 means the item is listed more than once.
item_prices['item_id'].value_counts()

338     3
8581    3
5258    2
7204    2
8111    2
       ..
146     1
8048    1
4549    1
8094    1
3071    1
Name: item_id, Length: 955, dtype: int64

In [34]:
# Show every row recorded for item 8581, one of the ids that occurs more than once.
item_prices.loc[item_prices['item_id'].eq(8581)]

Unnamed: 0,item_id,item_price
29,8581,1067
541,8581,994
759,8581,1736


Видим, что один и тот же товар повторяется несколько раз, но с разной ценой.
Сгруппируем данные по товару и возьмём медиану его цен в качестве цены товара.

In [37]:
# Collapse repeated item_id rows into one row per item holding the median of
# the recorded values; item_id becomes the index of the resulting table.
item_prices = item_prices.groupby('item_id').agg('median')
item_prices

Unnamed: 0_level_0,item_price
item_id,Unnamed: 1_level_1
21,452.0
36,282.0
39,1746.0
50,871.0
64,1479.0
...,...
9936,379.0
9944,810.0
9958,265.0
9989,479.0


## 0.2 sample_a 

- user_id - идентификатор пользователя;
- item_id - идентификатор товара;
- action_id - идентификатор действия (0 — клик, 1 — просмотр и 2 — покупка).

In [40]:
# Count how many rows of sample_a carry each action_id value.
sample_a['action_id'].value_counts()

1    951130
0    190226
2     47556
Name: action_id, dtype: int64

In [41]:
def transform_data(df):
    """Replace ``action_id`` with three 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``action_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``action_id`` and with the int64 columns
        ``viewing`` (action_id == 1), ``click`` (action_id == 0) and
        ``purchase`` (action_id == 2) added after the remaining columns.
        The frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If ``df`` has no ``action_id`` column.
    """
    action = df['action_id']
    # Vectorized comparisons replace the per-row Python lambdas; the explicit
    # int64 cast keeps the dtype the row-wise version produced.
    return df.drop(columns='action_id').assign(
        viewing=action.eq(1).astype('int64'),
        click=action.eq(0).astype('int64'),
        purchase=action.eq(2).astype('int64'),
    )

In [42]:
# Replace action_id in sample_a with the indicator columns built by transform_data.
# The result no longer has action_id, so re-running this cell without
# reloading sample_a raises KeyError.
sample_a = transform_data(sample_a)
sample_a

Unnamed: 0,user_id,item_id,viewing,click,purchase
0,84636,360,1,0,0
1,21217,9635,1,0,0
2,13445,8590,1,0,0
3,38450,5585,1,0,0
4,14160,2383,0,1,0
...,...,...,...,...,...
1188907,22999,2401,1,0,0
1188908,23700,4654,0,1,0
1188909,18842,3707,1,0,0
1188910,32732,9198,1,0,0


Так как для расчёта одной из требуемых по заданию метрик (GMV) нам потребуется сумма по купленным товарам, воспользуемся справочником цен товаров.

In [43]:
# Attach the columns of item_prices to each row by matching the item_id column
# against the index of item_prices. Left join: every row of sample_a is kept,
# and rows whose item_id is absent from item_prices get NaN.
sample_a = sample_a.merge(
    item_prices,
    how='left',
    left_on='item_id',
    right_index=True,
    suffixes=('', ''),  # same as join: overlapping column names raise instead of being renamed
)
sample_a

Unnamed: 0,user_id,item_id,viewing,click,purchase,item_price
0,84636,360,1,0,0,1896.0
1,21217,9635,1,0,0,1699.0
2,13445,8590,1,0,0,846.0
3,38450,5585,1,0,0,1556.0
4,14160,2383,0,1,0,1956.0
...,...,...,...,...,...,...
1188907,22999,2401,1,0,0,1670.0
1188908,23700,4654,0,1,0,1080.0
1188909,18842,3707,1,0,0,656.0
1188910,32732,9198,1,0,0,1307.0


In [44]:
# item_sum is the item price on purchase rows and 0 on all other rows
# (purchase is a 0/1 indicator). The price column is removed afterwards.
sample_a['item_sum'] = sample_a['purchase'].mul(sample_a['item_price'])
del sample_a['item_price']
sample_a

Unnamed: 0,user_id,item_id,viewing,click,purchase,item_sum
0,84636,360,1,0,0,0.0
1,21217,9635,1,0,0,0.0
2,13445,8590,1,0,0,0.0
3,38450,5585,1,0,0,0.0
4,14160,2383,0,1,0,0.0
...,...,...,...,...,...,...
1188907,22999,2401,1,0,0,0.0
1188908,23700,4654,0,1,0,0.0
1188909,18842,3707,1,0,0,0.0
1188910,32732,9198,1,0,0,0.0
