In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [68]:
# Считываем данные из Excel в DataFrame
data_tsum = pd.read_excel("data/Data_TSUM.xlsx", sheet_name=None)

# Предполагая, что листы называются 'Data_Parsing' и 'Data_Company'
data_parsing = data_tsum['Data_Parsing']
data_company = data_tsum['Data_Company']
display(data_company)

Unnamed: 0,brand,Category,item_id,color_id,current price
0,Valentino,Shoes,aaaaa111111,black,247
1,Valentino,Shoes,aaaaa111112,black,161
2,Valentino,Shoes,aaaaa111113,black,234
3,Valentino,Shoes,aaaaa111114,black,167
4,Valentino,Shoes,aaaaa111115,black,153
...,...,...,...,...,...
70,Stone Island,Bags,sssss111131,red,165
71,Stone Island,Bags,sssss111132,red,196
72,Stone Island,Bags,sssss111133,red,236
73,Stone Island,Bags,sssss111134,red,222


In [73]:

# Удаляем указанные символы из столбца producer_color
data_parsing['producer_id'] = data_parsing['producer_id'].replace(['_', '-', '~', "", '/'], '', regex=True)

display(data_parsing)


Unnamed: 0,brand,Category,producer_id,producer_color,price
0,Valentino,Shoes,aaaaa111111,black,167
1,Valentino,Shoes,aaaaa111112,black,188
2,Valentino,Shoes,aaaaa111113,black,184
3,Valentino,Shoes,aaaaa111114,bla//ck,196
4,Valentino,Shoes,aaaaa111115,bla\\ck,250
...,...,...,...,...,...
70,Stone Island,Bags,sssss111131,~~~red,164
71,Stone Island,Bags,sssss111132,~~~red,158
72,Stone Island,Bags,sssss111133,~~~red,194
73,Stone Island,Bags,sssss111134,~~~red,256


In [70]:
# Преобразуем поля в базе компании для соответствия полям в базе парсинга
data_company.rename(columns={'item_id': 'producer_id', 'color_id': 'producer_color',}, inplace=True)

# Объединяем базы данных по producer_id и producer_color
merged_data = pd.merge(data_parsing, data_company, 
                       on=['producer_id'], how='left')

display(merged_data)

Unnamed: 0,brand_x,Category_x,producer_id,producer_color_x,price,brand_y,Category_y,producer_color_y,current price
0,Valentino,Shoes,aaaaa111111,black,167,Valentino,Shoes,black,247
1,Valentino,Shoes,aaaaa111112,black,188,Valentino,Shoes,black,161
2,Valentino,Shoes,aaaaa111113,black,184,Valentino,Shoes,black,234
3,Valentino,Shoes,aaaaa111114,bla//ck,196,Valentino,Shoes,black,167
4,Valentino,Shoes,aaaaa111115,bla\\ck,250,Valentino,Shoes,black,153
...,...,...,...,...,...,...,...,...,...
70,Stone Island,Bags,sssss111131,~~~red,164,Stone Island,Bags,red,165
71,Stone Island,Bags,sssss111132,~~~red,158,Stone Island,Bags,red,196
72,Stone Island,Bags,sssss111133,~~~red,194,Stone Island,Bags,red,236
73,Stone Island,Bags,sssss111134,~~~red,256,Stone Island,Bags,red,222


In [71]:
# Вычисляем столбец разницы цен в %
merged_data['price difference percent'] = round((((merged_data['current price'] - merged_data['price']) / merged_data['price']) * 100) ,2)

display(merged_data)

Unnamed: 0,brand_x,Category_x,producer_id,producer_color_x,price,brand_y,Category_y,producer_color_y,current price,price difference percent
0,Valentino,Shoes,aaaaa111111,black,167,Valentino,Shoes,black,247,47.90
1,Valentino,Shoes,aaaaa111112,black,188,Valentino,Shoes,black,161,-14.36
2,Valentino,Shoes,aaaaa111113,black,184,Valentino,Shoes,black,234,27.17
3,Valentino,Shoes,aaaaa111114,bla//ck,196,Valentino,Shoes,black,167,-14.80
4,Valentino,Shoes,aaaaa111115,bla\\ck,250,Valentino,Shoes,black,153,-38.80
...,...,...,...,...,...,...,...,...,...,...
70,Stone Island,Bags,sssss111131,~~~red,164,Stone Island,Bags,red,165,0.61
71,Stone Island,Bags,sssss111132,~~~red,158,Stone Island,Bags,red,196,24.05
72,Stone Island,Bags,sssss111133,~~~red,194,Stone Island,Bags,red,236,21.65
73,Stone Island,Bags,sssss111134,~~~red,256,Stone Island,Bags,red,222,-13.28


In [79]:
def outliers_z_score(data, feature, log_scale=False):
    if log_scale:
        x = np.log(data[feature])
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - 3 * sigma
    upper_bound = mu + 3 * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

# Применим эту функцию к таблице sber_data и признаку mkad_km, а также выведем размерности результатов:

outliers, cleaned = outliers_z_score(merged_data, 'price difference percent', log_scale=True)
print(f'Число выбросов по методу z-отклонения: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

Число выбросов по методу z-отклонения: 1
Результирующее число записей: 39


In [80]:
# создаем столбец с указание выбросов
# Создаем копию базы данных
tsum_copy = data_tsum.copy()

# Cоздание столбца с указанием выбросов
tsum_copy['outliers'] = tsum_copy['difference']
tsum_copy['outliers'] = tsum_copy['outliers'].apply(lambda x: 'Yes' if x == outliers['difference'].unique() else 'No')
display(tsum_copy)

tsum_copy.to_csv('data/Modified_data_tsum.csv')


KeyError: 'difference'

In [82]:
# Чтение файлов
parsing = pd.read_excel('data/Data_TSUM.xlsx', sheet_name='Data_Parsing')
company = pd.read_excel('data/Data_TSUM.xlsx', sheet_name='Data_Company')

# Объединяем базы данных
tsum_df = pd.concat([parsing, company[['item_id','color_id','current price']]], axis = 1, join = 'inner', )

# формируем столбец разницы цен в % 
tsum_df['difference'] = abs((tsum_df['price'] - tsum_df['current price']) / tsum_df['current price'] * 100)

# Определяем выбросы по методу z-отклонений 
def outliers_z_score(data, feature, log_scale=False):
    if log_scale:
        x = np.log(data[feature]+1)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - 3 * sigma
    upper_bound = mu + 3 * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned

outliers, cleaned = outliers_z_score(tsum_df, 'difference', log_scale=True)
print(f'Число выбросов по методу z-отклонения: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

# Создаем копию базы данных
Data_tsum = tsum_df.copy()

# Cоздание столбца с указанием выбросов
Data_tsum['outliers'] = Data_tsum['difference']
Data_tsum['outliers'] = Data_tsum['outliers'].apply(lambda x: 'Yes' if x == outliers['difference'].unique() else 'No')
display(Data_tsum)

Data_tsum.to_csv('data/Modified_data_tsum.csv')

Число выбросов по методу z-отклонения: 1
Результирующее число записей: 74


Unnamed: 0,brand,Category,producer_id,producer_color,price,item_id,color_id,current price,difference,outliers
0,Valentino,Shoes,aaaaa1111_11,black,167,aaaaa111111,black,247,32.388664,No
1,Valentino,Shoes,aaaaa1111_12,black,188,aaaaa111112,black,161,16.770186,No
2,Valentino,Shoes,aaaaa1111_13,black,184,aaaaa111113,black,234,21.367521,No
3,Valentino,Shoes,aaaaa1111_14,bla//ck,196,aaaaa111114,black,167,17.365269,No
4,Valentino,Shoes,aaaaa1111_15,bla\\ck,250,aaaaa111115,black,153,63.398693,No
...,...,...,...,...,...,...,...,...,...,...
70,Stone Island,Bags,sssss1111_31,~~~red,164,sssss111131,red,165,0.606061,Yes
71,Stone Island,Bags,sssss1111_32,~~~red,158,sssss111132,red,196,19.387755,No
72,Stone Island,Bags,sssss1111_33,~~~red,194,sssss111133,red,236,17.796610,No
73,Stone Island,Bags,sssss1111_34,~~~red,256,sssss111134,red,222,15.315315,No
