In [3]:
import numpy as np
import pandas as pd
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (
    HuberRegressor,
    LinearRegression,
    RANSACRegressor,
    TheilSenRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
np.random.seed(42)

Ваши задачи следующие:
1. Проанализировать данные, которые вам попались. Какие выводы можно сделать только из анализа? Проведите визуальный и статический анализ, опишите ваши находки.
2. Определите, какое моделирование доступно для ваших данных. Какие есть в них связи? 
3. Опишите возможные верхнеуровневые задачи, которые можно решать с помощью таких данных.
4. Очистите и обработайте данные. Опишите, какие в них были проблемы.
5. Постройте подходящую предсказательную модель. Обоснуйте ее выбор.
6. Выберите и обоснуйте метрики для ее оценки. Покажите, как модель работает на данных, которые не видела при обучении.
7. Представьте, что решаете реальную бизнес-задачу. Как вы могли бы использовать вашу модель? Какие бизнес-метрики могли бы быть использованы для ее контроля?

**Чистка**

In [4]:
data_path = '../../data/dirty_cafe_sales.csv'

In [5]:
df = pd.read_csv(data_path)
df = pd.DataFrame(df)
# print(df.head(),'\n')
print(df.info(),'\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    10000 non-null  object
 1   Item              9667 non-null   object
 2   Quantity          9862 non-null   object
 3   Price Per Unit    9821 non-null   object
 4   Total Spent       9827 non-null   object
 5   Payment Method    7421 non-null   object
 6   Location          6735 non-null   object
 7   Transaction Date  9841 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB
None 



In [6]:
items = df['Item'].unique()
quantity = df['Quantity'].unique()
ppu = df['Price Per Unit'].unique()
total_spent = df['Total Spent'].unique()
payment_method = df['Payment Method'].unique()
location = df['Location'].unique()
date = df['Transaction Date'].unique()
items

array(['Coffee', 'Cake', 'Cookie', 'Salad', 'Smoothie', 'UNKNOWN',
       'Sandwich', nan, 'ERROR', 'Juice', 'Tea'], dtype=object)

*** Данные грязные. Исправим это. ***

In [7]:
print("CHECK ITEM PRICES\n")

print("..Find default prices for each item\n")
for item in items:
    print(item)
    print(df[df['Item'] == item]['Price Per Unit'].unique())
    
print('...unique items BEFORE:', items)
df.loc[df['Item'] == 'Coffee', 'Price Per Unit'] = 2.0
df.loc[df['Item'] == 'Cake', 'Price Per Unit'] = 3.0
df.loc[df['Item'] == 'Cookie', 'Price Per Unit'] = 1.0
df.loc[df['Item'] == 'Salad', 'Price Per Unit'] = 5.0
df.loc[df['Item'] == 'Smoothie', 'Price Per Unit'] = 4.0
df.loc[df['Item'] == 'Juice', 'Price Per Unit'] = 3.0
df.loc[df['Item'] == 'Tea', 'Price Per Unit'] = 1.5
df.loc[df['Item'] == 'Sandwich', 'Price Per Unit'] = 4.0

items = df['Item'].unique()
print('...unique items AFTER:', items)



CHECK ITEM PRICES

..Find default prices for each item

Coffee
['2.0' nan 'ERROR' 'UNKNOWN']
Cake
['3.0' nan 'UNKNOWN' 'ERROR']
Cookie
['1.0' 'UNKNOWN' nan 'ERROR']
Salad
['5.0' 'ERROR' 'UNKNOWN' nan]
Smoothie
['4.0' nan 'UNKNOWN' 'ERROR']
UNKNOWN
['3.0' '1.0' '5.0' '4.0' '1.5' '2.0' nan 'UNKNOWN' 'ERROR']
Sandwich
['4.0' nan 'ERROR' 'UNKNOWN']
nan
[]
ERROR
['1.5' '3.0' '5.0' nan '4.0' '2.0' '1.0' 'UNKNOWN' 'ERROR']
Juice
['3.0' nan 'UNKNOWN' 'ERROR']
Tea
['1.5' nan 'ERROR' 'UNKNOWN']
...unique items BEFORE: ['Coffee' 'Cake' 'Cookie' 'Salad' 'Smoothie' 'UNKNOWN' 'Sandwich' nan
 'ERROR' 'Juice' 'Tea']
...unique items AFTER: ['Coffee' 'Cake' 'Cookie' 'Salad' 'Smoothie' 'UNKNOWN' 'Sandwich' nan
 'ERROR' 'Juice' 'Tea']


In [8]:
print("Let Price PEr Unit = error, unknown, nan will be replaced with zero\n")

print('...unique price per unit BEFORE:', ppu)
df['Price Per Unit'] = df['Price Per Unit'].replace('ERROR', 0)
df['Price Per Unit'] = df['Price Per Unit'].replace('UNKNOWN', 0)
df['Price Per Unit'] = df['Price Per Unit'].replace(np.nan, 0)

ppu = df['Price Per Unit'].unique()
print('...unique price per unit AFTER:', ppu)


Let Price PEr Unit = error, unknown, nan will be replaced with zero

...unique price per unit BEFORE: ['2.0' '3.0' '1.0' '5.0' '4.0' '1.5' nan 'ERROR' 'UNKNOWN']
...unique price per unit AFTER: [2.0 3.0 1.0 5.0 4.0 '3.0' '1.5' '2.0' '1.0' 1.5 '5.0' 0 '4.0']


In [9]:
print("Let TotalSpent = error, unknown, nan will be replaced with zero\n")
print('...unique total spent BEFORE:', total_spent)

df['Total Spent'] = df['Total Spent'].replace('ERROR', 0.0)
df['Total Spent'] = df['Total Spent'].replace('UNKNOWN', 0.0)
df['Total Spent'] = df['Total Spent'].replace(np.nan, 0.0)
total_spent = df['Total Spent'].unique()

print('...unique total spent AFTER:', total_spent)

Let TotalSpent = error, unknown, nan will be replaced with zero

...unique total spent BEFORE: ['4.0' '12.0' 'ERROR' '10.0' '20.0' '9.0' '16.0' '15.0' '25.0' '8.0' '5.0'
 '3.0' '6.0' nan 'UNKNOWN' '2.0' '1.0' '7.5' '4.5' '1.5']
...unique total spent AFTER: ['4.0' '12.0' 0.0 '10.0' '20.0' '9.0' '16.0' '15.0' '25.0' '8.0' '5.0'
 '3.0' '6.0' '2.0' '1.0' '7.5' '4.5' '1.5']


In [10]:
print('...unique quantity BEFORE:', quantity)
df['Quantity'] = df['Quantity'].replace('ERROR', 1)
df['Quantity'] = df['Quantity'].replace('UNKNOWN', 1)
df['Quantity'] = df['Quantity'].replace(np.nan, 1)
quantity = df['Quantity'].unique()

print('...unique quantity AFTER:', quantity)


...unique quantity BEFORE: ['2' '4' '5' '3' '1' 'ERROR' 'UNKNOWN' nan]
...unique quantity AFTER: ['2' '4' '5' '3' '1' 1]


In [11]:
df['Quantity'] = df['Quantity'].astype(np.float32)
df['Price Per Unit'] = df['Price Per Unit'].astype(np.float32)
df['Total Spent'] = df['Total Spent'].astype(np.float32)

total_spent - общая сумма потраченных денег == quantity * price

In [12]:
# check if total_spent = price_per_unit * quantity

wrong = df[df['Total Spent'] != df['Price Per Unit'] * df['Quantity']]
# print("...wrong total spent:\n", wrong.head(5))

# fix it
df.loc[df['Total Spent'] == 0.0, 'Total Spent'] = df['Quantity'] * df['Price Per Unit']
print(df[df['Total Spent'] == 0.0])


     Transaction ID     Item  Quantity  Price Per Unit  Total Spent  \
1761    TXN_3611851      NaN       4.0             0.0          0.0   
2289    TXN_7524977  UNKNOWN       4.0             0.0          0.0   
4152    TXN_9646000    ERROR       2.0             0.0          0.0   

     Payment Method  Location Transaction Date  
1761    Credit Card       NaN       2023-02-09  
2289          ERROR       NaN       2023-12-09  
4152            NaN  In-store       2023-12-14  


у нас все еще беда с item == nan, error, unknown. Мы можем после правильного определения ppu, quantity, price найти сходства с обычными товарами и заполнить их.

In [13]:
print(df[df['Item'] == 'UNKNOWN'].shape)
print(df[df['Item'] == 'ERROR'].shape)
print(df[df['Item'] == np.nan].shape)
print(df[df['Item'].isna()].shape)

print(df.shape)


# # delete all rows with items = unknown, error, nan
# df = df.drop(df[df['Item'] == 'UNKNOWN'].index)
# df = df.drop(df[df['Item'] == 'ERROR'].index)
# df = df.drop(df[df['Item'] == np.nan].index)
# df = df.drop(df[df['Item'].isna()].index)
# print('\n')
# print(df[df['Item'] == 'UNKNOWN'].shape)
# print(df[df['Item'] == 'ERROR'].shape)
# print(df[df['Item'] == np.nan].shape)
# print(df[df['Item'].isna()].shape)
# print(df.shape)


(344, 8)
(292, 8)
(0, 8)
(333, 8)
(10000, 8)


In [14]:
# count of nan
print(df.isna().sum())

Transaction ID         0
Item                 333
Quantity               0
Price Per Unit         0
Total Spent            0
Payment Method      2579
Location            3265
Transaction Date     159
dtype: int64


In [15]:
# не забываем, что мы ppu, quantity, total_spent заменили на 0.0, 1.0, 0.0

# Используем .loc вместо replace для замены значений:
df.loc[df['Quantity'] == 1, 'Quantity'] = df.loc[df['Quantity'] == 1, 'Total Spent'] / df.loc[df['Quantity'] == 1, 'Price Per Unit']
df.loc[df['Total Spent'] == 0.0, 'Total Spent'] = df.loc[df['Total Spent'] == 0.0, 'Quantity'] * df.loc[df['Total Spent'] == 0.0, 'Price Per Unit']
df.loc[df['Price Per Unit'] == 0.0, 'Price Per Unit'] = df.loc[df['Price Per Unit'] == 0.0, 'Total Spent'] / df.loc[df['Price Per Unit'] == 0.0, 'Quantity']

wrong = df[df['Total Spent'] != df['Price Per Unit'] * df['Quantity']]
print("...wrong:\n", wrong)

...wrong:
      Transaction ID     Item  Quantity  Price Per Unit  Total Spent  \
818     TXN_7940202      NaN       inf             0.0          4.0   
3434    TXN_6457997  UNKNOWN       inf             0.0          4.0   
3779    TXN_7376255  UNKNOWN       inf             0.0         25.0   
4092    TXN_1840897  UNKNOWN       inf             0.0          5.0   
7597    TXN_1082717    ERROR       inf             0.0          9.0   
9382    TXN_4255580    ERROR       inf             0.0          3.0   
9673    TXN_2480808  UNKNOWN       inf             0.0          4.0   
9717    TXN_3334632      NaN       inf             0.0          2.0   
9819    TXN_1208561      NaN       inf             0.0         20.0   

      Payment Method  Location Transaction Date  
818   Digital Wallet       NaN       2023-07-23  
3434     Credit Card       NaN       2023-12-12  
3779             NaN  In-store       2023-05-27  
4092           ERROR       NaN       2023-06-03  
7597  Digital Wallet  In-sto

In [16]:
# удалим строчки с quantity = inf
df = df.drop(df[df['Quantity'] == np.inf].index)
wrong = df[df['Total Spent'] != df['Price Per Unit'] * df['Quantity']]
print("...wrong:\n", wrong)

...wrong:
 Empty DataFrame
Columns: [Transaction ID, Item, Quantity, Price Per Unit, Total Spent, Payment Method, Location, Transaction Date]
Index: []


In [17]:
print(df.isna().sum())

Transaction ID         0
Item                 330
Quantity               0
Price Per Unit         0
Total Spent            0
Payment Method      2578
Location            3259
Transaction Date     158
dtype: int64


In [18]:
print(df[df['Total Spent'] == 0.0])
print('\n')
print(df[df['Price Per Unit'] == 0.0])

df = df.drop(df[df['Total Spent'] == 0.0].index)
print(df[df['Total Spent'] == 0.0])
print('\n')
print(df[df['Price Per Unit'] == 0.0])


     Transaction ID     Item  Quantity  Price Per Unit  Total Spent  \
1761    TXN_3611851      NaN       4.0             0.0          0.0   
2289    TXN_7524977  UNKNOWN       4.0             0.0          0.0   
4152    TXN_9646000    ERROR       2.0             0.0          0.0   

     Payment Method  Location Transaction Date  
1761    Credit Card       NaN       2023-02-09  
2289          ERROR       NaN       2023-12-09  
4152            NaN  In-store       2023-12-14  


     Transaction ID     Item  Quantity  Price Per Unit  Total Spent  \
1761    TXN_3611851      NaN       4.0             0.0          0.0   
2289    TXN_7524977  UNKNOWN       4.0             0.0          0.0   
4152    TXN_9646000    ERROR       2.0             0.0          0.0   

     Payment Method  Location Transaction Date  
1761    Credit Card       NaN       2023-02-09  
2289          ERROR       NaN       2023-12-09  
4152            NaN  In-store       2023-12-14  
Empty DataFrame
Columns: [Transactio

In [19]:
for item in items:
    print(item)
    print(df[df['Item'] == item]['Price Per Unit'].unique())

print('NaN')
print(df[df['Item'].isna()]['Price Per Unit'].unique())

Coffee
[2.]
Cake
[3.]
Cookie
[1.]
Salad
[5.]
Smoothie
[4.]
UNKNOWN
[3.  1.  5.  4.  1.5 2. ]
Sandwich
[4.]
nan
[]
ERROR
[1.5 3.  5.  4.  2.  1. ]
Juice
[3.]
Tea
[1.5]
NaN
[3.  2.  1.  5.  4.  1.5]


!! Cake и Juice имеют цену 3.00; Sandwich, Smoothie –  4.00.

In [20]:
df.loc[df['Price Per Unit'] == 1.0, 'Item'] = 'Cookie'
df.loc[df['Price Per Unit'] == 5.0, 'Item'] = 'Salad'
df.loc[df['Price Per Unit'] == 2.0, 'Item'] = 'Coffee'
df.loc[df['Price Per Unit'] == 1.5, 'Item'] = 'Tea'

# for item in items:
#     print(item)
#     print(df[df['Item'] == item]['Price Per Unit'].unique())

# print('NaN')
# print(df[df['Item'].isna()]['Price Per Unit'].unique())

In [21]:
idx1 = 0
idx2 = 0
print(df[df['Item'] == 'UNKNOWN'].shape)
for i in df.index:
    if df.loc[i, 'Item'] == 'UNKNOWN' or df.loc[i,'Item'] == 'ERROR' or pd.isna(df.loc[i,'Item']):
        if df.loc[i,'Price Per Unit'] == 3.0:
            df.loc[i, 'Item'] = 'Juice' if idx1 % 2 == 0 else 'Cake'
            idx1 += 1
        elif df.loc[i, 'Price Per Unit'] == 4.0:
            df.loc[i, 'Item'] = 'Smoothie' if idx2 % 2 == 0 else 'Sandwich'
            idx2 += 1
        else:
            raise ValueError(f"Unknown price per unit: {df.loc[i,'Price Per Unit']} for row {i}")
        
items = df['Item'].unique()
    
for item in items:
    print(item)
    print(df[df['Item'] == item]['Price Per Unit'].unique())

print('NaN')
print(df[df['Item'].isna()]['Price Per Unit'].unique())

(155, 8)
Coffee
[2.]
Cake
[3.]
Cookie
[1.]
Salad
[5.]
Smoothie
[4.]
Juice
[3.]
Sandwich
[4.]
Tea
[1.5]
NaN
[]


In [22]:
print(df.isna().sum())

Transaction ID         0
Item                   0
Quantity               0
Price Per Unit         0
Total Spent            0
Payment Method      2577
Location            3257
Transaction Date     158
dtype: int64


Остались payment method, location, transaction date

In [23]:
print(df['Payment Method'].unique())
print(df[df['Payment Method'] == 'UNKNOWN'].shape)
print(df[df['Payment Method'] == 'ERROR'].shape)
print(df[df['Payment Method'] == np.nan].shape)
print(df[df['Payment Method'].isna()].shape)

['Credit Card' 'Cash' 'UNKNOWN' 'Digital Wallet' 'ERROR' nan]
(293, 8)
(304, 8)
(0, 8)
(2577, 8)


Раз уже так много нанов в payment method, то можем предположить, что это все cash. Ну а error - digital, unknown - card

In [24]:
df.loc[df['Payment Method'] == 'UNKNOWN', 'Payment Method'] = 'Credit Card'
df.loc[df['Payment Method'] == 'ERROR', 'Payment Method'] = 'Digital Wallet'
df.loc[df['Payment Method'].isna(), 'Payment Method'] = 'Cash'
print(df['Payment Method'].unique())

['Credit Card' 'Cash' 'Digital Wallet']


In [25]:
print(df['Location'].unique())
print(df[df['Location'] == 'Takeaway'].shape)
print(df[df['Location'] == 'In-store'].shape)
print(df[df['Location'] == 'UNKNOWN'].shape)
print(df[df['Location'] == 'ERROR'].shape)
print(df[df['Location'] == np.nan].shape)
print(df[df['Location'].isna()].shape)

['Takeaway' 'In-store' 'UNKNOWN' nan 'ERROR']
(3021, 8)
(3014, 8)
(338, 8)
(358, 8)
(0, 8)
(3257, 8)


с location,думаю, можно поступить так же, как с cake, juice, sandwich, smoothie.

In [26]:
idx = 0
for i in df.index:
    if df.loc[i, 'Location'] == 'UNKNOWN' or df.loc[i,'Location'] == 'ERROR' or pd.isna(df.loc[i,'Location']):
        df.loc[i, 'Location'] = 'In-store' if idx % 2 == 0 else 'Takeaway'
        idx += 1
print(df['Location'].unique())

['Takeaway' 'In-store']


Осталась transaction date.

In [27]:
print(df[df['Transaction Date'] == 'UNKNOWN'].shape,
df[df['Transaction Date'] == 'ERROR'].shape,    
df[df['Transaction Date'] == np.nan].shape,
df[df['Transaction Date'].isna()].shape,)

dates = df['Transaction Date'].unique().astype(str).tolist()
dates.sort()
dates = dates[:len(dates)-3] # remove nan, error, unknown

rs = np.random.MT19937(int(time.time()))

for i in df.index:
    if df.loc[i, 'Transaction Date'] == 'UNKNOWN' or df.loc[i,'Transaction Date'] == 'ERROR' or pd.isna(df.loc[i,'Transaction Date']):
        df.loc[i, 'Transaction Date'] = np.random.Generator(rs).choice(dates)
        
print(df[df['Transaction Date'] == 'UNKNOWN'].shape,
df[df['Transaction Date'] == 'ERROR'].shape,    
df[df['Transaction Date'] == np.nan].shape,
df[df['Transaction Date'].isna()].shape,)

(159, 8) (142, 8) (0, 8) (158, 8)
(0, 8) (0, 8) (0, 8) (0, 8)


In [28]:
df.isna().sum()

Transaction ID      0
Item                0
Quantity            0
Price Per Unit      0
Total Spent         0
Payment Method      0
Location            0
Transaction Date    0
dtype: int64

In [29]:
# save clean df
data_path = '../../data/cleaned_cafe_sales_clean.csv'

df.to_csv(data_path, index=False)