# Прогнозирование будущих продаж

### Митрофанов Андрей ВМК 104

# 0. Инициализация ноутбука

Загрузка необходимых библиотек

In [18]:
# Код реализован на версии Python 3.10.4
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
import copy

In [19]:
# Функция get_rainfall производит отображение описания осадков на числовую ось
def get_rainfall(weather: str):
    """Map a textual precipitation description to a numeric intensity.

    The description is compared for exact equality against a fixed set of
    Russian phrases. Known phrases yield 1, 0.75, 0.25 or 0.1; any other
    value yields 0.
    """
    # (intensity, phrases that map to it), scanned from strongest to weakest.
    intensity_table = (
        (1, ('метель', 'гроза')),
        (0.75, ('дождь', 'снег', 'осадки')),
        (0.25, ('небольшой дождь', 'небольшой снег', 'небольшие осадки')),
        (0.1, ('без существенных осадков',)),
    )
    for intensity, phrases in intensity_table:
        if weather in phrases:
            return intensity
    # Unrecognised descriptions count as "no precipitation".
    return 0
            

In [20]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw sales frame into a numeric feature frame.

    The input frame is not modified; a transformed copy is returned.

    Steps, in order:
      1. ``date`` (strings in ``%Y-%m-%d`` format) is parsed. A 0/1
         ``weekend`` flag is derived from it (1 for Saturday/Sunday) and
         ``date`` itself is replaced by a fractional year.
      2. ``weather_desc`` is split on ``', '``. The first part stays in
         ``weather_desc``; the second part (or the first, when there is no
         second) is converted by ``get_rainfall`` into a new ``rainfall``
         column.
      3. ``weather_desc`` is label-encoded to integers.
      4. ``weekend``, ``city_name``, ``product_id`` and ``store_id`` are
         one-hot encoded with ``pd.get_dummies``.

    Args:
        data: frame containing at least the columns ``date``,
            ``weather_desc``, ``city_name``, ``product_id`` and ``store_id``.

    Returns:
        A new frame with the transformations above applied.
    """
    categorical_features = ['weekend', 'city_name', 'product_id', 'store_id']
    encoding_features = ['weather_desc']

    data_transformed = data.copy()

    # Parse the date once and derive both date features with vectorized
    # accessors instead of per-row Python callbacks.
    dates = pd.to_datetime(data_transformed['date'], format='%Y-%m-%d')
    # dayofweek: Monday=0 ... Sunday=6, so values above 4 are the weekend.
    data_transformed['weekend'] = (dates.dt.dayofweek > 4).astype(int)
    # Fractional year. Dividing by the real length of the year keeps the
    # mapping strictly increasing: with a fixed 365, Dec 31 of a leap year
    # (366/365) would collide with Jan 1 of the following year.
    days_in_year = 365 + dates.dt.is_leap_year.astype(int)
    data_transformed['date'] = dates.dt.year + dates.dt.dayofyear / days_in_year

    # A description looks like "<sky state>, <precipitation>" or just a
    # single phrase. With a single phrase that same phrase is also passed to
    # get_rainfall, so a bare precipitation word still gets a rainfall value.
    weather_parts = [
        description.split(', ')
        for description in data_transformed['weather_desc']
    ]
    sky_state = [parts[0] for parts in weather_parts]
    rainfall = [
        get_rainfall(parts[1] if len(parts) > 1 else parts[0])
        for parts in weather_parts
    ]

    data_transformed = data_transformed.drop(columns='weather_desc')
    # Keep the historical column positions (7 and 8) but clamp them so that
    # frames with fewer columns do not make DataFrame.insert raise.
    column_count = data_transformed.shape[1]
    data_transformed.insert(
        min(7, column_count), 'weather_desc', sky_state, allow_duplicates=True
    )
    data_transformed.insert(
        min(8, column_count + 1), 'rainfall', rainfall, allow_duplicates=True
    )

    # Integer label encoding.
    # NOTE(review): the encoder is fitted on every call, so the codes depend
    # on the set of descriptions present in this particular frame; frames
    # preprocessed separately may get different codes for the same
    # description.
    encoder = preprocessing.LabelEncoder()
    for feature in encoding_features:
        data_transformed[feature] = encoder.fit_transform(data_transformed[feature])

    # One-hot encode the categorical features.
    data_transformed = pd.get_dummies(data_transformed, columns=categorical_features)

    # NOTE: no feature scaling is applied here.
    return data_transformed

Загрузка данных

In [21]:
# Load the training set; the first CSV column is used as the index.
data_train = pd.read_csv('./data/train.csv', index_col=0)
# Preview the first rows (shown as the cell output).
data_train.head()

Unnamed: 0_level_0,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2021-07-29,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",61.9375,23.1875,741.0,26
2,2021-07-30,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",70.25,22.1875,740.3125,37
3,2021-07-31,Москва,1,1,1,4.79,переменная облачность,52.625,21.8125,741.625,25
4,2021-08-01,Москва,1,1,1,4.79,"облачно, небольшой дождь",87.4375,20.0625,743.3125,26
5,2021-08-02,Москва,1,1,1,4.79,переменная облачность,66.1875,23.4375,739.625,22


In [22]:
# Load the test set; the first CSV column is used as the index.
data_test = pd.read_csv('./data/test.csv', index_col=0)
# Preview the first rows (shown as the cell output).
data_test.head()

Unnamed: 0_level_0,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
666677,2022-02-14,Москва,1,1,1,4.79,облачно,87.3125,-1.9375,749.3125
666678,2022-02-15,Москва,1,1,1,4.79,переменная облачность,88.75,-1.25,752.6875
666679,2022-02-16,Москва,1,1,1,4.79,переменная облачность,90.375,-1.5625,746.3125
666680,2022-02-17,Москва,1,1,1,4.79,"облачно, небольшой дождь",98.0,1.75,732.6875
666681,2022-02-18,Москва,1,1,1,4.79,"облачно, небольшие осадки",95.5,1.375,733.0


In [23]:
# Reference values that the cells below pass to mean_absolute_error as y_true.
# NOTE(review): they are read from sample_submission.csv — confirm the file
# holds real target values rather than placeholders, otherwise the reported
# MAE is not a meaningful quality measure.
y_true = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [24]:
# Separate the target column ('sales') from the features.
X_train = data_train.drop(columns='sales')
y_train = data_train['sales']

# Convert the raw features with preprocess_data and preview the result.
X_train = preprocess_data(X_train)
X_train.head()

Unnamed: 0_level_0,date,category_id,price,humidity,weather_desc,rainfall,temperature,pressure,weekend_0,weekend_1,...,store_id_153,store_id_154,store_id_156,store_id_157,store_id_158,store_id_159,store_id_160,store_id_162,store_id_163,store_id_164
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2021.575342,1,4.79,61.9375,4,0.25,23.1875,741.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2021.578082,1,4.79,70.25,4,0.25,22.1875,740.3125,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2021.580822,1,4.79,52.625,4,0.0,21.8125,741.625,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2021.583562,1,4.79,87.4375,2,0.25,20.0625,743.3125,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2021.586301,1,4.79,66.1875,4,0.0,23.4375,739.625,1,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Apply the same preprocessing function to the test set and preview it.
# NOTE(review): train and test are preprocessed independently — confirm that
# both end up with the same columns and the same weather_desc codes.
X_test = preprocess_data(data_test)
X_test.head()

Unnamed: 0_level_0,date,category_id,price,humidity,weather_desc,rainfall,temperature,pressure,weekend_0,weekend_1,...,store_id_153,store_id_154,store_id_156,store_id_157,store_id_158,store_id_159,store_id_160,store_id_162,store_id_163,store_id_164
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
666677,2022.123288,1,4.79,87.3125,1,0.0,-1.9375,749.3125,1,0,...,0,0,0,0,0,0,0,0,0,0
666678,2022.126027,1,4.79,88.75,3,0.0,-1.25,752.6875,1,0,...,0,0,0,0,0,0,0,0,0,0
666679,2022.128767,1,4.79,90.375,3,0.0,-1.5625,746.3125,1,0,...,0,0,0,0,0,0,0,0,0,0
666680,2022.131507,1,4.79,98.0,1,0.25,1.75,732.6875,1,0,...,0,0,0,0,0,0,0,0,0,0
666681,2022.134247,1,4.79,95.5,1,0.25,1.375,733.0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Baseline: plain linear regression fitted on the training features.
model = LinearRegression(n_jobs=3)
model.fit(X_train, y_train)

# Predict on the test features and compare with the values held in y_true.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_true, y_pred=y_predicted)

# "Score" is model.score evaluated on the training data itself, not on held-out data.
print(f'Score: {model.score(X_train, y_train)}\nMAE: {mae}')

Score: 0.5855786304425925
MAE: 7.107877363001289


In [10]:
# Ridge regression with default hyperparameters.
model = Ridge()
model.fit(X_train, y_train)

# Predict on the test features and compare with the values held in y_true.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_true, y_pred=y_predicted)

# "Score" is model.score evaluated on the training data itself, not on held-out data.
print(f"Score: {model.score(X_train, y_train)}\nMAE: {mae}")

Score: 0.5855973163540463
MAE: 7.104970563571778


In [11]:
# Lasso regression with default hyperparameters.
model = Lasso()
model.fit(X_train, y_train)

# Predict on the test features and compare with the values held in y_true.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_true, y_pred=y_predicted)

# "Score" is model.score evaluated on the training data itself, not on held-out data.
print(f'Score: {model.score(X_train, y_train)}\nMAE: {mae}')

Score: 0.3497020542709175
MAE: 8.088870397512853


In [17]:
# Linear model trained by stochastic gradient descent with an elastic-net
# penalty; random_state is fixed so that reruns give the same result.
# NOTE(review): per the scikit-learn docs `epsilon` is only used by the
# huber / epsilon-insensitive losses; no `loss` is passed here, so it
# presumably has no effect — confirm.
# NOTE(review): the features are not scaled before SGD — verify this is intended.
model = SGDRegressor(
    penalty = 'elasticnet', 
    alpha = 0.0001, 
    l1_ratio = 0.5,
    max_iter = 10000,
    epsilon = 0.0001,
    random_state = 42
)
model.fit(X_train, y_train)

# Predict on the test features and compare with the values held in y_true.
y_predicted = model.predict(X_test)
mae = mean_absolute_error(y_true=y_true, y_pred=y_predicted)

# "Score" is model.score evaluated on the training data itself, not on held-out data.
print(f'Score: {model.score(X_train, y_train)}\nMAE: {mae}')

Score: 0.5813922360420023
MAE: 5.736786354064599


In [98]:
# Export the current contents of y_predicted (whatever the most recently
# executed model cell produced) as a CSV file.
from pathlib import Path

# Wrap the predictions in a single-column frame that reuses the index of
# y_true, so every row is written together with its id.
y_predicted = pd.DataFrame(
    y_predicted,
    columns=['sales'],
    index=y_true.index,
)

# to_csv does not create missing parent directories, so make sure ./out
# exists before writing.
output_path = Path('./out/prediction.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
y_predicted.to_csv(output_path)