In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# ファイルの読み込み
train_df = pd.read_csv("sales_train.csv")
test_df = pd.read_csv("test.csv")
items_df = pd.read_csv("items.csv")
item_categories_df = pd.read_csv("item_categories.csv")
shops_df = pd.read_csv("shops.csv")

In [3]:
# ざっくり中身を確認
print("train")
train_df.head()

train


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
print("test")
test_df.head()

test


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [5]:
print("items")
items_df.head()

items


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [6]:
print("item_categories")
item_categories_df.head()

item_categories


Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [7]:
print("shops")
shops_df.head()

shops


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [8]:
# train と test を統一フォーマットにして、「まとめて特徴量エンジニアリング」しやすくする！

# 最終月の確認(test の date_block_num に使う)
train_df["date_block_num"].max()

33

In [9]:
# ステップ2：test に必要なカラムを追加して、train と同じ形式に揃える

# test に同じカラムを追加
test_df["date_block_num"] = 34
test_df["date"] = np.nan #日付は不明なのでNANにしておく
test_df["item_price"] = np.nan
test_df["item_cnt_day"] = np.nan

In [10]:
# カラムの並び順を train_df に合わせる
test_df = test_df[train_df.columns]

In [11]:
# 確認
test_df.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day'],
      dtype='object')

In [12]:
#  ステップ3：train と test を縦に結合

# 結合して all_data にまとめる
all_data = pd.concat([train_df, test_df], ignore_index=True)

In [13]:
# 確認
print("all_data shape:", all_data.shape)
print("train_shape:", train_df.shape)
print("test_df shape", test_df.shape)

all_data shape: (3150049, 6)
train_shape: (2935849, 6)
test_df shape (214200, 6)


In [14]:
# 欠損値の数を確認
missing_counts = all_data.isnull().sum()
print(missing_counts[missing_counts > 0])

date            214200
item_price      214200
item_cnt_day    214200
dtype: int64


In [15]:
# 欠損値の処理
# dateは使わないからこのままでもOK（後で削除してもいい）
# item_price, item_cnt_day も学習時は test 抜くから OK

# ただし、あとで train と test に再分割する前に一応 fillna しておいてもOK
all_data['item_price'] = all_data['item_price'].fillna(-1)
all_data['item_cnt_day'] = all_data['item_cnt_day'].fillna(-1)

In [16]:
# 確認
print(all_data.isnull().sum())

date              214200
date_block_num         0
shop_id                0
item_id                0
item_price             0
item_cnt_day           0
dtype: int64


In [17]:
# 第4章：特徴量追加編（基礎編）

# 商品IDにカテゴリIDを付けるためにマージ
all_data = all_data.merge(items_df[['item_id', 'item_category_id']], on='item_id', how='left')

In [18]:
# 店舗名を使う場合、不要な記号や表記ゆれを消すとGood!!
shops_df['shop_name_cleaned'] = shops_df['shop_name'].str.lower().str.replace(r'[^\w\s]','', regex=True)

In [19]:
# そして、shop_id 経由でマージ
all_data = all_data.merge(shops_df[['shop_id', 'shop_name_cleaned']], on='shop_id', how='left')

In [20]:
# 売る上げ金額を追加（testには item_price も item_cnt_day も NaN）
all_data['revenue'] = all_data['item_price'] * all_data['item_cnt_day']

In [21]:
# 確認
all_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,shop_name_cleaned,revenue
0,02.01.2013,0,59,22154,999.0,1.0,37,ярославль тц альтаир,999.0
1,03.01.2013,0,25,2552,899.0,1.0,58,москва трк атриум,899.0
2,05.01.2013,0,25,2552,899.0,-1.0,58,москва трк атриум,-899.0
3,06.01.2013,0,25,2554,1709.05,1.0,58,москва трк атриум,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,56,москва трк атриум,1099.0


In [22]:
# 月ごとの売上数を集計(target変数)
group = all_data.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day' : 'sum'}).reset_index()

# カラム名を変更する
group.rename(columns={'item_cnt_day' : 'monthly_shop_item_cnt'}, inplace=True)

# 結合
all_data = all_data.merge(group,  on=['date_block_num', 'shop_id', 'item_id'], how='left')

In [23]:
all_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,shop_name_cleaned,revenue,monthly_shop_item_cnt
0,02.01.2013,0,59,22154,999.0,1.0,37,ярославль тц альтаир,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0,58,москва трк атриум,899.0,0.0
2,05.01.2013,0,25,2552,899.0,-1.0,58,москва трк атриум,-899.0,0.0
3,06.01.2013,0,25,2554,1709.05,1.0,58,москва трк атриум,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0,56,москва трк атриум,1099.0,1.0


In [24]:
# 月　商品ごとの平均売上数
group = all_data.groupby(['date_block_num', 'item_id']).agg({'item_cnt_day' : 'mean'}).reset_index()
group.rename(columns={'item_cnt_day' : 'monthly_item_cnt_mean'}, inplace=True)

# 結合
all_data = all_data.merge(group, on=['date_block_num', 'item_id'], how='left')

In [25]:
all_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,shop_name_cleaned,revenue,monthly_shop_item_cnt,monthly_item_cnt_mean
0,02.01.2013,0,59,22154,999.0,1.0,37,ярославль тц альтаир,999.0,1.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0,58,москва трк атриум,899.0,0.0,0.0
2,05.01.2013,0,25,2552,899.0,-1.0,58,москва трк атриум,-899.0,0.0,0.0
3,06.01.2013,0,25,2554,1709.05,1.0,58,москва трк атриум,1709.05,1.0,1.0
4,15.01.2013,0,25,2555,1099.0,1.0,56,москва трк атриум,1099.0,1.0,1.0


In [26]:
# 月　店舗ごとの平均売上数
group = all_data.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_day' : 'mean'}).reset_index()
group.rename(columns={'item_cnt_day' : 'monthly_shop_cnt_mean'}, inplace=True)

# 結合
all_data = all_data.merge(group, on=['date_block_num', 'shop_id'], how='left')

In [27]:
all_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,shop_name_cleaned,revenue,monthly_shop_item_cnt,monthly_item_cnt_mean,monthly_shop_cnt_mean
0,02.01.2013,0,59,22154,999.0,1.0,37,ярославль тц альтаир,999.0,1.0,1.0,1.092041
1,03.01.2013,0,25,2552,899.0,1.0,58,москва трк атриум,899.0,0.0,0.0,1.178454
2,05.01.2013,0,25,2552,899.0,-1.0,58,москва трк атриум,-899.0,0.0,0.0,1.178454
3,06.01.2013,0,25,2554,1709.05,1.0,58,москва трк атриум,1709.05,1.0,1.0,1.178454
4,15.01.2013,0,25,2555,1099.0,1.0,56,москва трк атриум,1099.0,1.0,1.0,1.178454


In [28]:
# ラグ特徴量を作成する関数
def add_lag_feature(df, lags, col):
  for lag in lags:
    lag_col_name = f"{col}_lag_{lag}"
    shifted = df[['date_block_num', 'shop_id', 'item_id', col]].copy()
    shifted['date_block_num'] += lag
    shifted.rename(columns={col : lag_col_name}, inplace=True)
    df = df.merge(shifted, on=['date_block_num', 'shop_id', 'item_id'], how='left')
  return df

In [29]:
# 月4か月分だけに限定して all_data_small を使ってテスト
limited_months = [30, 31, 32, 33]
all_data_small = all_data[all_data['date_block_num'].isin(limited_months)].copy()

# ここにラグ特徴量を追加
all_data_small = add_lag_feature(all_data_small, [1,2], 'monthly_shop_item_cnt')

In [30]:
# まずは、monthly_shop_item_cnt（基本の月間売上）に対して1ヶ月・2ヶ月ラグを追加
#all_data = add_lag_feature(all_data, [1, 2], 'monthly_shop_item_cnt')

In [31]:
# train と test を分け直す！
train_data = all_data[all_data['date_block_num'] < 34].copy()
test_data = all_data[all_data['date_block_num'] == 34].copy()

In [32]:
# yは正解ラベル（販売個数）
Y = train_data['item_cnt_day']

In [33]:
# 除外するカラム
drop_cols = [
    "item_cnt_day", "date", "revenue",
    "shop_name_cleaned"
]

In [34]:
# Xを抽出
X = train_data.drop(columns = drop_cols)
X_test = test_data.drop(columns = drop_cols)

In [35]:
print(X.shape)
print(X_test.shape)
print(Y.shape)

(2935849, 8)
(214200, 8)
(2935849,)


In [36]:
!pip install -U lightgbm --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.6 MB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/3.6 MB[0m [31m20.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.6/3.6 MB[0m [31m41.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [37]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [38]:
# 学習データを訓練用と検証用に分ける
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [39]:
#LightGBM用のデータ形式に変換
train_set = lgb.Dataset(X_train, label=Y_train)
valid_set = lgb.Dataset(X_valid, label =Y_valid)

In [40]:
# モデル訓練1
params = {
    'objective' : 'regression',
    'metric' : 'rmse',
    'learning_rate' : 0.05,
    'verbose' : -1,
    'random_state' : 42
    }

In [49]:
#モデル調整2
#model = lgb.train(
    #params,
    #train_set,
    #valid_sets = [train_set, valid_set],
    #num_boost_round=1000,
   # early_stopping_rounds=50,
    #verbose_eval=50)

    #エラー出たから作り直し！！！

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [42]:
from lightgbm import LGBMRegressor

In [48]:
model = LGBMRegressor(
    objective ='regression',
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42
)

In [45]:
from lightgbm import early_stopping

In [50]:
# 学習
model.fit(
    X_train, Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=50)]
    )

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.215503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1439
[LightGBM] [Info] Number of data points in the train set: 2348679, number of used features: 8
[LightGBM] [Info] Start training from score 1.243683
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[313]	training's rmse: 2.04001	training's l2: 4.16162	valid_1's rmse: 1.78779	valid_1's l2: 3.1962


In [52]:
# 予測
y_pred = model.predict(X_test).clip(0, 20)

In [61]:
test_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,,34,5,5037,,
1,,34,5,5320,,
2,,34,5,5233,,
3,,34,5,5232,,
4,,34,5,5268,,


In [62]:
# 提出ファイル作成
submission = pd.read_csv("sample_submission.csv")
submission['item_cnt_month'] = y_pred
submission.to_csv('submission.csv', index = False)