# Objective: Create a baseline model
# To-do: Predict the item sales of every item in November 2015

# Import

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

item_cat = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/item_categories.csv")
items = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/items.csv")
train = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/sales_train.csv")
shops = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/shops.csv")
test = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/test.csv")
sample_sub = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/sample_submission.csv")

# Check goal

In [2]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [3]:
sample_sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


# EDA

## item_cat

In [4]:
item_cat.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [5]:
item_cat.shape

(84, 2)

In [6]:
item_cat.dtypes

item_category_name    object
item_category_id       int64
dtype: object

In [7]:
item_cat.isna().sum()

item_category_name    0
item_category_id      0
dtype: int64

In [8]:
item_cat.item_category_name.nunique()

84

In [9]:
item_cat.item_category_name.unique()[:5].tolist()

['PC - Гарнитуры/Наушники',
 'Аксессуары - PS2',
 'Аксессуары - PS3',
 'Аксессуары - PS4',
 'Аксессуары - PSP']

In [10]:
item_cat.item_category_name.value_counts()

Музыка - CD фирменного производства    1
Чистые носители (шпиль)                1
Подарки - Фигурки                      1
Кино - DVD                             1
Книги - Аудиокниги (Цифра)             1
                                      ..
Аксессуары - PSP                       1
Музыка - Винил                         1
Игровые консоли - PSP                  1
Программы - Для дома и офиса           1
Игры - PSP                             1
Name: item_category_name, Length: 84, dtype: int64

In [11]:
item_cat["big_category"] = [
    x.split("-")[0].strip() for x in item_cat.item_category_name
]

In [12]:
item_cat.head()

Unnamed: 0,item_category_name,item_category_id,big_category
0,PC - Гарнитуры/Наушники,0,PC
1,Аксессуары - PS2,1,Аксессуары
2,Аксессуары - PS3,2,Аксессуары
3,Аксессуары - PS4,3,Аксессуары
4,Аксессуары - PSP,4,Аксессуары


In [13]:
item_cat.big_category.nunique()

20

In [14]:
item_cat.big_category.value_counts()

Книги                                13
Подарки                              12
Игровые консоли                       8
Игры                                  8
Аксессуары                            7
Программы                             6
Музыка                                6
Кино                                  5
Игры PC                               4
Карты оплаты                          4
Служебные                             2
Карты оплаты (Кино, Музыка, Игры)     1
Билеты (Цифра)                        1
Элементы питания                      1
Чистые носители (шпиль)               1
Игры MAC                              1
PC                                    1
Игры Android                          1
Чистые носители (штучные)             1
Доставка товара                       1
Name: big_category, dtype: int64

In [15]:
rus_eng = {
    "Книги": "books",
    "Подарки": "present",
    "Игры": "games",
    "Игровые консоли": "game consoles",
    "Аксессуары": "accesories",
    "Программы": "programs",
    "Музыка": "music",
    "Кино": "cinema",
    "Карты оплаты": "gift_cards",
    "Игры PC": "pc_games",
    "Служебные": "services",
    "Доставка товара": "delivery",
    "Карты оплаты (Кино, Музыка, Игры)": "payment_cards",
    "Чистые носители (шпиль)": "cd",
    "Элементы питания": "battery",
    "Игры Android": "android_games",
    "Игры MAC": "mac_games",
    "Билеты (Цифра)": "tickets",
    "PC": "pc",
    "Чистые носители (штучные)": "dvd",
}

In [16]:
item_cat.big_category.replace(to_replace=rus_eng, inplace=True)

In [17]:
item_cat.big_category.value_counts()

books            13
present          12
game consoles     8
games             8
accesories        7
music             6
programs          6
cinema            5
pc_games          4
gift_cards        4
services          2
android_games     1
battery           1
pc                1
dvd               1
mac_games         1
tickets           1
cd                1
delivery          1
payment_cards     1
Name: big_category, dtype: int64

In [18]:
item_cat

Unnamed: 0,item_category_name,item_category_id,big_category
0,PC - Гарнитуры/Наушники,0,pc
1,Аксессуары - PS2,1,accesories
2,Аксессуары - PS3,2,accesories
3,Аксессуары - PS4,3,accesories
4,Аксессуары - PSP,4,accesories
...,...,...,...
79,Служебные,79,services
80,Служебные - Билеты,80,services
81,Чистые носители (шпиль),81,cd
82,Чистые носители (штучные),82,dvd


## items

In [19]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [20]:
items.shape

(22170, 3)

In [21]:
items.nunique()

item_name           22170
item_id             22170
item_category_id       84
dtype: int64

In [22]:
full_items = pd.merge(
    items.drop(columns="item_name"),
    item_cat.drop(columns="item_category_name"),
    on="item_category_id",
)

In [23]:
full_items.head()

Unnamed: 0,item_id,item_category_id,big_category
0,0,40,cinema
1,2,40,cinema
2,3,40,cinema
3,4,40,cinema
4,5,40,cinema


## shops

In [24]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [25]:
shops.shape

(60, 2)

In [26]:
shops.nunique()

shop_name    60
shop_id      60
dtype: int64

In [27]:
shops.shop_name.value_counts()

Интернет-магазин ЧС                                1
Казань ТЦ "ПаркХаус" II                            1
Калуга ТРЦ "XXI век"                               1
Томск ТРЦ "Изумрудный Город"                       1
Новосибирск ТРЦ "Галерея Новосибирск"              1
Москва ТЦ "Ареал" (Беляево)                        1
Ярославль ТЦ "Альтаир"                             1
!Якутск Орджоникидзе, 56 фран                      1
Уфа ТК "Центральный"                               1
Якутск ТЦ "Центральный"                            1
Курск ТЦ "Пушкинский"                              1
СПб ТК "Невский Центр"                             1
Чехов ТРЦ "Карнавал"                               1
Сургут ТРЦ "Сити Молл"                             1
Новосибирск ТЦ "Мега"                              1
Москва ТЦ "Серебряный Дом"                         1
Жуковский ул. Чкалова 39м?                         1
Тюмень ТЦ "Зеленый Берег"                          1
РостовНаДону ТЦ "Мега"                        

In [28]:
city = [x.split(" ")[0] for x in shops.shop_name]
shops["city"] = city

In [29]:
shops["city"] = city

In [30]:
shops.head()

Unnamed: 0,shop_name,shop_id,city
0,"!Якутск Орджоникидзе, 56 фран",0,!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",1,!Якутск
2,"Адыгея ТЦ ""Мега""",2,Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха
4,"Волжский ТЦ ""Волга Молл""",4,Волжский


In [31]:
shops.city.value_counts()

Москва              13
Воронеж              3
Тюмень               3
РостовНаДону         3
СПб                  2
Казань               2
Самара               2
!Якутск              2
Жуковский            2
Н.Новгород           2
Уфа                  2
Якутск               2
Красноярск           2
Новосибирск          2
Выездная             1
Омск                 1
Интернет-магазин     1
Чехов                1
Сургут               1
Курск                1
Волжский             1
Коломна              1
Ярославль            1
Томск                1
Балашиха             1
Адыгея               1
Химки                1
Цифровой             1
Вологда              1
Сергиев              1
Калуга               1
Мытищи               1
Name: city, dtype: int64

In [32]:
shops.loc[shops["city"] == "!Якутск", "city"] = "Якутск"

In [33]:
shops.city.value_counts()

Москва              13
Якутск               4
Воронеж              3
Тюмень               3
РостовНаДону         3
Н.Новгород           2
Самара               2
Казань               2
Жуковский            2
Уфа                  2
СПб                  2
Красноярск           2
Новосибирск          2
Ярославль            1
Коломна              1
Омск                 1
Интернет-магазин     1
Чехов                1
Сургут               1
Волжский             1
Курск                1
Выездная             1
Балашиха             1
Томск                1
Адыгея               1
Химки                1
Цифровой             1
Вологда              1
Сергиев              1
Калуга               1
Мытищи               1
Name: city, dtype: int64

## train

In [35]:
train = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/sales_train.csv")

In [36]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [37]:
full_train = pd.merge(train, full_items, on="item_id", how="left")


In [38]:
full_train = pd.merge(
    full_train, shops.drop(columns="shop_name"), on="shop_id", how="left"
)

In [39]:
data = train.pivot_table(
    index=["shop_id", "item_id"],
    values=["item_cnt_day"],
    columns=["date_block_num"],
    aggfunc="sum",
).fillna(0)

In [40]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
shop_id,item_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
data.reset_index(inplace=True)

In [42]:
data.rename(columns=({"item_cnt_day": "item_cnt_month"}), inplace=True)

In [43]:
data.head()

Unnamed: 0_level_0,shop_id,item_id,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month,item_cnt_month
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
full_train = pd.merge(full_train, data, on=["shop_id", "item_id"], how="left")

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [45]:
full_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id,big_category,city,"(item_cnt_month, 0)",...,"(item_cnt_month, 24)","(item_cnt_month, 25)","(item_cnt_month, 26)","(item_cnt_month, 27)","(item_cnt_month, 28)","(item_cnt_month, 29)","(item_cnt_month, 30)","(item_cnt_month, 31)","(item_cnt_month, 32)","(item_cnt_month, 33)"
0,02.01.2013,0,59,22154,999.0,1.0,37,cinema,Ярославль,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,03.01.2013,0,25,2552,899.0,1.0,58,music,Москва,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,05.01.2013,0,25,2552,899.0,-1.0,58,music,Москва,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,06.01.2013,0,25,2554,1709.05,1.0,58,music,Москва,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15.01.2013,0,25,2555,1099.0,1.0,56,music,Москва,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
full_train.shape

(2935849, 43)

In [47]:
names = {
    "date_block_num": "date_block_num",
    "shop_id": "shop_id",
    "item_id": "item_id",
    "item_category_id": "item_category_id",
    "big_category": "big_category",
    "city": "city",
    ("item_cnt_month", 0): "item_cnt_month_0",
    ("item_cnt_month", 1): "item_cnt_month_1",
    ("item_cnt_month", 2): "item_cnt_month_2",
    ("item_cnt_month", 3): "item_cnt_month_3",
    ("item_cnt_month", 4): "item_cnt_month_4",
    ("item_cnt_month", 5): "item_cnt_month_5",
    ("item_cnt_month", 6): "item_cnt_month_6",
    ("item_cnt_month", 7): "item_cnt_month_7",
    ("item_cnt_month", 8): "item_cnt_month_8",
    ("item_cnt_month", 9): "item_cnt_month_9",
    ("item_cnt_month", 10): "item_cnt_month_10",
    ("item_cnt_month", 11): "item_cnt_month_11",
    ("item_cnt_month", 12): "item_cnt_month_12",
    ("item_cnt_month", 13): "item_cnt_month_13",
    ("item_cnt_month", 14): "item_cnt_month_14",
    ("item_cnt_month", 15): "item_cnt_month_15",
    ("item_cnt_month", 16): "item_cnt_month_16",
    ("item_cnt_month", 17): "item_cnt_month_17",
    ("item_cnt_month", 18): "item_cnt_month_18",
    ("item_cnt_month", 19): "item_cnt_month_19",
    ("item_cnt_month", 20): "item_cnt_month_20",
    ("item_cnt_month", 21): "item_cnt_month_21",
    ("item_cnt_month", 22): "item_cnt_month_22",
    ("item_cnt_month", 23): "item_cnt_month_23",
    ("item_cnt_month", 24): "item_cnt_month_24",
    ("item_cnt_month", 25): "item_cnt_month_25",
    ("item_cnt_month", 26): "item_cnt_month_26",
    ("item_cnt_month", 27): "item_cnt_month_27",
    ("item_cnt_month", 28): "item_cnt_month_28",
    ("item_cnt_month", 29): "item_cnt_month_29",
    ("item_cnt_month", 30): "item_cnt_month_30",
    ("item_cnt_month", 31): "item_cnt_month_31",
    ("item_cnt_month", 32): "item_cnt_month_32",
    ("item_cnt_month", 33): "item_cnt_month_33",
}

In [48]:
full_train.rename(columns=names,inplace=True)

In [49]:
full_train.shape

(2935849, 43)

In [50]:
train.shape

(2935849, 6)

In [51]:
full_train.drop(columns=["item_price", "item_cnt_day", "date",'date_block_num'], inplace=True)

In [52]:
full_train

Unnamed: 0,shop_id,item_id,item_category_id,big_category,city,item_cnt_month_0,item_cnt_month_1,item_cnt_month_2,item_cnt_month_3,item_cnt_month_4,...,item_cnt_month_24,item_cnt_month_25,item_cnt_month_26,item_cnt_month_27,item_cnt_month_28,item_cnt_month_29,item_cnt_month_30,item_cnt_month_31,item_cnt_month_32,item_cnt_month_33
0,59,22154,37,cinema,Ярославль,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,2552,58,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25,2552,58,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,2554,58,music,Москва,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,2555,56,music,Москва,1.0,1.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935844,25,7409,55,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2935845,25,7460,55,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1.0,4.0,4.0,2.0,2.0,0.0,2.0,3.0
2935846,25,7459,55,music,Москва,0.0,0.0,1.0,2.0,0.0,...,2.0,0.0,2.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0
2935847,25,7440,57,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0


In [53]:
# plt_df = full_train.groupby(["date_block_num", "big_category"], as_index=False).sum()

# plt.f(figsize=(20, 10))
# sns.lineplot(x="date_block_num", y="item_cnt_month", data=plt_df, hue="big_category")
# plt.title("Monthly Item Counts by Big Category")

In [54]:
# plt_df = full_train.groupby(["date_block_num", "city"], as_index=False).sum()

# plt.figure(figsize=(20, 10))
# sns.lineplot(x="date_block_num", y="item_cnt_month", data=plt_df, hue="city")
# plt.title("Monthly Item Counts by City")

## test

In [55]:
test = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/test.csv")

In [56]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [57]:
test.isna().sum()

ID         0
shop_id    0
item_id    0
dtype: int64

In [58]:
test.drop(columns="ID", inplace=True)

In [59]:
test.head()

Unnamed: 0,shop_id,item_id
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


# Feature Engineering

In [60]:
full_train

Unnamed: 0,shop_id,item_id,item_category_id,big_category,city,item_cnt_month_0,item_cnt_month_1,item_cnt_month_2,item_cnt_month_3,item_cnt_month_4,...,item_cnt_month_24,item_cnt_month_25,item_cnt_month_26,item_cnt_month_27,item_cnt_month_28,item_cnt_month_29,item_cnt_month_30,item_cnt_month_31,item_cnt_month_32,item_cnt_month_33
0,59,22154,37,cinema,Ярославль,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,2552,58,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25,2552,58,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,2554,58,music,Москва,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,2555,56,music,Москва,1.0,1.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935844,25,7409,55,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2935845,25,7460,55,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1.0,4.0,4.0,2.0,2.0,0.0,2.0,3.0
2935846,25,7459,55,music,Москва,0.0,0.0,1.0,2.0,0.0,...,2.0,0.0,2.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0
2935847,25,7440,57,music,Москва,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0


In [61]:
full_train = full_train.drop_duplicates(subset=['shop_id','item_id'])

In [62]:
full_train.reset_index(drop=True,inplace=True)

In [63]:
from sklearn.preprocessing import LabelEncoder

In [64]:
full_train['big_category'] = LabelEncoder.fit_transform(
    full_train, full_train.big_category
)

  self.classes_, y = _unique(y, return_inverse=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_train['big_category'] = LabelEncoder.fit_transform(


In [65]:
full_train.city = LabelEncoder.fit_transform(full_train, full_train.city)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [66]:
full_train.head()

Unnamed: 0,shop_id,item_id,item_category_id,big_category,city,item_cnt_month_0,item_cnt_month_1,item_cnt_month_2,item_cnt_month_3,item_cnt_month_4,...,item_cnt_month_24,item_cnt_month_25,item_cnt_month_26,item_cnt_month_27,item_cnt_month_28,item_cnt_month_29,item_cnt_month_30,item_cnt_month_31,item_cnt_month_32,item_cnt_month_33
0,59,22154,37,5,30,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,2552,58,12,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25,2554,58,12,13,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25,2555,56,12,13,1.0,1.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,2564,59,12,13,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
X_train = full_train.iloc[:, :-1].drop(columns=['shop_id','item_id','big_category','item_category_id','city'])
#,'big_category','item_category_id','city'

In [69]:
X_train.head()

Unnamed: 0,item_cnt_month_0,item_cnt_month_1,item_cnt_month_2,item_cnt_month_3,item_cnt_month_4,item_cnt_month_5,item_cnt_month_6,item_cnt_month_7,item_cnt_month_8,item_cnt_month_9,...,item_cnt_month_23,item_cnt_month_24,item_cnt_month_25,item_cnt_month_26,item_cnt_month_27,item_cnt_month_28,item_cnt_month_29,item_cnt_month_30,item_cnt_month_31,item_cnt_month_32
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
X_train.reset_index(drop=True,inplace=True)

In [75]:
X_test

Unnamed: 0,item_cnt_month_1,item_cnt_month_2,item_cnt_month_3,item_cnt_month_4,item_cnt_month_5,item_cnt_month_6,item_cnt_month_7,item_cnt_month_8,item_cnt_month_9,item_cnt_month_10,...,item_cnt_month_24,item_cnt_month_25,item_cnt_month_26,item_cnt_month_27,item_cnt_month_28,item_cnt_month_29,item_cnt_month_30,item_cnt_month_31,item_cnt_month_32,item_cnt_month_33
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
424120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
424121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
424122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
X_train.shape

In [None]:
y_train = full_train.iloc[:, -1]

In [None]:
y_train

In [74]:
X_test = full_train.drop(columns=["item_cnt_month_0",'item_id','shop_id','big_category','item_category_id','city'])
#'big_category','item_category_id','city'

In [None]:
X_test.reset_index(drop=True,inplace=True)

In [None]:
X_test

# Validation

In [None]:
# Import and fitting

In [None]:
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [None]:
def lr_parameters(X: pd.DataFrame, y: pd.Series) -> object:
    lr = (
        GridSearchCV(
            estimator=LogisticRegression(),
            param_grid={"C": [x for x in range(1, 8, 1)], "random_state": [3],},
            cv=3,
            verbose=True,
            scoring="accuracy",
        )
        .fit(X, y)
        .best_estimator_
    )
    return lr

In [None]:
def knn_parameters(X: pd.DataFrame, y: pd.Series) -> object:
    knn = (
        GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid={
                "n_neighbors": [4, 5, 6, 7],
                "leaf_size": [x for x in range(5, 50, 5)],
                "p": [1, 2],
                "weights": ["uniform", "distance"],
            },
            cv=3,
            verbose=True,
            scoring="accuracy",
        )
        .fit(X, y)
        .best_estimator_
    )
    return knn

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
lr = LogisticRegression()
LGBM = lgb.LGBMClassifier()
rf = RandomForestClassifier()

In [None]:
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

In [None]:
cv = cross_val_score(knn, X_train, y_train, cv=3)
print(cv)
print(cv.mean())

In [None]:
LGBM.fit(X_train, y_train)

In [None]:
L

In [None]:
vt_clf = VotingClassifier(
    estimators=[('rf',rf),('LGBM',LGBM)],
    voting='soft'
)

In [None]:
# cross validation

In [None]:
cv = cross_val_score(LGBM, X_train, y_train, cv=3)
print(cv)
print(cv.mean())

In [None]:
cv = cross_val_score(vt_clf, X_train, y_train, cv=3)
print(cv)
print(cv.mean())

# Submission

In [None]:
y_test = rf.predict(X_test)

In [None]:
y_test = pd.DataFrame(y_test)

In [None]:
y_test.rename(columns={0:'item_cnt_month'},inplace=True)

In [None]:
test_final = pd.concat([X_test,y_test],axis=1)

In [None]:
test_final['item_id'] = full_train['item_id']

In [None]:
test_final['shop_id'] = full_train['shop_id']

In [None]:
test_final

In [None]:
final = pd.merge(test,test_final,on=['shop_id','item_id'],how='left')

In [None]:
final

In [None]:
final.fillna(0,inplace=True)

In [None]:
final

In [None]:
test = pd.read_csv("/Users/leo/samurai/kaggle/pfs/data/test.csv")
test

In [None]:
final['ID'] = test['ID']

In [None]:
submission = final[['ID','item_cnt_month']]

In [None]:
submission.item_cnt_month.describe()

In [None]:
submission.to_csv('pfs_sub1.csv',index=False)

In [None]:
basic_submission = {"PassengerId": test.PassengerId, "Survived": train_survived}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv("base_submission21.csv", index=False)