# FINAL PROJECT (PREDICT FUTURE SALES)

## 1.Data Loading and Analysis

In [None]:
import os

# Show what is available under the notebook's input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
import numpy as np
import pandas as pd
from sklearn import *
import nltk, datetime
import seaborn as sns
import pickle
import sys
import time
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Lookup tables: items, shops and item categories.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
item_cats = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
# Daily sales records used for training.
train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# Test pairs; 'ID' is moved into the index right after loading.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test = test.set_index('ID')
train_shape, test_shape = train.shape, test.shape
print('train:', train_shape, 'test:', test_shape)
sys.version_info

## 2.EDA

In [None]:
# Box plots of daily item counts and item prices to spot outliers.
plt.figure(figsize=(10, 4))
plt.xlim(-100, 3000)
sns.boxplot(x=train['item_cnt_day'])

# Price axis spans the observed minimum up to 110% of the observed maximum.
price_axis_min = train['item_price'].min()
price_axis_max = train['item_price'].max() * 1.1
plt.figure(figsize=(10, 4))
plt.xlim(price_axis_min, price_axis_max)
sns.boxplot(x=train['item_price'])
plt.show()

#### 2.1 Removing Outliers

In [None]:
# Keep only rows under both outlier thresholds:
# item_price < 100000 and item_cnt_day < 1001.
price_ok = train['item_price'] < 100000
count_ok = train['item_cnt_day'] < 1001
train = train[price_ok & count_ok]

#### 2.2 Replacing Negative Item Price with Median value

In [None]:
# Display the prices of rows whose item_price is negative.
train.loc[train['item_price'] < 0, 'item_price']

In [None]:
# Median positive price for shop 32 / item 2973 / month block 4.
# NOTE(review): presumably the shop, item and month of the negative-price row
# shown by the previous cell - confirm against its output.
reference_rows = (
    (train['shop_id'] == 32)
    & (train['item_id'] == 2973)
    & (train['date_block_num'] == 4)
    & (train['item_price'] > 0)
)
median = train.loc[reference_rows, 'item_price'].median()
# Overwrite every negative price with that median.
train.loc[train['item_price'] < 0, 'item_price'] = median

#### 2.3 Comparing train and test set for detecting differences

In [None]:
# Columns present in train but absent from test.
[column for column in train.columns if column not in test.columns]

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

## 3. Feature Engineering

#### 3.1 Categorising item categories

In [None]:
# Start from the raw category names; positions not covered below keep their
# original name.
l = list(item_cats.item_category_name)
l_cat = l  # same list object, relabelled in place

# (positional index range, coarse label) pairs; ranges are half-open.
category_groups = [
    (range(1, 8), 'Access'),
    (range(10, 18), 'Consoles'),
    (range(18, 25), 'Consoles Games'),
    (range(26, 28), 'phone games'),
    (range(28, 32), 'CD games'),
    (range(32, 37), 'Card'),
    (range(37, 43), 'Movie'),
    (range(43, 55), 'Books'),
    (range(55, 61), 'Music'),
    (range(61, 73), 'Gifts'),
    (range(73, 79), 'Soft'),
]
for index_range, group_label in category_groups:
    for ind in index_range:
        l_cat[ind] = group_label

item_cats['cats'] = l_cat
item_cats.head()

### 3.2 Creating Text Features

#### 3.2.1 Text features for item name

In [None]:
# Collects the names of every TF-IDF column created in this section.
to_drop_cols = []
# Text features for the item name.
feature_cnt = 2
tfidf = feature_extraction.text.TfidfVectorizer(max_features=feature_cnt)
item_names = items['item_name']
items['item_name_len'] = item_names.map(len)  # length of the item name
items['item_name_wc'] = item_names.map(lambda name: len(str(name).split(' ')))  # space-separated word count
txtFeatures = pd.DataFrame(tfidf.fit_transform(item_names).toarray())
cols = list(txtFeatures.columns)
tfidf_prefix = 'item_name_tfidf_'
for i in range(feature_cnt):
    tfidf_col = tfidf_prefix + str(i)
    items[tfidf_col] = txtFeatures[cols[i]]
    to_drop_cols.append(tfidf_col)
items.head()

#### 3.2.2 Text features for item category name

In [None]:
# Text features for the item category name.
feature_cnt = 2
tfidf = feature_extraction.text.TfidfVectorizer(max_features=feature_cnt)
category_names = item_cats['item_category_name']
item_cats['item_category_name_len'] = category_names.map(len)  # length of the category name
item_cats['item_category_name_wc'] = category_names.map(lambda name: len(str(name).split(' ')))  # space-separated word count
txtFeatures = pd.DataFrame(tfidf.fit_transform(category_names).toarray())
cols = txtFeatures.columns
tfidf_prefix = 'item_category_name_tfidf_'
for i in range(feature_cnt):
    tfidf_col = tfidf_prefix + str(i)
    item_cats[tfidf_col] = txtFeatures[cols[i]]
    to_drop_cols.append(tfidf_col)
item_cats.head()

#### 3.2.3 Text features for shop name

In [None]:
# Text features for the shop name.
feature_cnt = 2
tfidf = feature_extraction.text.TfidfVectorizer(max_features=feature_cnt)
shop_names = shops['shop_name']
shops['shop_name_len'] = shop_names.map(len)  # length of the shop name
shops['shop_name_wc'] = shop_names.map(lambda name: len(str(name).split(' ')))  # space-separated word count
txtFeatures = pd.DataFrame(tfidf.fit_transform(shop_names).toarray())
cols = txtFeatures.columns
tfidf_prefix = 'shop_name_tfidf_'
for i in range(feature_cnt):
    tfidf_col = tfidf_prefix + str(i)
    shops[tfidf_col] = txtFeatures[cols[i]]
    to_drop_cols.append(tfidf_col)
shops.head()

### 3.3.1 Creating date block number, shop and item pairs for training part

In [None]:
from itertools import product
ts = time.time()
cols = ['date_block_num','shop_id','item_id']
# For each month block 0..33, pair every shop seen that month with every item
# seen that month.
# NOTE(review): `matrix` is reassigned from `train` in a later cell - confirm
# whether this grid is still meant to be used.
matrix = []
for i in range(34):
    sales = train[train.date_block_num == i]
    month_pairs = product([i], sales.shop_id.unique(), sales.item_id.unique())
    matrix.append(np.array(list(month_pairs), dtype='int16'))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
# Shrink the key columns to the smallest integer types used by this notebook.
for key_col, key_dtype in [('date_block_num', np.int8), ('shop_id', np.int8), ('item_id', np.int16)]:
    matrix[key_col] = matrix[key_col].astype(key_dtype)
matrix.sort_values(cols, inplace=True)
time.time() - ts

In [None]:
# Per-column memory footprint of the matrix, in bytes.
matrix_memory = matrix.memory_usage()
print(matrix_memory)

### 3.3.2 Working on the test set

In [None]:
# Tag every test row with month block 34, i.e. one past the 0..33 blocks
# iterated when the training grid was built.
test['date_block_num'] = 34
# Attach item, item-category and shop features to the test pairs.
test = pd.merge(test, items, how='left', on='item_id')
test = pd.merge(test, item_cats, how='left', on='item_category_id')
test = pd.merge(test, shops, how='left', on='shop_id')

test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# Rebuild `matrix` from the (month, shop, item) keys that occur in `train`.
# `train` has one row per daily sale, so the keys are de-duplicated: the
# monthly target merge and the lag merges further down join on these three
# columns and would multiply rows if a key appeared more than once.
# `.copy()` detaches the frame from `train`, so the casts below are plain
# column assignments rather than writes into a selection.
# NOTE(review): this discards the shop x item grid built in the previous cell -
# confirm that keeping only observed pairs is intended.
cols = ['date_block_num','shop_id','item_id']
matrix = train[cols].drop_duplicates().copy()
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)

matrix.sort_values(cols, inplace=True)
print('\t'.join(matrix.columns))
print(matrix.memory_usage())

### 3.3.3 Working on the training set

In [None]:
import time
ts = time.time()
# Attach shop, item and item-category features to every training row.
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, item_cats, on=['item_category_id'], how='left')

# Name lengths are stored as int16 rather than int8: int8 tops out at 127 and
# numpy's astype wraps silently on overflow, while nothing in this notebook
# bounds the length of a name.
for len_col in ['item_name_len', 'shop_name_len', 'item_category_name_len']:
    matrix[len_col] = matrix[len_col].astype(np.int16)
# Word counts stay at int8.
for wc_col in ['item_name_wc', 'shop_name_wc', 'item_category_name_wc']:
    matrix[wc_col] = matrix[wc_col].astype(np.int8)
# TF-IDF columns are down-cast to float32.
for i in to_drop_cols:
    matrix[i] = matrix[i].astype(np.float32)
print('\t'.join(matrix.columns))
print(time.time() - ts)
print(matrix.shape)

#### Function definition for creating lag

In [None]:
def lag_feature(df, test, lags, col):
    """Add lagged copies of ``col`` to both ``df`` and ``test``.

    For every lag ``i`` in ``lags`` a column ``<col>_lag_<i>`` is added. It
    holds the value ``col`` had for the same (shop_id, item_id) pair ``i``
    month blocks earlier, or NaN when no such row exists.

    Parameters:
        df: frame containing 'date_block_num', 'shop_id', 'item_id' and ``col``.
        test: frame containing the three key columns; it receives the same lag
            columns, looked up in ``df``.
        lags: iterable of integer offsets, in month blocks.
        col: name of the column in ``df`` to lag.

    Returns:
        A ``(df, test)`` tuple of new frames with the lag columns appended.
        Row counts and row order of both inputs are preserved.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    # Keep one row per key. If a key repeats in `df`, the left merges below
    # become many-to-many and multiply rows in both `df` and `test`. When
    # duplicates disagree on `col`, the first occurrence wins.
    tmp = df[keys + [col]].drop_duplicates(subset=keys)
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = keys + [col + '_lag_' + str(i)]
        # Moving the source rows forward by `i` blocks lines each value up
        # with the row that should see it as its lag.
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=keys, how='left')
        test = pd.merge(test, shifted, on=keys, how='left')
    return df, test

#### Generating monthly target values

In [None]:
ts = time.time()
# Per (month, shop, item): sum of daily counts and number of sale rows.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
group = train.groupby(monthly_keys)['item_cnt_day'].agg(['sum', 'count'])
group.columns = ['item_cnt_month', 'orders']
group = group.reset_index()

matrix = matrix.merge(group, on=cols, how='left')
# Missing pairs count as zero sales. NB: the target is clipped to [0, 20] here.
monthly_cnt = matrix['item_cnt_month'].fillna(0).clip(0, 20)
matrix['item_cnt_month'] = monthly_cnt.astype(np.float16)
matrix['orders'] = matrix['orders'].fillna(0).astype(np.float16)
time.time() - ts

#### Garbage collection to free ram space

In [None]:
import gc

# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Mean Encoding of different features

In [None]:
ts = time.time()
# Mean of the monthly target per grouping key, merged onto train and test.
# NOTE(review): the mean is taken over all matrix rows, including the row the
# value is merged back onto - confirm this is acceptable for validation.
column_id = 'item_cnt_month'
for type_ids in [['item_id'], ['shop_id'], ['cats'], ['item_id', 'shop_id']]:
    encoded_name = "mean_of_" + column_id + "_groupby_" + "_".join(type_ids)
    mean_df = matrix[type_ids + [column_id]].groupby(type_ids).mean().reset_index()
    mean_df = mean_df.rename(columns={column_id: encoded_name})

    matrix = matrix.merge(mean_df, on=type_ids, how='left')
    test = test.merge(mean_df, on=type_ids, how='left')
time.time() - ts

#### Garbage collection at regular intervals to free ram space

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Creating Lag

In [None]:
ts = time.time()
# Lag the monthly count and the order count by 1, 2, 3, 6 and 12 month blocks.
for lagged_col in ['item_cnt_month', 'orders']:
    matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], lagged_col)
time.time() - ts

#### Creating mean item price feature

In [None]:
ts = time.time()
# Average sale price of each item within each month block.
price_keys = ['date_block_num', 'item_id']
group = train.groupby(price_keys)['item_price'].mean().reset_index()

matrix = matrix.merge(group, on=price_keys, how='left')
matrix['item_price'] = matrix['item_price'].astype(np.float32)

time.time() - ts

In [None]:
# Per column: does it contain any missing value?
matrix.isna().any()

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - date_block_num & item_id

In [None]:
ts = time.time()
# Aggregates per (month, item); only their lagged copies are kept.
group_keys = ['date_block_num', 'item_id']
feature_dtypes = {
    'date_item_avg_item_cnt': np.float16,
    'date_item_sum_orders': np.float16,
    'date_item_avg_item_price': np.float32,
}
group = matrix.groupby(group_keys).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean'],
})
group.columns = list(feature_dtypes)
group = group.reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
for i, feature_dtype in feature_dtypes.items():
    matrix[i] = matrix[i].astype(feature_dtype)
for i in feature_dtypes:
    matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], i)
    # The same-month aggregate is dropped once its lags exist.
    matrix.drop(columns=[i], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - shop_id & item_id

In [None]:
ts = time.time()
# Aggregates per (shop, item) across all months; only lagged copies are kept.
group_keys = ['shop_id', 'item_id']
feature_dtypes = {
    'shop_item_avg_item_cnt': np.float16,
    'shop_item_sum_orders': np.float16,
    'shop_item_avg_item_price': np.float32,
}
group = matrix.groupby(group_keys).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean'],
})
group.columns = list(feature_dtypes)
group = group.reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
for i, feature_dtype in feature_dtypes.items():
    matrix[i] = matrix[i].astype(feature_dtype)
for i in feature_dtypes:
    matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], i)
    # The unlagged aggregate is dropped once its lags exist.
    matrix.drop(columns=[i], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - date_block_num & shop_id

In [None]:
ts = time.time()
# Aggregates per (month, shop); only their lagged copies are kept.
group_keys = ['date_block_num', 'shop_id']
feature_dtypes = {
    'date_shop_avg_item_cnt': np.float16,
    'date_shop_sum_orders': np.float16,
    'date_shop_avg_item_price': np.float32,
}
group = matrix.groupby(group_keys).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean'],
})
group.columns = list(feature_dtypes)
group = group.reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
for i, feature_dtype in feature_dtypes.items():
    matrix[i] = matrix[i].astype(feature_dtype)
for i in feature_dtypes:
    matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], i)
    # The same-month aggregate is dropped once its lags exist.
    matrix.drop(columns=[i], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - date_block_num & item_category_id

In [None]:
ts = time.time()
# Aggregates per (month, item category); only their lagged copies are kept.
group_keys = ['date_block_num', 'item_category_id']
feature_dtypes = {
    'date_cat_avg_item_cnt': np.float16,
    'date_cat_sum_orders': np.float16,
    'date_cat_avg_item_price': np.float32,
}
group = matrix.groupby(group_keys).agg({
    'item_cnt_month': ['mean'],
    'orders': ['sum'],
    'item_price': ['mean'],
})
group.columns = list(feature_dtypes)
group = group.reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
for i, feature_dtype in feature_dtypes.items():
    matrix[i] = matrix[i].astype(feature_dtype)
for i in feature_dtypes:
    matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], i)
    # The same-month aggregate is dropped once its lags exist.
    matrix.drop(columns=[i], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - shop_id & item_id (monthly mean of item count)

In [None]:
ts = time.time()
# Mean monthly count per (shop, item); only its lagged copies are kept.
# NOTE(review): this is the same aggregate as 'shop_item_avg_item_cnt' from the
# earlier shop_id & item_id cell - confirm both are wanted.
group_keys = ['shop_id', 'item_id']
feature_name = 'shop_item_monthly_mean'
group = matrix.groupby(group_keys)['item_cnt_month'].mean().rename(feature_name).reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
matrix[feature_name] = matrix[feature_name].astype(np.float16)
matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], feature_name)
matrix.drop(columns=[feature_name], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - shop_id

In [None]:
ts = time.time()
# Mean monthly count per shop; only its lagged copies are kept.
group_keys = ['shop_id']
feature_name = 'shop_monthly_mean'
group = matrix.groupby(group_keys)['item_cnt_month'].mean().rename(feature_name).reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
matrix[feature_name] = matrix[feature_name].astype(np.float16)
matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], feature_name)
matrix.drop(columns=[feature_name], inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating features by grouping - item_id

In [None]:
ts = time.time()
# Mean monthly count per item; only its lagged copies are kept.
group_keys = ['item_id']
feature_name = 'item_monthly_mean'
group = matrix.groupby(group_keys)['item_cnt_month'].mean().rename(feature_name).reset_index()

matrix = matrix.merge(group, on=group_keys, how='left')
matrix[feature_name] = matrix[feature_name].astype(np.float16)
matrix, test = lag_feature(matrix, test, [1, 2, 3, 6, 12], feature_name)
matrix.drop(columns=[feature_name], inplace=True)
time.time() - ts

In [None]:
# (rows, columns) of the training matrix after feature generation.
matrix_shape = matrix.shape
print(matrix_shape)

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

## Merging the Train and Test set

In [None]:
ts = time.time()
# Stack the test rows under the training rows so both share one feature frame.
# The former `keys=cols` argument is gone: it supplied three labels for two
# frames and had no effect, because `ignore_index=True` replaces the index.
matrix = pd.concat([matrix, test], ignore_index=True)
# NOTE(review): this zero-fills every NaN in every column, price lags included,
# so the `fill_na` step further down finds nothing left to fill - confirm that
# zero is the intended fill for price columns.
matrix.fillna(0, inplace=True)
time.time() - ts

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

#### Generating month feature

In [None]:
# Month position 1..12 derived from the running month block number.
month_index = matrix['date_block_num'] % 12
matrix['month'] = month_index + 1

In [None]:
# One column name per line.
column_listing = '\n'.join(matrix.columns)
print(column_listing)


In [None]:
# (rows, columns) of the combined train + test matrix.
matrix.shape

In [None]:
# Drop the names of the intermediate frames that are no longer needed.
del train, test, items, item_cats, shops, group
# `data` is a second name for the same frame as `matrix`, not a copy.
data = matrix

In [None]:
# One column name per line, then the (rows, columns) shape.
column_listing = '\n'.join(data.columns)
print(column_listing)
print(data.shape)

#### Dropping the first 12 months (date_block_num 0-11), since lags of up to 12 months are not available for them
#### Also dropping some irrelevant features

In [None]:
ts = time.time()
# Columns removed from the final feature set.
dropped_columns = [
    'orders',
    'cats',
    'item_price',
    'shop_id',
    'shop_name',
    'item_name',
    'item_id',
    'item_category_name',
    'item_category_id',
]
# Keep month blocks 12 and later.
data = data[data['date_block_num'] > 11]
data.drop(columns=dropped_columns, inplace=True)
time.time() - ts

In [None]:
# Tab-separated column names, then the (rows, columns) shape.
column_listing = '\t'.join(data.columns)
print(column_listing)
print(data.shape)

#### Filling null values with appropriate values

In [None]:
ts = time.time()
def fill_na(df):
    """Fill missing values in place, choosing the fill from the column name.

    Columns whose name contains 'item_cnt' or 'orders' are filled with 0.
    Columns whose name contains 'item_price' are filled with that column's
    median. Columns matching none of these, or holding no missing values, are
    left untouched.

    Parameters:
        df: DataFrame to clean; it is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        if 'item_cnt' in col or 'orders' in col:
            fill_value = 0
        elif 'item_price' in col:
            fill_value = df[col].median()
        else:
            continue
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # that form fills a temporary selection and leaves `df` unchanged when
        # pandas Copy-on-Write is active.
        df[col] = df[col].fillna(fill_value)
    return df

# Apply the name-based fill to the feature frame, then show the elapsed time
# since `ts` was set.
data = fill_na(data)
time.time() - ts

In [None]:
# Persist the finished feature frame to the working directory.
data.to_pickle(path='feature.pickle')

In [None]:
# Run a collection pass and report the count returned by gc.collect().
collected = gc.collect()
print("Garbage collector: collected %d objects." % collected)

In [None]:
# Print a summary (columns, dtypes, memory usage) of the final feature frame.
data.info()

# Note: This takes a considerable amount of time to run. I ran it on a Kaggle kernel.