In [37]:
import numpy as np 
import pandas as pd 
import math
from numpy import unique
import os
# import pandas_profiling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings("ignore")

In [38]:
dtypes = {
    "InvoiceNo"    : "int32",
    "StockCode"    : "int16",
    "Description"  : "int16",
    "Quantity"     : "int32",
    "InvoiceDate"  : "object",
    "UnitPrice"    : "float64",
    "CustomerID"   : "int32",
    "Country"      : "int8"
}

train_data = pd.read_csv("../input/the-great-indian-hiring-hackathon/Participants_Data_TGIH/Train.csv",dtype=dtypes, parse_dates=["InvoiceDate"])

In [39]:
train_data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6141,1583,144,3,2011-05-06 16:54:00,3.75,14056,35
1,6349,1300,3682,6,2011-05-11 07:35:00,1.95,13098,35
2,16783,2178,1939,4,2011-11-20 13:20:00,5.95,15044,35
3,16971,2115,2983,1,2011-11-22 12:07:00,0.83,15525,35
4,6080,1210,2886,12,2011-05-06 09:00:00,1.65,13952,35


In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284780 entries, 0 to 284779
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    284780 non-null  int32         
 1   StockCode    284780 non-null  int16         
 2   Description  284780 non-null  int16         
 3   Quantity     284780 non-null  int32         
 4   InvoiceDate  284780 non-null  datetime64[ns]
 5   UnitPrice    284780 non-null  float64       
 6   CustomerID   284780 non-null  int32         
 7   Country      284780 non-null  int8          
dtypes: datetime64[ns](1), float64(1), int16(2), int32(3), int8(1)
memory usage: 9.0 MB


In [41]:
train_data.describe()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
count,284780.0,284780.0,284780.0,284780.0,284780.0,284780.0,284780.0
mean,9955.394083,1573.386807,2023.955573,12.028468,3.451216,15288.224278,32.823846
std,5551.519138,843.604991,1089.812078,295.111588,78.399541,1714.336529,6.743602
min,0.0,0.0,0.0,-80995.0,0.0,12346.0,0.0
25%,5069.0,939.0,1141.0,2.0,1.25,13953.0,35.0
50%,10310.0,1521.0,1987.0,5.0,1.95,15152.0,35.0
75%,14657.0,2106.0,2945.0,12.0,3.75,16794.0,35.0
max,22188.0,3683.0,3895.0,80995.0,38970.0,18287.0,36.0


In [42]:
# report = pandas_profiling.ProfileReport(train_data)
# display(report)

In [43]:
def convert_weeks(series):
    ts = series - series.min()
    return ts.apply(lambda x: math.ceil(x.days/7))
train_data['weeks'] = convert_weeks(train_data['InvoiceDate'])

In [44]:
train_data = train_data.drop(columns=['Description','InvoiceDate','InvoiceNo','CustomerID','Quantity','Country'])

In [45]:
train_data.drop_duplicates(inplace=True)

In [46]:
train_data = train_data.drop(train_data[train_data['UnitPrice'] <= 0.1].index, axis=0)

In [47]:
def remove_outlier(cols):
    minimum_val = cols.min()
    urlimit = 5 * minimum_val
    return urlimit

In [48]:

def to_chunks(values, column_name):
    Data_StockCode_Wise = {}
    chunk_ids = unique(values[column_name])
    for chunk_id in chunk_ids:
        selection = values[column_name] == chunk_id
        Data_StockCode_Wise[chunk_id] = values[selection]
        
    return Data_StockCode_Wise

Data_StockCode_Wise_Train = to_chunks(train_data,'StockCode')

In [49]:
def train(X_train, y_train):    
    clf_dt = DecisionTreeRegressor()
    clf_lr = LinearRegression()
    clf_l = Lasso()
    clf_en = ElasticNet()
    clf_svr = SVR()

    clf_dt.fit(X_train, y_train)
    clf_lr.fit(X_train, y_train)
    clf_l.fit(X_train, y_train)
    clf_en.fit(X_train, y_train)
    clf_svr.fit(X_train, y_train)
    
    return [clf_svr, clf_lr, clf_dt, clf_l, clf_en]

In [50]:
def predict(models, X_test):
    y_pred_array = []
    for model in models:
        index_col = X_test.index
        temp_pred = model.predict(X_test).flatten()
        y_pred_array.append([temp_pred])
    ind_col     = pd.DataFrame(index_col)
    y_pred_mean = pd.DataFrame(np.mean( np.array(y_pred_array), axis=0))
    return pd.concat([ind_col, y_pred_mean.T], axis=1)

In [51]:
dtypes = {
    "InvoiceNo"    : "int32",
    "StockCode"    : "int16",
    "Description"  : "int16",
    "Quantity"     : "int32",
    "InvoiceDate"  : "object",
    "CustomerID"   : "int32",
    "Country"      : "int8"
}

test_df = pd.read_csv("../input/the-great-indian-hiring-hackathon/Participants_Data_TGIH/Test.csv",dtype=dtypes, parse_dates=["InvoiceDate"])
test_df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,CustomerID,Country
0,3139,1709,1079,3,2011-02-22 15:22:00,16161,35
1,617,510,3457,1,2010-12-08 12:46:00,17341,35
2,14653,604,694,36,2011-10-25 13:53:00,15158,35
3,8634,1478,3473,2,2011-06-27 12:38:00,16033,35
4,15546,3216,871,1,2011-11-06 16:14:00,15351,35
...,...,...,...,...,...,...,...
122044,7813,2487,1289,12,2011-06-09 09:44:00,15214,35
122045,8694,3069,217,16,2011-06-28 15:47:00,16200,35
122046,915,1419,3474,6,2010-12-13 12:26:00,15555,35
122047,8156,1631,1434,1,2011-06-16 12:45:00,15907,35


In [52]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122049 entries, 0 to 122048
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    122049 non-null  int32         
 1   StockCode    122049 non-null  int16         
 2   Description  122049 non-null  int16         
 3   Quantity     122049 non-null  int32         
 4   InvoiceDate  122049 non-null  datetime64[ns]
 5   CustomerID   122049 non-null  int32         
 6   Country      122049 non-null  int8          
dtypes: datetime64[ns](1), int16(2), int32(3), int8(1)
memory usage: 2.9 MB


In [53]:
test_df['weeks'] = convert_weeks(test_df['InvoiceDate'])
test_df = test_df.drop(columns=['Description','InvoiceDate','InvoiceNo','CustomerID','Quantity','Country'])

In [54]:
Data_StockCode_Wise_Test = to_chunks(test_df,'StockCode')

In [55]:
trained_models = {}
for StockCode_Value in Data_StockCode_Wise_Train:
    df = Data_StockCode_Wise_Train[StockCode_Value]
    X_train = df.drop(columns=['UnitPrice'])
    y_train = df.UnitPrice
    trained_models[StockCode_Value] = train(X_train, y_train)

In [56]:
preds_stockcode = {}
fill_missing_stockcode_value = train_data['UnitPrice'].mean()
for StockCode_Value in Data_StockCode_Wise_Test:
    if not (StockCode_Value in trained_models.keys()):
        ind_col = pd.DataFrame(Data_StockCode_Wise_Test[StockCode_Value].index)
        fmsv    = pd.DataFrame([fill_missing_stockcode_value]*(ind_col.shape[0]))
        preds_stockcode[StockCode_Value] = pd.concat([ind_col, fmsv], axis=1)
    else:
        preds_stockcode[StockCode_Value] = predict(trained_models[StockCode_Value], Data_StockCode_Wise_Test[StockCode_Value])

In [57]:
preds_stockcode_lis = []
for value in preds_stockcode.values():
    preds_stockcode_lis.append(value)
final_pred = pd.concat(preds_stockcode_lis)
final_pred.columns = ['Index_Col','UnitPrice']
final_pred.sort_values(by='Index_Col', ascending=True, inplace=True)

In [58]:
result = final_pred['UnitPrice']

In [59]:
result

0       1.650000
0       1.244454
0       4.058058
0       1.678768
0      12.238398
         ...    
9       0.380000
419     1.626642
100     1.134349
42      3.777822
106     4.168994
Name: UnitPrice, Length: 122049, dtype: float64