# Read this first 

* each comment and text line added here is important and relevant , dont skim through just the code
* if the comment asks you to revisit some concepts/modules , do so
* this is not complete code on its own , you need to fill in your own code to eventually start building a model

In [87]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [88]:
datafile_train="counterfeit_train.csv"
datafile_test="counterfeit_test.csv"
bd_train=pd.read_csv(datafile_train)
bd_test=pd.read_csv(datafile_test)

In [89]:
bd_train.head(5)

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [90]:
bd_train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [91]:
bd_train.shape

(6818, 12)

In [92]:
missing_col = ['Counterfeit_Weight']
#Technique 1: Using mean to impute the missing values
for i in missing_col:
    bd_train.loc[bd_train.loc[:,i].isnull(),i]=bd_train.loc[:,i].mean()

In [93]:
missing_colT = ['Counterfeit_Weight']
#Technique 1: Using mean to impute the missing values
for i in missing_colT:
    bd_test.loc[bd_test.loc[:,i].isnull(),i]=bd_test.loc[:,i].mean()

In [94]:
bd_train['Medicine_ID'].isna().sum()


0

In [95]:
bd_train['Medicine_ID']

0       RRA15
1       YVV26
2       LJC15
3       GWC40
4       QMN13
        ...  
6813    OYN80
6814    ACW12
6815    OPM10
6816    SLY12
6817    ATT10
Name: Medicine_ID, Length: 6818, dtype: object

In [96]:
bd_train['Medicine_ID'] = bd_train['Medicine_ID'].str[-2:]

In [97]:
bd_test['Medicine_ID'] = bd_test['Medicine_ID'].str[-2:]

In [98]:
bd_train['Medicine_ID'] = bd_train['Medicine_ID'].astype(str).astype(int)

In [99]:
bd_test['Medicine_ID'] = bd_test['Medicine_ID'].astype(str).astype(int)

In [100]:
# in this script we are going to ignore two column Medicing_ID, Counterfeit_Weight
# for improving your results , you should try imputing values for Counterfeit_Weight
# see if you see any pattern between Medcine_ID and Counterfeit_Weight and try to make use of it 



In [101]:
#for col in [ 'Medicine_ID', 'Counterfeit_Weight']:
#    bd_train.drop(col,1,inplace=True)
#    bd_test.drop(col,1,inplace=True)

In [102]:
# for the remaining cat vars we'll create dummies 
# you can try creating dummies with freq cutoff and see if that improves results 

In [103]:
for col in ['Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level',"DistArea_ID"]:  
    temp=pd.get_dummies(bd_train[col],prefix=col,drop_first=True)
    bd_train=pd.concat([temp,bd_train],1)
    bd_train.drop([col],1,inplace=True)
       
    temp=pd.get_dummies(bd_test[col],prefix=col,drop_first=True)
    bd_test=pd.concat([temp,bd_test],1)
    bd_test.drop([col],1,inplace=True)

In [104]:
bd_train.shape

(6818, 39)

In [105]:
bd_test.shape

(1705, 38)

In [106]:
target='Counterfeit_Sales'

In [107]:
#Split the data set for train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(bd_train.drop(['Counterfeit_Sales'], axis=1), 
                                                    bd_train['Counterfeit_Sales'], test_size=0.2, random_state=42)

print('Train values shape:', X_train.shape)
print('Test values shape:', X_test.shape)
print('Train target shape:', y_train.shape)
print('Test target shape:', y_test.shape)


Train values shape: (5454, 38)
Test values shape: (1364, 38)
Train target shape: (5454,)
Test target shape: (1364,)


In [108]:
from sklearn.preprocessing import StandardScaler


sc_X = StandardScaler() 
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

cd_test=sc_X.fit_transform(bd_test)

# Normalizing continuous variables

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range = (0,1))

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

bd_test=scaler.fit_transform(bd_test)

In [47]:
# we are using Lasso model here , you can chose more complex algos for 
# better performance 

In [110]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,LassoCV
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold
from sklearn.metrics import fbeta_score,f1_score,mean_absolute_error
from xgboost import XGBClassifier
from scipy.stats import uniform

In [85]:
lasso_params = {'fit__alpha':[0.005, 0.02, 0.03, 0.05, 0.06]}
model=Lasso(fit_intercept=True)
# Preconfigure estimator and parameters

param_grid = {'alpha': uniform()}
kfold = KFold(n_splits=10)
# Random Search Training with 5 folds Cross Validation
clf = RandomizedSearchCV(model, param_grid, cv=kfold,
                         n_jobs=1, n_iter=100,scoring='neg_mean_absolute_error') 
                              
clf.fit(X_train, y_train)  




In [81]:
clf.best_score_

-823.8133011767568

In [82]:
# Predict label from Test data
y_pred = clf.predict(X_train)
MAE=mean_absolute_error(y_train, y_pred)
#MAE
Score = 1-(MAE/1660)
print(Score)

0.5061566161796723


In [83]:
# Predict label from Test data
y_pred = clf.predict(X_test)
MAE=mean_absolute_error(y_test, y_pred)
#MAE
Score = 1-(MAE/1660)
print(Score)

0.4902693857129712


In [124]:
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

model = GradientBoostingRegressor()
parameters = {'learning_rate': sp_randFloat(),
              'subsample'    : sp_randFloat(),
              'n_estimators' : sp_randInt(100, 1000),
              'max_depth'    : sp_randInt(4, 10)
             }
    

    
randm_src = RandomizedSearchCV(estimator=model, param_distributions = parameters,
                               cv = 2, n_iter = 10, n_jobs=-1)
randm_src.fit(X_train, y_train)    

In [125]:
y_pred = clf.predict(X_test)
MAE=mean_absolute_error(y_test, y_pred)
#MAE
Score = 1-(MAE/1660)
print(Score)

0.4902463043155485


In [None]:
params={'alpha':np.linspace(0.1,100,50)}

In [16]:
model=Lasso(fit_intercept=True)

In [17]:
grid_search=GridSearchCV(model,cv=10,param_grid=params,n_jobs=-1,verbose=10,
                         scoring='neg_mean_absolute_error')
# it will better idea to use randomised search for more complex algos

In [18]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1925s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1422s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed:    4.2s
[Parallel(n_j

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': array([  0.1    ,   2.13878,   4.17755,   6.21633,   8.2551 ,  10.29388,
        12.33265,  14.37143,  16.4102 ,  18.44898,  20.48776,  22.52653,
        24.56531,  26.60408,  28.64286,  30.68163,  32.72041,  34.75918,
        36.79796,  38.83673,  40.87551,  42.91429,  44.95306...     85.72857,  87.76735,  89.80612,  91.8449 ,  93.88367,  95.92245,
        97.96122, 100.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=10)

In [19]:
submissions=pd.DataFrame({target:grid_search.predict(bd_test)})

In [20]:
submissions.to_csv('sample_submission.csv',index=False)