In [0]:
#Import libraries

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [0]:
#Read the train and test CSV files into DataFrames

train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (r'/content/sample_data/counterfeit_train.csv',
                     r'/content/sample_data/counterfeit_test.csv')
)

In [3]:
#Show (rows, columns) of the train and test frames as loaded
print(train_data.shape, test_data.shape)

(6818, 12) (1705, 11)


In [0]:
#Stack test and train into one frame so imputation and encoding are applied identically to both.
#The 'data' column records where each row came from; test rows get a NaN placeholder for the target.
train_data = train_data.assign(data='train')
test_data = test_data.assign(Counterfeit_Sales=np.nan, data='test')[train_data.columns]

#ignore_index=True: both frames carry their own 0-based index from read_csv, so keeping them
#would leave duplicate row labels in data_all. Test rows come first and therefore still get
#labels 0..len(test_data)-1.
data_all = pd.concat([test_data, train_data], axis = 0, ignore_index=True)

In [5]:
#List every column that contains at least one missing value

has_missing = data_all.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()
print(cols_with_missing)

['Counterfeit_Weight', 'Counterfeit_Sales']


In [0]:
#Fill the gaps in Counterfeit_Weight with the column mean
#NOTE(review): the mean is taken over data_all, i.e. train and test rows together - confirm this is intended

weight_mean = data_all['Counterfeit_Weight'].mean()
data_all['Counterfeit_Weight'] = data_all['Counterfeit_Weight'].fillna(weight_mean)


In [0]:
#Remove the identifier and date columns, which are not used as model features

data_all = data_all.drop(columns=['Medicine_ID', 'Active_Since'])

In [8]:
#Count the distinct non-null values in each column of the combined frame
data_all.nunique()

Counterfeit_Weight      416
DistArea_ID              10
Medicine_MRP           5970
Medicine_Type            16
SidEffect_Level           2
Availability_rating    7884
Area_Type                 4
Area_City_Type            3
Area_dist_level           4
Counterfeit_Sales      3142
data                      2
dtype: int64

In [9]:
#Collect the object-dtype columns; these are the candidates for encoding

cols_objs = [name for name, dtype in data_all.dtypes.items() if dtype == 'object']
print(cols_objs)

['DistArea_ID', 'Medicine_Type', 'SidEffect_Level', 'Area_Type', 'Area_City_Type', 'Area_dist_level', 'data']


In [10]:
#Preview the first 10 rows of the combined frame
data_all.head(10)

Unnamed: 0,Counterfeit_Weight,DistArea_ID,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales,data
0,14.157645,Area027,85.5328,Antibiotics,mild,0.112747,CityLimits,Tier 3,Medium,,test
1,13.45,Area045,257.146,OralContraceptives,mild,0.144446,DownTown,Tier 2,Unknown,,test
2,7.1,Area045,98.1172,Antipyretics,mild,0.144221,DownTown,Tier 2,Unknown,,test
3,18.3,Area010,135.373,Tranquilizers,mild,0.100388,MidTownResidential,Tier 3,Unknown,,test
4,14.157645,Area019,112.8016,OralContraceptives,mild,0.022585,MidTownResidential,Tier 1,Small,,test
5,14.45,Area010,190.2976,OralContraceptives,mild,0.074382,MidTownResidential,Tier 3,Unknown,,test
6,14.157645,Area027,163.3656,OralContraceptives,critical,0.073134,CityLimits,Tier 3,Medium,,test
7,17.15,Area046,226.9166,Antacids,mild,0.082101,DownTown,Tier 1,Small,,test
8,14.157645,Area027,225.0534,OralContraceptives,mild,0.094614,CityLimits,Tier 3,Medium,,test
9,15.1,Area045,101.0172,Cardiac,critical,0.027763,DownTown,Tier 2,Unknown,,test


In [0]:
#Encoding plan for the categorical columns
#  label encoding   -> columns treated as ordinal: SidEffect_Level, Area_City_Type, Area_dist_level
#  one-hot encoding -> columns treated as nominal: DistArea_ID, Medicine_Type, Area_Type

label_X_train_cols = [
    'SidEffect_Level',
    'Area_City_Type',
    'Area_dist_level',
]
OH_X_train_cols = [
    'DistArea_ID',
    'Medicine_Type',
    'Area_Type',
]

In [0]:
#Apply Label Encoder to categorical columns containing ordinal values

#.copy() gives an independent frame, so the column assignments in the loop do not write
#into a slice of data_all (which raises SettingWithCopyWarning)
label_X_train = data_all[label_X_train_cols].copy()


labl_encoder = LabelEncoder()

#The single encoder is re-fitted for every column, so afterwards it only remembers the last one.
#NOTE(review): LabelEncoder numbers categories in sorted (alphabetical) order, so the codes need
#not follow the real-world ranking of the levels - confirm this is acceptable or use an explicit mapping
for col in label_X_train_cols:
  label_X_train[col] = labl_encoder.fit_transform(label_X_train[col])
  

In [13]:
#Preview the first rows of the label-encoded columns
label_X_train.head()

Unnamed: 0,SidEffect_Level,Area_City_Type,Area_dist_level
0,1,2,1
1,1,1,3
2,1,1,3
3,1,2,3
4,1,0,2


In [0]:
#Apply One-Hot Encoder to categorical columns containing nominal values

#The dense-output keyword was renamed between scikit-learn releases (sparse -> sparse_output),
#so keep the default sparse result and densify it with .toarray(), which works on every version
OH_Encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoded = OH_Encoder.fit_transform(data_all[OH_X_train_cols]).toarray()

#Name each indicator column '<source column>_<category>'. The default integer column names
#would mix with the string names already in data_all, which newer scikit-learn estimators reject
OH_feature_names = [f'{col}_{category}'
                    for col, categories in zip(OH_X_train_cols, OH_Encoder.categories_)
                    for category in categories]

#Reuse data_all's row labels so the encoded block lines up row-for-row when concatenated later
OH_X_train = pd.DataFrame(OH_encoded, index=data_all.index, columns=OH_feature_names)


In [0]:
#Drop the raw categorical columns (their one-hot and label encoded versions replace them)

data_all = data_all.drop(columns=OH_X_train_cols + label_X_train_cols)


In [16]:
#Show (rows, columns) of the combined frame at this point
data_all.shape

(8523, 5)

In [0]:
#Append the label-encoded and one-hot-encoded columns side by side to the remaining columns
encoded_parts = [data_all, label_X_train, OH_X_train]
data_all = pd.concat(encoded_parts, axis='columns')

In [18]:
#Show (rows, columns) of the combined frame at this point
data_all.shape

(8523, 38)

In [0]:
#Split the combined frame back into train and test using the 'data' marker column.
#drop() returns new frames, so nothing is deleted in place from a filtered slice of data_all
#(the original del / inplace drop triggers SettingWithCopyWarning).
train_data = data_all[data_all['data']=='train'].drop(columns='data')

#The test rows only carry the NaN target placeholder, so drop it along with the marker
test_data = data_all[data_all['data']=='test'].drop(columns=['Counterfeit_Sales', 'data'])
print(train_data.shape, test_data.shape)

In [20]:
#Separate the target (Counterfeit_Sales) from the feature columns
y = train_data.loc[:, 'Counterfeit_Sales']
X = train_data.drop(columns=['Counterfeit_Sales'])
print(X.shape, test_data.shape)

(6818, 36) (1705, 36)


In [0]:
#Hold out 30% of the labelled rows for evaluation; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

In [22]:
#Baseline: ordinary linear regression, scored by mean absolute error on the hold-out rows

model1 = LinearRegression().fit(X_train, y_train)
pred1 = model1.predict(X_test)
error1 = mean_absolute_error(y_true=y_test, y_pred=pred1)
print("MAE for Simple Linear Regression model : ", error1)

MAE for Simple Linear Regression model :  808.4181797772659


In [23]:
#Random Forest Regression model with 'mean absolute error' criterion

#scikit-learn 1.0 renamed this criterion from 'mae' to 'absolute_error' and later removed the
#old spelling, so pick whichever name the installed version accepts
mae_criterion = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

model2 = RandomForestRegressor(n_estimators=500, criterion = mae_criterion, random_state= 10, n_jobs= 3)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)
error2 = mean_absolute_error(y_test, pred2)
print("MAE for RandomForest Regression model : ", error2)

MAE for RandomForest Regression model :  768.0487781150539


In [24]:
#Show (rows, columns) of the training partition
X_train.shape

(4772, 36)

In [25]:
#Show (rows, columns) of the hold-out partition
X_test.shape

(2046, 36)

In [0]:
#Recombine the two partitions (training rows first, then hold-out rows) for the final fit
X_final = pd.concat((X_train, X_test), axis='index')
y_final = pd.concat((y_train, y_test), axis='index')

In [27]:
#Show the shapes of the recombined features and target
print(X_final.shape, y_final.shape)

(6818, 36) (6818,)


In [0]:
#Refit the random forest on all labelled rows, then predict the unlabelled test rows
final_preds = model2.fit(X_final, y_final).predict(test_data)

In [0]:
#Write the test-set predictions to submission.csv
#NOTE(review): 'Id' is filled from the test frame's row index and the prediction column is named
#'SalePrice' - confirm both against the required submission format
output = pd.DataFrame({'Id': test_data.index}).assign(SalePrice=final_preds)
output.to_csv('submission.csv', index=False)