In [1]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.cluster import hierarchy as hc
import os
import numpy as np

# Data Loading

In [2]:
types_dict_train = {'train_id': 'int64',
             'item_condition_id': 'int8',
             'price': 'float64',
             'shipping': 'int8'}

In [3]:
types_dict_test = {'test_id': 'int64',
             'item_condition_id': 'int8',
             'shipping': 'int8'}

In [4]:
#read data for local machine
curr_dir= os.getcwd()
train = pd.read_csv(curr_dir+"//Dataset/train.tsv", sep='\t')
test = pd.read_csv(curr_dir+"//Dataset/test.tsv",sep='\t')

In [5]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [6]:
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [7]:
train.shape,test.shape

((1482535, 8), (693359, 7))

In [8]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df)

In [9]:
display_all(train.describe(include='all').transpose())

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
train_id,1482540.0,,,,741267.0,427971.0,0.0,370634.0,741267.0,1111900.0,1482530.0
name,1482535.0,1225273.0,Bundle,2232.0,,,,,,,
item_condition_id,1482540.0,,,,1.90738,0.903159,1.0,1.0,2.0,3.0,5.0
category_name,1476208.0,1287.0,"Women/Athletic Apparel/Pants, Tights, Leggings",60177.0,,,,,,,
brand_name,849853.0,4809.0,PINK,54088.0,,,,,,,
price,1482540.0,,,,26.7375,38.5861,0.0,10.0,17.0,29.0,2009.0
shipping,1482540.0,,,,0.447274,0.497212,0.0,0.0,0.0,1.0,1.0
item_description,1482531.0,1281426.0,No description yet,82489.0,,,,,,,


### convert category name, item description, name and brand_name into categorical value

In [10]:

train.category_name = train.category_name.astype('category')
train.item_description = train.item_description.astype('category')

train.name = train.name.astype('category')
train.brand_name = train.brand_name.astype('category')

In [11]:
test.category_name = test.category_name.astype('category')
test.item_description = test.item_description.astype('category')

test.name = test.name.astype('category')
test.brand_name = test.brand_name.astype('category')

### Check the type, unique values and null values in our dataset

In [12]:
train.dtypes

train_id                int64
name                 category
item_condition_id       int64
category_name        category
brand_name           category
price                 float64
shipping                int64
item_description     category
dtype: object

In [13]:
test.dtypes

test_id                 int64
name                 category
item_condition_id       int64
category_name        category
brand_name           category
shipping                int64
item_description     category
dtype: object

In [14]:
train.apply(lambda x: x.nunique())

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64

In [15]:
test.apply(lambda x: x.nunique())

test_id              693359
name                 601117
item_condition_id         5
category_name          1223
brand_name             3900
shipping                  2
item_description     609555
dtype: int64

In [18]:
print(train.isnull().sum())
print('-----------------------------------')
train.isnull().sum()/train.shape[0]

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64
-----------------------------------


train_id             0.000000
name                 0.000000
item_condition_id    0.000000
category_name        0.004268
brand_name           0.426757
price                0.000000
shipping             0.000000
item_description     0.000003
dtype: float64

In [19]:
print(test.isnull().sum())
print('------------------------')
test.isnull().sum()/test.shape[0]

test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64
------------------------


test_id              0.000000
name                 0.000000
item_condition_id    0.000000
category_name        0.004410
brand_name           0.426222
shipping             0.000000
item_description     0.000000
dtype: float64

# Model Building

In [20]:
train = train.rename(columns = {'train_id':'id'})

In [21]:
train.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [22]:
test = test.rename(columns = {'test_id':'id'})

In [23]:
test.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [25]:
#adding new column so that train and test can be combine
train['is_train'] = 1
test['is_train'] = 0

In [26]:
#combine train and test dataset by droping 'price'
train_test_combine = pd.concat([train.drop(['price'],axis =1),test],axis = 0)

In [27]:
#converting into categorical values
train_test_combine.category_name = train_test_combine.category_name.astype('category')
train_test_combine.item_description = train_test_combine.item_description.astype('category')

train_test_combine.name = train_test_combine.name.astype('category')
train_test_combine.brand_name = train_test_combine.brand_name.astype('category')

In [28]:
#drop item description
train_test_combine = train_test_combine.drop(['item_description'],axis = 1)

In [29]:
#use label encoding for name, category_name and brand_name
train_test_combine.name = train_test_combine.name.cat.codes
train_test_combine.category_name = train_test_combine.category_name.cat.codes
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes

In [30]:
train_test_combine.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,is_train
0,0,916335,3,829,-1,1,1
1,1,1292428,3,86,3889,0,1
2,2,131013,1,1277,4588,1,1
3,3,802671,1,503,-1,1,1
4,4,65051,1,1204,-1,0,1


In [31]:
train_test_combine.dtypes

id                   int64
name                 int32
item_condition_id    int64
category_name        int16
brand_name           int16
shipping             int64
is_train             int64
dtype: object

In [32]:
#split the combine dataset into train and test
df_test = train_test_combine.loc[train_test_combine['is_train']==0]
df_train = train_test_combine.loc[train_test_combine['is_train']==1]

In [33]:
#drop the columns is_train 
df_test = df_test.drop(['is_train'],axis=1)
df_train = df_train.drop(['is_train'],axis=1)

In [34]:
df_test.shape

(693359, 6)

In [35]:
df_train.shape

(1482535, 6)

In [36]:
#add price to our df_train data frame
df_train['price'] = train.price

In [37]:
#apply log transform on the price for all values which are greater than 0, else return the value
df_train['price'] = df_train['price'].apply(lambda x: np.log(x) if x>0 else x)

In [38]:
df_train.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,price
0,0,916335,3,829,-1,1,2.302585
1,1,1292428,3,86,3889,0,3.951244
2,2,131013,1,1277,4588,1,2.302585
3,3,802671,1,503,-1,1,3.555348
4,4,65051,1,1204,-1,0,3.78419


In [41]:
#split our df_train dataset into x_train and y_train which could be use for training model
x_train,y_train = df_train.drop(['price'],axis =1),df_train.price

In [42]:
#creating a random forest regression model and check the score on train data
m = RandomForestRegressor(n_jobs=-1,min_samples_leaf=3,n_estimators=200)
m.fit(x_train, y_train)
m.score(x_train,y_train)

0.79315353358665308

In [43]:
#predict on test data
preds = m.predict(df_test)

In [44]:
#applying exponent on price
preds = pd.Series(np.exp(preds))

In [45]:
type(preds)

pandas.core.series.Series

In [47]:
#appending test_id and price that we have predicted
submit = pd.concat([df_test.id,preds],axis=1)
submit.columns = ['test_id','price']

In [48]:
#submit
submit.to_csv(curr_dir+"//Basic_RandomForest_pred.csv", index=False)