In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import string
import re

In [2]:
from sklearn import decomposition
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
#read data from kaggle
train = pd.read_csv('../input/train.tsv', sep='\t')
test = pd.read_csv('../input/test.tsv', sep='\t')

In [3]:
#read data for local machine
curr_dir= os.getcwd()
train = pd.read_csv(curr_dir+"//Dataset/train.tsv", sep='\t')
test = pd.read_csv(curr_dir+"//Dataset/test.tsv",sep='\t')

In [4]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [6]:
def cat_split(row):
    try:
        text = row
        txt1, txt2, txt3 = text.split('/')
        return txt1, txt2, txt3
    except:
        return np.nan, np.nan, np.nan

train["cat_1"], train["cat_2"], train["cat_3"] = zip(*train.category_name.apply(lambda val: cat_split(val)))
test["cat_1"], test["cat_2"], test["cat_3"] = zip(*test.category_name.apply(lambda val: cat_split(val)))


In [7]:
train.info()
print("------------------------------------------")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 11 columns):
train_id             1482535 non-null int64
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
cat_1                1471819 non-null object
cat_2                1471819 non-null object
cat_3                1471819 non-null object
dtypes: float64(1), int64(3), object(7)
memory usage: 124.4+ MB
------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 10 columns):
test_id              693359 non-null int64
name                 693359 non-null object
item_condition_id    693359 non-null int64
category_name        690301 non-null object
bra

In [8]:
print(train.isnull().sum())
print ("---------------------------")
test.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
cat_1                 10716
cat_2                 10716
cat_3                 10716
dtype: int64
---------------------------


test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
cat_1                  5104
cat_2                  5104
cat_3                  5104
dtype: int64

In [9]:
train.drop(['category_name'], axis=1, inplace = True)
test.drop(['category_name'], axis=1, inplace = True)

In [10]:
train = train.rename(columns = {'train_id':'id'})
test = test.rename(columns = {'test_id':'id'})

In [11]:
train['is_train'] = 1
test['is_train'] = 0

In [20]:
train_test_combine = pd.concat([train.drop(['price'],axis =1),test],axis = 0)

In [51]:
train_test_combine.head()

Unnamed: 0,id,name,item_condition_id,brand_name,shipping,item_description,is_train,cat_1_Beauty,cat_1_Electronics,cat_1_Handmade,...,cat_3_Women,cat_3_Women's Golf Clubs,cat_3_Wool,cat_3_Work & Safety,cat_3_Wrap,cat_3_Wristlet,cat_3_Writing,cat_3_Yarn,cat_3_Yoga & Pilates,cat_3_Zipper
691450,691450,Ulta HD soft focus foundation,3,Ulta,0,Not my color. LIGHT COOL Used a tiny dot of th...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3217,3217,Tartelette in bloom,3,Tarte,1,Only used twice. It is 100% authentic.,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
974292,974292,❤LIQUIDATING! TOO FACED MELTED LIPPY SET,1,Too Faced,1,PLEASE READ MY BIO BEFORE BUYING FROM MY CLOSE...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
315086,315086,Travel Size Setting Powder,1,Dermablend,1,This is a loose translucent powder that sets m...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
315085,315085,4 Too Faced Lipstick bundle,4,Too Faced,0,You get all 4. Usage is a clearly as shown. I ...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
train_test_combine['cat_3'].describe()

count                     2160074
unique                        876
top       Pants, Tights, Leggings
freq                        88077
Name: cat_3, dtype: object

In [23]:
train_test_combine = train_test_combine.sort_values("cat_1")

In [29]:
cat_1_list = list(train_test_combine['cat_1'].unique())

In [30]:
cat_1_list

['Beauty',
 'Electronics',
 'Handmade',
 'Home',
 'Kids',
 'Men',
 'Other',
 'Sports & Outdoors',
 'Vintage & Collectibles',
 'Women',
 nan]

In [None]:
df_1 = train_test_combine[train_test_combine['cat_1'] == 'Beauty']
df_2 = train_test_combine[train_test_combine['cat_1'] == 'Electronics']
df_3 = train_test_combine[train_test_combine['cat_1'] == 'Handmade']
df_4 = train_test_combine[train_test_combine['cat_1'] == 'Home']
df_5 = train_test_combine[train_test_combine['cat_1'] == 'Kids']
df_6 = train_test_combine[train_test_combine['cat_1'] == 'Other']
df_7 = train_test_combine[train_test_combine['cat_1'] == 'Sports & Outdoors']
df_8 = train_test_combine[train_test_combine['cat_1'] == 'Vintage & Collectibles']
df_9 = train_test_combine[train_test_combine['cat_1'] == 'Women']
df_10 = train_test_combine[train_test_combine['cat_1'].isnull()]

In [57]:
df_1

Unnamed: 0,id,name,item_condition_id,brand_name,shipping,item_description,is_train,cat_1_Beauty,cat_2_Bath & Body,cat_2_Fragrance,...,cat_3_Shampoo & Conditioner Sets,cat_3_Shampoo Plus Conditioner,cat_3_Shampoos,cat_3_Styling Products,cat_3_Styling Tools,cat_3_Sun,cat_3_Toiletry Kits,cat_3_Tweezers,cat_3_Waxing,cat_3_Women
691450,691450,Ulta HD soft focus foundation,3,Ulta,0,Not my color. LIGHT COOL Used a tiny dot of th...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3217,3217,Tartelette in bloom,3,Tarte,1,Only used twice. It is 100% authentic.,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
974292,974292,❤LIQUIDATING! TOO FACED MELTED LIPPY SET,1,Too Faced,1,PLEASE READ MY BIO BEFORE BUYING FROM MY CLOSE...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
315086,315086,Travel Size Setting Powder,1,Dermablend,1,This is a loose translucent powder that sets m...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
315085,315085,4 Too Faced Lipstick bundle,4,Too Faced,0,You get all 4. Usage is a clearly as shown. I ...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1174061,1174061,Caramel Sugar Body Scrub,1,,1,New and sealed. Caramel sugar body scrub. Soft...,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1395559,1395559,Clinique pep-start eye cream 3ml,1,Clinique,1,Brand new never used Price is firm Bundle to s...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
716984,716984,KKW x Kylie creme lipsticks,3,Kylie Cosmetics,0,Each color swatched once. Shades were too ligh...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
809896,809896,NEW! LA Girl Pro Concealer - Pure Beige,1,,1,"Brand new, never opened. Perfect with conceali...",1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
390160,390160,Bindi,1,,1,"Brand new, it's colorful bindi's. It's 5 in ea...",0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
#one hot encoding
train_test_combine = pd.get_dummies(train_test_combine, columns=["cat_1"])
train_test_combine = pd.get_dummies(train_test_combine, columns=["cat_2"])
train_test_combine = pd.get_dummies(train_test_combine, columns=["cat_3"])


In [56]:
#one hot encoding on one dataset
df_1 = pd.get_dummies(df_1, columns=["cat_1"])
df_1 = pd.get_dummies(df_1, columns=["cat_2"])
df_1 = pd.get_dummies(df_1, columns=["cat_3"])

In [27]:
train_test_combine.cat_1 = train_test_combine.cat_1.astype('category')
train_test_combine.cat_2 = train_test_combine.cat_2.astype('category')
train_test_combine.cat_3 = train_test_combine.cat_3.astype('category')

In [28]:
#Label Encoding
train_test_combine.cat_1 = train_test_combine.cat_1.cat.codes
train_test_combine.cat_2 = train_test_combine.cat_2.cat.codes
train_test_combine.cat_3 = train_test_combine.cat_3.cat.codes

In [None]:
train_test_combine.shape

In [30]:
train_test_combine = train_test_combine.drop(['item_description'],axis = 1)

In [None]:
#train_test_combine['brand_name'] = train_test_combine['brand_name'].fillna("No_brand")

In [31]:
train_test_combine.name = train_test_combine.name.astype('category')
train_test_combine.brand_name = train_test_combine.brand_name.astype('category')

In [32]:
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes
train_test_combine.name = train_test_combine.name.cat.codes

In [33]:
df_test = train_test_combine.loc[train_test_combine['is_train']==0]
df_train = train_test_combine.loc[train_test_combine['is_train']==1]

In [34]:
df_test = df_test.drop(['is_train'],axis=1)
df_train = df_train.drop(['is_train'],axis=1)

In [35]:
df_train['price'] = train.price

In [36]:
df_train.head()

Unnamed: 0,id,name,item_condition_id,brand_name,shipping,cat_1,cat_2,cat_3,price
0,0,916335,3,-1,1,5,102,769,10.0
1,1,1292428,3,3889,0,1,30,215,52.0
2,2,131013,1,4588,1,9,103,97,10.0
3,3,802671,1,-1,1,3,55,407,35.0
4,4,65051,1,-1,0,9,58,538,44.0


In [37]:
df_test.head()

Unnamed: 0,id,name,item_condition_id,brand_name,shipping,cat_1,cat_2,cat_3
0,0,323913,1,-1,1,9,58,662
1,1,65692,1,-1,1,6,71,696
2,2,410310,1,1094,1,8,7,379
3,3,544668,2,-1,0,9,96,166
4,4,810652,3,-1,1,6,14,657


In [38]:
df_train['price'] = df_train['price'].apply(lambda x: np.log(x) if x>0 else x)

In [39]:
x_train,y_train = df_train.drop(['price'],axis =1),df_train.price

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

In [None]:
#convert it to numpy arrays
X=df_train.values

#Scaling the values
X = scale(X)

pca = PCA(n_components=44)

pca.fit(X)

#The amount of variance that each PC explains
var= pca.explained_variance_ratio_

#Cumulative Variance explains
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

In [None]:
m = RandomForestRegressor(n_jobs=-1,min_samples_leaf=3,n_estimators=200)
m.fit(x_train, y_train)
m.score(x_train,y_train)

In [None]:
preds = m.predict(df_test)

In [None]:
preds = pd.Series(np.exp(preds))

In [None]:
type(preds)

In [None]:
submit = pd.concat([df_test.id,preds],axis=1)

In [None]:
submit.columns = ['test_id','price']