In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from catboost import CatBoostRegressor

*Here We are importing the data*

We have decided to import the train and the test wine data

In [25]:
wine_train=pd.read_csv('Datasets/train.csv')
wine_test=pd.read_csv('Datasets/test.csv')

**Head of test data and train data**

In [26]:
wine_test.head(1)

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,41855,US,"Sweet new oak stands out, giving this Cabernet...",3D,95.036469,,California,St. Helena,Napa,,,Salvestrin 2007 3D Cabernet Sauvignon (St. Hel...,CABERNET SAUVIGNON,Salvestrin,0


In [27]:
wine_train.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027


**Data train info**

In [28]:
wine_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 14 columns):
country                  174953 non-null object
description              175000 non-null object
designation              122734 non-null object
points                   175000 non-null float64
price                    175000 non-null float64
province                 174953 non-null object
region_1                 146466 non-null object
region_2                 75394 non-null object
taster_name              65509 non-null object
taster_twitter_handle    62190 non-null object
title                    82189 non-null object
variety                  174999 non-null object
winery                   175000 non-null object
id                       175000 non-null int64
dtypes: float64(2), int64(1), object(11)
memory usage: 18.7+ MB


Data Description

In [29]:
wine_train.describe()

Unnamed: 0,points,price,id
count,175000.0,175000.0,175000.0
mean,88.083987,34.3044,70684.04724
std,3.157001,38.398146,41341.638798
min,79.636128,4.0,1.0
25%,85.971283,16.0,35020.0
50%,87.981631,25.0,70256.5
75%,90.085631,40.0,105550.25
max,100.220603,2500.0,150929.0


**Install Package to use for**

In [None]:
!pip install catboost

The Model that we are going to implement is the cat Boost regression Model

**Code to identify the missing values in our train dataset**

In [None]:
wine_train.isnull().sum()

In [None]:
wine_train.head(1)

In [None]:
wine_train[]

In [24]:
# wine_train.columns[wine_train.dtypes == 'object']

Index([], dtype='object')

In [31]:
# convert categorical columns to integers
category_cols = wine_train.columns[wine_train.dtypes == 'object']
for header in category_cols:
    wine_train[header] = wine_train[header].astype('category').cat.codes
    wine_test[header] = wine_test[header].astype('category').cat.codes

In [8]:
# wine_train['country']=pd.Categorical(wine_train['country'])
# wine_train["country"] = wine_train.country.cat.codes

# wine_train['description']=pd.Categorical(wine_train['description'])
# wine_train["description"] = wine_train.description.cat.codes

# wine_train['designation']=pd.Categorical(wine_train['designation'])
# wine_train["designation"] = wine_train.designation.cat.codes

# wine_train['province']=pd.Categorical(wine_train['province'])
# wine_train["province"] = wine_train.province.cat.codes

# wine_train['region_1']=pd.Categorical(wine_train['region_1'])
# wine_train["region_1"] = wine_train.region_1.cat.codes

# wine_train['region_2']=pd.Categorical(wine_train['region_2'])
# wine_train["region_2"] = wine_train.region_2.cat.codes


# wine_train['taster_name']=pd.Categorical(wine_train['taster_name'])
# wine_train["taster_name"] = wine_train.taster_name.cat.codes

# wine_train['taster_twitter_handle']=pd.Categorical(wine_train['taster_twitter_handle'])
# wine_train["taster_twitter_handle"] = wine_train.taster_twitter_handle.cat.codes

# wine_train['title']=pd.Categorical(wine_train['title'])
# wine_train["title"] = wine_train.title.cat.codes

# wine_train['variety']=pd.Categorical(wine_train['variety'])
# wine_train["variety"] = wine_train.variety.cat.codes

# wine_train['winery']=pd.Categorical(wine_train['winery'])
# wine_train["winery"] = wine_train.winery.cat.codes

Let us declare our model variable and the corresponding features

In [95]:
y = wine_train['price']
# X=wine_train[['country', 'description','designation',
#               'province', 'region_1', 'region_2', 'taster_name',
#               'taster_twitter_handle', 'title', 'variety', 'winery']]
    
category_cols = ['description','designation','points','province'
              ,'country','variety','winery', 'region_1', 'region_2']

X = wine_train[category_cols]

Let's Train the model

In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

Building the model by using CatBoost package

In [145]:
model_cat = CatBoostRegressor(iterations=10300,
                              learning_rate=0.03)

In [146]:
cat_dims = [X_train.columns.get_loc(i) for i in category_cols[:-1]] 

In [147]:
model_cat.fit(X, y, silent=True) #, cat_features=cat_dims

<catboost.core.CatBoostRegressor at 0x7f58eab92a90>

In [148]:
# Total Training 
# model_cat.fit(X, y, silent=True)

Model evaluation

In [150]:
preds = model_cat.predict(X_test)

# preds
# import numpy as np
# np.shape(wine_train)
# len(preds)

# 32.68700878192631 (Best)

In [151]:

from sklearn import metrics
print (np.sqrt(metrics.mean_squared_error(preds,y_test)))

17.0654985906379


## Grid search

In [70]:
from paramsearch import paramsearch
from itertools import product,chain
from sklearn.model_selection import KFold

In [71]:
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'ctr_border_count':[50,5,10,20,100,200],
          'thread_count':4}

In [72]:
# this function does 3-fold crossvalidation with catboostclassifier          
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
    kf = KFold(n_splits=n_splits,shuffle=True) 
    res = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index,:]
        test = train_set.iloc[test_index,:]

        labels = train_label.ix[train_index]
        test_labels = train_label.ix[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)

        res.append(np.mean(clf.predict(test)==np.ravel(test_labels)))
    return np.mean(res)

In [74]:
# this function runs grid search on several parameters
# def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
#     ps = paramsearch(params)
#     # search 'border_count', 'l2_leaf_reg' etc. individually 
#     #   but 'iterations','learning_rate' together
#     for prms in chain(ps.grid_search(['border_count']),
#                       ps.grid_search(['ctr_border_count']),
#                       ps.grid_search(['l2_leaf_reg']),
#                       ps.grid_search(['iterations','learning_rate']),
#                       ps.grid_search(['depth'])):
#         res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
#         # save the crossvalidation result so that future iterations can reuse the best parameters
#         ps.register_result(res,prms)
#         print(res,prms,s,'best:',ps.bestscore(),ps.bestparam())
#     return ps.bestparam()

# bestparams = catboost_param_tune(params,X_train,y_train)

# Xboost

In [None]:
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBRegressor()

xgb_model.fit(X_train, y_train)
# xgb_model.fit(X, y)


y_pred_xgb = xgb_model.predict(X_test)

In [None]:
print (np.sqrt(metrics.mean_squared_error(y_pred_xgb,y_test)))

# Testing

In [None]:
wine_test['country']=pd.Categorical(wine_test['country'])
wine_test["country"] = wine_test.country.cat.codes

wine_test['description']=pd.Categorical(wine_test['description'])
wine_test["description"] = wine_test.description.cat.codes

wine_test['designation']=pd.Categorical(wine_test['designation'])
wine_test["designation"] = wine_test.designation.cat.codes

wine_test['province']=pd.Categorical(wine_test['province'])
wine_test["province"] = wine_test.province.cat.codes

wine_test['region_1']=pd.Categorical(wine_test['region_1'])
wine_test["region_1"] = wine_test.region_1.cat.codes

wine_test['region_2']=pd.Categorical(wine_test['region_2'])
wine_test["region_2"] = wine_test.region_2.cat.codes


wine_test['taster_name']=pd.Categorical(wine_test['taster_name'])
wine_test["taster_name"] = wine_test.taster_name.cat.codes

wine_test['taster_twitter_handle']=pd.Categorical(wine_test['taster_twitter_handle'])
wine_test["taster_twitter_handle"] = wine_test.taster_twitter_handle.cat.codes

wine_test['title']=pd.Categorical(wine_test['title'])
wine_test["title"] = wine_test.title.cat.codes

wine_test['variety']=pd.Categorical(wine_test['variety'])
wine_test["variety"] = wine_test.variety.cat.codes

wine_test['winery']=pd.Categorical(wine_test['winery'])
wine_test["winery"] = wine_test.winery.cat.codes

In [152]:
# wine_test_selected = wine_test[['description','designation','points','province'
#               ,'country','variety','winery']]

# category_cols = ['description','designation','points','province'
#               ,'country','variety','winery', 'region_1', 'region_2']

wine_test_selected = wine_test[category_cols]

# wine_test_selected = wine_test[['description','designation','points','province'
#               ,'country','variety','winery', 'region_1', 'region_2']]

In [153]:
preds = model_cat.predict(wine_test_selected)

preds

array([154.91697059,  47.18305977,  39.66869159, ...,  30.68191006,
        19.248977  ,  51.97490106])

In [154]:
wine_test.id

0            0
1            1
2            2
3            3
4            4
         ...  
83205    83205
83206    83206
83207    83207
83208    83208
83209    83209
Name: id, Length: 83210, dtype: int64

In [155]:
type(preds)#.size

numpy.ndarray

In [156]:
preds.reshape(-1,1).shape

(83210, 1)

In [157]:
output = wine_test[['id']]
output['price']=preds.reshape(-1,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [158]:
output.head()

Unnamed: 0,id,price
0,0,154.916971
1,1,47.18306
2,2,39.668692
3,3,55.718496
4,4,21.151365


In [159]:
output.to_csv('final_output_10300.csv',index=False)

In [None]:
!ls