In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

*Here We are importing the data*

We have decided to import the train and the test wine data

In [2]:
wine_train=pd.read_csv('Datasets/train.csv')
wine_test=pd.read_csv('Datasets/test.csv')

**Head of test data and train data**

In [3]:
wine_test.head(1)

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,41855,US,"Sweet new oak stands out, giving this Cabernet...",3D,95.036469,,California,St. Helena,Napa,,,Salvestrin 2007 3D Cabernet Sauvignon (St. Hel...,CABERNET SAUVIGNON,Salvestrin,0


In [4]:
wine_train.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027


**Data train info**

In [5]:
wine_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 14 columns):
country                  174953 non-null object
description              175000 non-null object
designation              122734 non-null object
points                   175000 non-null float64
price                    175000 non-null float64
province                 174953 non-null object
region_1                 146466 non-null object
region_2                 75394 non-null object
taster_name              65509 non-null object
taster_twitter_handle    62190 non-null object
title                    82189 non-null object
variety                  174999 non-null object
winery                   175000 non-null object
id                       175000 non-null int64
dtypes: float64(2), int64(1), object(11)
memory usage: 18.7+ MB


Data Description

In [6]:
wine_train.describe()

Unnamed: 0,points,price,id
count,175000.0,175000.0,175000.0
mean,88.083987,34.3044,70684.04724
std,3.157001,38.398146,41341.638798
min,79.636128,4.0,1.0
25%,85.971283,16.0,35020.0
50%,87.981631,25.0,70256.5
75%,90.085631,40.0,105550.25
max,100.220603,2500.0,150929.0


**Install Package to use for**

In [7]:
!pip install catboost

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


The Model that we are going to implement is the cat Boost regression Model

In [8]:
from catboost import CatBoostRegressor

**Code to identify the missing values in our train dataset**

In [9]:
wine_train.isnull().sum()

country                      47
description                   0
designation               52266
points                        0
price                         0
province                     47
region_1                  28534
region_2                  99606
taster_name              109491
taster_twitter_handle    112810
title                     92811
variety                       1
winery                        0
id                            0
dtype: int64

In [10]:
wine_train.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027


In [11]:
wine_train['variety']=pd.Categorical(wine_train['variety'])
wine_train["variety"] = wine_train.variety.cat.codes


wine_train['province']=pd.Categorical(wine_train['province'])
wine_train["province"] = wine_train.province.cat.codes

wine_train['designation']=pd.Categorical(wine_train['designation'])
wine_train["designation"] = wine_train.designation.cat.codes

wine_train['country']=pd.Categorical(wine_train['country'])
wine_train["country"] = wine_train.country.cat.codes



wine_train['winery']=pd.Categorical(wine_train['winery'])
wine_train["winery"] = wine_train.winery.cat.codes

wine_train['description']=pd.Categorical(wine_train['description'])
wine_train["description"] = wine_train.description.cat.codes


#wine_train['region_1']=pd.Categorical(wine_train['region_1'])



Let us declare our model variable and the corresponding features

In [12]:
y=wine_train['price']
X=wine_train[['description','designation','points','province'
              ,'country','variety','winery']]
    
#X=wine_train[['points','winery','designation']]


Let's Train the model

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

Building the model by using CatBoost package

In [14]:
model_cat = CatBoostRegressor()

In [15]:
model_cat

<catboost.core.CatBoostRegressor at 0x7f099e25f668>

In [16]:
model_cat.fit(X_train,y_train)

0:	learn: 36.9858689	total: 87.5ms	remaining: 1m 27s
1:	learn: 36.6187960	total: 108ms	remaining: 53.7s
2:	learn: 36.2819327	total: 124ms	remaining: 41.3s
3:	learn: 35.9503375	total: 143ms	remaining: 35.6s
4:	learn: 35.6792820	total: 154ms	remaining: 30.6s
5:	learn: 35.4200543	total: 172ms	remaining: 28.5s
6:	learn: 35.1393537	total: 186ms	remaining: 26.4s
7:	learn: 34.8949214	total: 204ms	remaining: 25.3s
8:	learn: 34.6293019	total: 244ms	remaining: 26.9s
9:	learn: 34.3678740	total: 256ms	remaining: 25.3s
10:	learn: 34.1473582	total: 267ms	remaining: 24s
11:	learn: 33.9112222	total: 278ms	remaining: 22.9s
12:	learn: 33.6805403	total: 290ms	remaining: 22s
13:	learn: 33.4659895	total: 303ms	remaining: 21.3s
14:	learn: 33.2701709	total: 320ms	remaining: 21s
15:	learn: 33.0760122	total: 334ms	remaining: 20.6s
16:	learn: 32.8860348	total: 348ms	remaining: 20.1s
17:	learn: 32.7055268	total: 360ms	remaining: 19.7s
18:	learn: 32.5370905	total: 372ms	remaining: 19.2s
19:	learn: 32.3626940	tota

<catboost.core.CatBoostRegressor at 0x7f099e25f668>

Model evaluation

In [17]:
preds = model_cat.predict(X_test)

preds
import numpy as np
np.shape(wine_train)
len(preds)

35000

In [18]:

from sklearn import metrics
print (metrics.mean_absolute_error(preds,y_test))

12.978613072058618


# Testing

In [19]:
wine_test['variety']=pd.Categorical(wine_test['variety'])
wine_test["variety"] = wine_test.variety.cat.codes


wine_test['province']=pd.Categorical(wine_test['province'])
wine_test["province"] = wine_test.province.cat.codes

wine_test['designation']=pd.Categorical(wine_test['designation'])
wine_test["designation"] = wine_test.designation.cat.codes

wine_test['country']=pd.Categorical(wine_test['country'])
wine_test["country"] = wine_test.country.cat.codes



wine_test['winery']=pd.Categorical(wine_test['winery'])
wine_test["winery"] = wine_test.winery.cat.codes

wine_test['description']=pd.Categorical(wine_test['description'])
wine_test["description"] = wine_test.description.cat.codes


#wine_train['region_1']=pd.Categorical(wine_train['region_1'])



In [20]:
wine_test_selected = wine_test[['description','designation','points','province'
              ,'country','variety','winery']]
    
#X=wine_train[['points','winery','designation']]


In [21]:
preds = model_cat.predict(wine_test_selected)

preds

array([137.84899507,  41.03124537,  45.4742882 , ...,  37.87144517,
        21.35048687,  54.29004061])

In [55]:
wine_test.id

0            0
1            1
2            2
3            3
4            4
         ...  
83205    83205
83206    83206
83207    83207
83208    83208
83209    83209
Name: id, Length: 83210, dtype: int64

In [25]:
type(preds)#.size

numpy.ndarray

In [52]:
preds.reshape(-1,1).shape

(83210, 1)

In [65]:
output = wine_test[['id']]
output['price']=preds.reshape(-1,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [66]:
output.head()

Unnamed: 0,id,price
0,0,137.848995
1,1,41.031245
2,2,45.474288
3,3,44.536009
4,4,22.587035


In [68]:
output.to_csv('final_output.csv',index=False)

In [35]:
!ls

 catboost_info		      Datasets	    'Group 1.ipynb'   README.md
 Cat_boost_Regression.ipynb   final_output   LICENSE
