In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

*Here We are importing the data*

We have decided to import the train and the test wine data

In [2]:
wine_train=pd.read_csv('Datasets/train.csv')
wine_test=pd.read_csv('Datasets/test.csv')

**Head of test data and train data**

In [3]:
wine_test.head(1)

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,41855,US,"Sweet new oak stands out, giving this Cabernet...",3D,95.036469,,California,St. Helena,Napa,,,Salvestrin 2007 3D Cabernet Sauvignon (St. Hel...,CABERNET SAUVIGNON,Salvestrin,0


In [4]:
wine_train.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027


**Data train info**

In [5]:
wine_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175000 entries, 0 to 174999
Data columns (total 14 columns):
country                  174953 non-null object
description              175000 non-null object
designation              122734 non-null object
points                   175000 non-null float64
price                    175000 non-null float64
province                 174953 non-null object
region_1                 146466 non-null object
region_2                 75394 non-null object
taster_name              65509 non-null object
taster_twitter_handle    62190 non-null object
title                    82189 non-null object
variety                  174999 non-null object
winery                   175000 non-null object
id                       175000 non-null int64
dtypes: float64(2), int64(1), object(11)
memory usage: 18.7+ MB


Data Description

In [25]:
wine_train.describe()

Unnamed: 0,points,price,id
count,175000.0,175000.0,175000.0
mean,88.083987,34.3044,70684.04724
std,3.157001,38.398146,41341.638798
min,79.636128,4.0,1.0
25%,85.971283,16.0,35020.0
50%,87.981631,25.0,70256.5
75%,90.085631,40.0,105550.25
max,100.220603,2500.0,150929.0


**Install Package to use for**

In [6]:
!pip3 install catboost



The Model that we are going to implement is the cat Boost regression Model

In [7]:
from catboost import CatBoostRegressor

**Code to identify the missing values in our train dataset**

In [8]:
wine_train.isnull().sum()

country                      47
description                   0
designation               52266
points                        0
price                         0
province                     47
region_1                  28534
region_2                  99606
taster_name              109491
taster_twitter_handle    112810
title                     92811
variety                       1
winery                        0
id                            0
dtype: int64

In [9]:
wine_train.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027


In [41]:
wine_train['variety']=pd.Categorical(wine_train['variety'])
wine_train["variety"] = wine_train.variety.cat.codes


wine_train['province']=pd.Categorical(wine_train['province'])
wine_train["province"] = wine_train.province.cat.codes

wine_train['designation']=pd.Categorical(wine_train['designation'])
wine_train["designation"] = wine_train.designation.cat.codes

wine_train['country']=pd.Categorical(wine_train['country'])
wine_train["country"] = wine_train.country.cat.codes



wine_train['winery']=pd.Categorical(wine_train['winery'])
wine_train["winery"] = wine_train.winery.cat.codes

wine_train['description']=pd.Categorical(wine_train['description'])
wine_train["description"] = wine_train.description.cat.codes


#wine_train['region_1']=pd.Categorical(wine_train['region_1'])



Let us declare our model variable and the corresponding features

In [42]:
y=wine_train['price']
X=wine_train[['description','designation','points','province'
              ,'country','variety','winery']]
    
#X=wine_train[['points','winery','designation']]


Let's Train the model

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)



Building the model by using CatBoost package

In [44]:
model_cat = CatBoostRegressor()

In [45]:
model_cat

<catboost.core.CatBoostRegressor at 0x7f9b509c5160>

In [46]:
model_cat.fit(X_train,y_train)

0:	learn: 36.9858689	total: 59.9ms	remaining: 59.9s
1:	learn: 36.6187960	total: 70.8ms	remaining: 35.3s
2:	learn: 36.2819327	total: 81.8ms	remaining: 27.2s
3:	learn: 35.9503375	total: 91.8ms	remaining: 22.9s
4:	learn: 35.6792820	total: 101ms	remaining: 20.2s
5:	learn: 35.4200543	total: 112ms	remaining: 18.5s
6:	learn: 35.1393537	total: 123ms	remaining: 17.5s
7:	learn: 34.8949214	total: 133ms	remaining: 16.5s
8:	learn: 34.6293019	total: 141ms	remaining: 15.6s
9:	learn: 34.3678740	total: 149ms	remaining: 14.8s
10:	learn: 34.1473582	total: 156ms	remaining: 14s
11:	learn: 33.9112222	total: 163ms	remaining: 13.4s
12:	learn: 33.6805403	total: 171ms	remaining: 13s
13:	learn: 33.4659895	total: 178ms	remaining: 12.5s
14:	learn: 33.2701709	total: 186ms	remaining: 12.2s
15:	learn: 33.0760122	total: 193ms	remaining: 11.9s
16:	learn: 32.8860348	total: 201ms	remaining: 11.6s
17:	learn: 32.7055268	total: 209ms	remaining: 11.4s
18:	learn: 32.5370905	total: 215ms	remaining: 11.1s
19:	learn: 32.3626940	

179:	learn: 27.2489179	total: 1.88s	remaining: 8.57s
180:	learn: 27.2424168	total: 1.89s	remaining: 8.56s
181:	learn: 27.2383846	total: 1.9s	remaining: 8.54s
182:	learn: 27.2327306	total: 1.91s	remaining: 8.53s
183:	learn: 27.2266325	total: 1.92s	remaining: 8.53s
184:	learn: 27.2193513	total: 1.93s	remaining: 8.5s
185:	learn: 27.2102431	total: 1.94s	remaining: 8.48s
186:	learn: 27.2045834	total: 1.95s	remaining: 8.46s
187:	learn: 27.1963292	total: 1.95s	remaining: 8.43s
188:	learn: 27.1782009	total: 1.96s	remaining: 8.41s
189:	learn: 27.1621977	total: 1.97s	remaining: 8.38s
190:	learn: 27.1491140	total: 1.97s	remaining: 8.36s
191:	learn: 27.1418038	total: 1.98s	remaining: 8.34s
192:	learn: 27.1326132	total: 1.99s	remaining: 8.31s
193:	learn: 27.1234295	total: 2s	remaining: 8.29s
194:	learn: 27.1078129	total: 2s	remaining: 8.27s
195:	learn: 27.1005967	total: 2.01s	remaining: 8.24s
196:	learn: 27.0905256	total: 2.02s	remaining: 8.22s
197:	learn: 27.0856796	total: 2.02s	remaining: 8.2s
19

354:	learn: 26.0511630	total: 3.31s	remaining: 6.02s
355:	learn: 26.0450446	total: 3.32s	remaining: 6s
356:	learn: 26.0430259	total: 3.33s	remaining: 5.99s
357:	learn: 26.0406677	total: 3.34s	remaining: 5.99s
358:	learn: 26.0377844	total: 3.35s	remaining: 5.98s
359:	learn: 26.0359678	total: 3.36s	remaining: 5.97s
360:	learn: 26.0306950	total: 3.36s	remaining: 5.95s
361:	learn: 26.0255782	total: 3.37s	remaining: 5.94s
362:	learn: 26.0232840	total: 3.38s	remaining: 5.93s
363:	learn: 26.0198333	total: 3.39s	remaining: 5.92s
364:	learn: 26.0175042	total: 3.39s	remaining: 5.9s
365:	learn: 26.0129677	total: 3.4s	remaining: 5.89s
366:	learn: 26.0112723	total: 3.41s	remaining: 5.88s
367:	learn: 26.0047928	total: 3.42s	remaining: 5.87s
368:	learn: 25.9980113	total: 3.42s	remaining: 5.85s
369:	learn: 25.9859956	total: 3.43s	remaining: 5.84s
370:	learn: 25.9767171	total: 3.44s	remaining: 5.83s
371:	learn: 25.9732239	total: 3.45s	remaining: 5.82s
372:	learn: 25.9715332	total: 3.45s	remaining: 5.81

528:	learn: 25.2004828	total: 4.73s	remaining: 4.21s
529:	learn: 25.1960502	total: 4.74s	remaining: 4.21s
530:	learn: 25.1933976	total: 4.75s	remaining: 4.2s
531:	learn: 25.1849277	total: 4.76s	remaining: 4.19s
532:	learn: 25.1811007	total: 4.77s	remaining: 4.18s
533:	learn: 25.1765294	total: 4.78s	remaining: 4.17s
534:	learn: 25.1731582	total: 4.79s	remaining: 4.16s
535:	learn: 25.1687122	total: 4.79s	remaining: 4.15s
536:	learn: 25.1643413	total: 4.8s	remaining: 4.14s
537:	learn: 25.1610855	total: 4.81s	remaining: 4.13s
538:	learn: 25.1565889	total: 4.82s	remaining: 4.12s
539:	learn: 25.1541791	total: 4.82s	remaining: 4.11s
540:	learn: 25.1507369	total: 4.83s	remaining: 4.1s
541:	learn: 25.1478411	total: 4.84s	remaining: 4.09s
542:	learn: 25.1421680	total: 4.84s	remaining: 4.08s
543:	learn: 25.1388680	total: 4.85s	remaining: 4.07s
544:	learn: 25.1355734	total: 4.86s	remaining: 4.06s
545:	learn: 25.1331467	total: 4.87s	remaining: 4.05s
546:	learn: 25.1270947	total: 4.87s	remaining: 4.

708:	learn: 24.5578864	total: 6.14s	remaining: 2.52s
709:	learn: 24.5551948	total: 6.15s	remaining: 2.51s
710:	learn: 24.5519503	total: 6.16s	remaining: 2.5s
711:	learn: 24.5496716	total: 6.17s	remaining: 2.5s
712:	learn: 24.5459315	total: 6.18s	remaining: 2.49s
713:	learn: 24.5435522	total: 6.18s	remaining: 2.48s
714:	learn: 24.5419302	total: 6.19s	remaining: 2.47s
715:	learn: 24.5393101	total: 6.2s	remaining: 2.46s
716:	learn: 24.5385499	total: 6.21s	remaining: 2.45s
717:	learn: 24.5367017	total: 6.21s	remaining: 2.44s
718:	learn: 24.5345744	total: 6.22s	remaining: 2.43s
719:	learn: 24.5338102	total: 6.23s	remaining: 2.42s
720:	learn: 24.5322922	total: 6.24s	remaining: 2.41s
721:	learn: 24.5310774	total: 6.25s	remaining: 2.4s
722:	learn: 24.5300548	total: 6.25s	remaining: 2.4s
723:	learn: 24.5272000	total: 6.26s	remaining: 2.39s
724:	learn: 24.5242492	total: 6.27s	remaining: 2.38s
725:	learn: 24.5207892	total: 6.27s	remaining: 2.37s
726:	learn: 24.5165281	total: 6.28s	remaining: 2.36

867:	learn: 24.1246364	total: 7.37s	remaining: 1.12s
868:	learn: 24.1206383	total: 7.38s	remaining: 1.11s
869:	learn: 24.1186552	total: 7.39s	remaining: 1.1s
870:	learn: 24.1159303	total: 7.4s	remaining: 1.1s
871:	learn: 24.1102365	total: 7.41s	remaining: 1.09s
872:	learn: 24.1062960	total: 7.42s	remaining: 1.08s
873:	learn: 24.1053817	total: 7.42s	remaining: 1.07s
874:	learn: 24.1036101	total: 7.43s	remaining: 1.06s
875:	learn: 24.1020924	total: 7.44s	remaining: 1.05s
876:	learn: 24.0986729	total: 7.45s	remaining: 1.04s
877:	learn: 24.0967828	total: 7.45s	remaining: 1.03s
878:	learn: 24.0934008	total: 7.46s	remaining: 1.03s
879:	learn: 24.0922580	total: 7.47s	remaining: 1.02s
880:	learn: 24.0911901	total: 7.48s	remaining: 1.01s
881:	learn: 24.0896786	total: 7.48s	remaining: 1s
882:	learn: 24.0864216	total: 7.49s	remaining: 993ms
883:	learn: 24.0836978	total: 7.5s	remaining: 984ms
884:	learn: 24.0805269	total: 7.5s	remaining: 975ms
885:	learn: 24.0776133	total: 7.51s	remaining: 967ms
8

<catboost.core.CatBoostRegressor at 0x7f9b509c5160>

Model evaluation

In [54]:
preds = model_cat.predict(X_test)

preds
import numpy as np
np.shape(wine_train)
len(preds)

35000

In [48]:

from sklearn import metrics
print (metrics.mean_absolute_error(preds,y_test))

12.978613072058618
