In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('winemag-data_first150k.csv', index_col=0)

In [3]:
print(df.shape)
print(df.dtypes)

(150930, 10)
country         object
description     object
designation     object
points           int64
price          float64
province        object
region_1        object
region_2        object
variety         object
winery          object
dtype: object


In [4]:
df.isnull().any()

country         True
description    False
designation     True
points         False
price           True
province        True
region_1        True
region_2        True
variety        False
winery         False
dtype: bool

In [5]:
df.price.unique()

array([ 235.,  110.,   90.,   65.,   66.,   73.,   60.,   80.,   48.,
        185.,  325.,  290.,   75.,   24.,   79.,  220.,   45.,   57.,
         62.,  105.,   15.,   37.,   nan,   22.,   42.,  135.,   29.,
         23.,   17.,   26.,   55.,   39.,   69.,   30.,   50.,   40.,
        100.,   68.,   28.,   18.,   25.,   36.,   38.,   85.,   19.,
         54.,   59.,   10.,   12.,   13.,   14.,   20.,  125.,    7.,
         49.,   93.,   32.,   16.,   21.,   44.,   35.,   61.,   34.,
         11.,   58.,  120.,   41.,  200.,   43.,    9.,   56.,   46.,
         92.,   94.,   27.,   95.,   33.,   70.,  155.,   63.,  130.,
        115.,   31.,   98.,   52.,   64.,   91.,  111.,   88.,   74.,
         77.,   87.,  113.,  140.,  500.,   51.,  150.,  240.,   72.,
        103.,    8.,  136.,  141.,  102.,   47.,   53.,   96.,   78.,
        149.,  138.,  117.,    4.,   99.,  848.,  450.,  330.,  175.,
         89.,   82.,   71.,  137.,  145.,  698.,  160.,  770.,  118.,
        238.,  202.,

In [6]:
df.price.isnull().sum()

13695

In [7]:
xy = df.copy()

In [8]:
xy.dropna(axis=0, subset=['price'], inplace=True)

In [9]:
xy.shape

(137235, 10)

In [10]:
xy.isnull().sum()

country            5
description        0
designation    42311
points             0
price              0
province           5
region_1       22842
region_2       76531
variety            0
winery             0
dtype: int64

In [11]:
xy.dropna(axis=0, subset=['country'], inplace=True)

In [12]:
xy.shape

(137230, 10)

In [13]:
xy.isnull().sum()

country            0
description        0
designation    42311
points             0
price              0
province           0
region_1       22837
region_2       76526
variety            0
winery             0
dtype: int64

In [14]:
# xy.groupby(by='country', axis=0).mean()

In [15]:
xy2 = xy.copy()
xy2.dropna(axis=1, inplace=True)
xy2.drop('description', axis=1, inplace=True)

In [16]:
xy2.shape

(137230, 6)

In [17]:
assert(xy2.isnull().any().sum()==0)

In [18]:
X = xy2.copy()
y = X.pop('price')

In [19]:
# Break down the dataset to train-valid-test
from sklearn.model_selection import train_test_split
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, test_size = 0.2)

cat_label = X_train.select_dtypes(object).columns

from sklearn.preprocessing import LabelEncoder
import bisect
le = LabelEncoder()

for col in cat_label:
    X_train.loc[:, (col)] = le.fit_transform(X_train.loc[:, col])
    le_cls = le.classes_.tolist()
    
    X_valid.loc[(~X_valid[col].isin(le_cls)), (col)] = 'Other'
    X_test.loc[(~X_test[col].isin(le_cls)), (col)] = 'Other' 
    bisect.insort_left(le_cls, 'Other')
    
    le.classes_ = le_cls
    X_valid.loc[:, (col)] = le.transform(X_valid[col])
    X_test.loc[:, (col)] = le.transform(X_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


# Decision Tree model

In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

def decision_tree(X_train, y_train, X_valid, y_valid, Max_depth_lst, min_split_list, min_leaf_list):
    if len(Max_depth_lst)==len(min_split_list) and len(min_split_list)==len(min_leaf_list):
        Model_error = []
        for i in range(len(Max_depth_lst)):
            dtc = DecisionTreeRegressor(criterion='mae', splitter='best', max_depth=Max_depth_lst[i], min_samples_split=min_split_list[i], min_samples_leaf=min_leaf_list[i])

            dtc.fit(X_train, y_train)
            y_val_pred = dtc.predict(X_valid)
            Model_error.append(mean_absolute_error(y_val_pred, y_valid))
        return Model_error
    else:
        return None

depth = [10, 20, 50, 50, 50, 100, 250]
split = [10, 10, 10,  6, 20,  10,  10]
leaf  = [ 3,  3,  3,  3,  6,   3,   3]

errors = decision_tree(X_train, y_train, X_valid, y_valid, depth, split, leaf)
print(errors)

[11.614860864416814, 11.09955822744455, 11.039304094366262, 10.9898210137997, 11.087010975998542, 10.998724780252312, 11.054675046682151]


In [21]:
dtc = DecisionTreeRegressor(criterion='mae', splitter='best', max_depth=50, min_samples_split=6, min_samples_leaf=3)
dtc.fit(X_train, y_train)
y_trn_pred = dtc.predict(X_train)
y_val_pred = dtc.predict(X_valid)
y_tst_pred = dtc.predict(X_test)
print('training error=',mean_absolute_error(y_trn_pred, y_train))
print('validation error=',mean_absolute_error(y_val_pred, y_valid))
print('test error=',mean_absolute_error(y_tst_pred, y_test))

training error= 5.977205187470823
validation error= 10.99517238238375
test error= 10.992494352546819


In [22]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_estimators=50, criterion='mae', max_depth=50, min_samples_split=6, min_samples_leaf=3)

rfr.fit(X_train, y_train)

y_trn_pred = rfr.predict(X_train)
y_val_pred = rfr.predict(X_valid)
y_tst_pred = rfr.predict(X_test)

print('training error=',mean_absolute_error(y_trn_pred, y_train))
print('validation error=',mean_absolute_error(y_val_pred, y_valid))
print('test error=',mean_absolute_error(y_tst_pred, y_test))

training error= 7.529742903662881
validation error= 10.192706198478847
test error= 10.139264373679225


In [23]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=300, early_stopping_rounds=4, learning_rate=0.1, n_jobs=4)

xgb.fit(X_train, y_train)

y_trn_pred = xgb.predict(X_train)
y_val_pred = xgb.predict(X_valid)
y_tst_pred = xgb.predict(X_test)

print('training error=',mean_absolute_error(y_trn_pred, y_train))
print('validation error=',mean_absolute_error(y_val_pred, y_valid))
print('test error=',mean_absolute_error(y_tst_pred, y_test))

Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






training error= 10.319584402544558
validation error= 11.548196421081308
test error= 11.604841276550488


In [36]:
from keras.models import Sequential
from keras.layers import Dense
import keras

seq_mod = Sequential(
    [
        Dense(10, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'),
        Dense(25, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'),
        Dense(50, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'),
        Dense(50, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'),
        Dense(20, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'),
        Dense(1)
    ])
seq_mod.compile(optimizer='adam', loss='MeanAbsoluteError', metrics=['MeanAbsoluteError'])
seq_mod.fit(X_train, y_train, epochs=100, batch_size=16, verbose=2)

_, seq_val_error = seq_mod.evaluate(X_valid, y_valid, verbose=0)
_, seq_tst_error = seq_mod.evaluate(X_test, y_test, verbose=0)

print('validation error=', seq_val_error)
print('test error=', seq_tst_error)

Epoch 1/100
5490/5490 - 4s - loss: 30.5860 - mean_absolute_error: 30.5860
Epoch 2/100
5490/5490 - 4s - loss: 17.9004 - mean_absolute_error: 17.9004
Epoch 3/100
5490/5490 - 4s - loss: 17.0347 - mean_absolute_error: 17.0347
Epoch 4/100
5490/5490 - 4s - loss: 16.8510 - mean_absolute_error: 16.8510
Epoch 5/100
5490/5490 - 4s - loss: 16.7091 - mean_absolute_error: 16.7091
Epoch 6/100
5490/5490 - 4s - loss: 16.6457 - mean_absolute_error: 16.6457
Epoch 7/100
5490/5490 - 4s - loss: 16.5207 - mean_absolute_error: 16.5207
Epoch 8/100
5490/5490 - 4s - loss: 16.3613 - mean_absolute_error: 16.3613
Epoch 9/100
5490/5490 - 4s - loss: 16.2521 - mean_absolute_error: 16.2521
Epoch 10/100
5490/5490 - 4s - loss: 16.2756 - mean_absolute_error: 16.2756
Epoch 11/100
5490/5490 - 4s - loss: 16.1877 - mean_absolute_error: 16.1877
Epoch 12/100
5490/5490 - 4s - loss: 16.0544 - mean_absolute_error: 16.0544
Epoch 13/100
5490/5490 - 4s - loss: 16.0304 - mean_absolute_error: 16.0304
Epoch 14/100
5490/5490 - 4s - loss