# Make (Train | Test) Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Read the data

In [8]:
X = pd.read_csv('../data_root/raw/wine_dataset.csv', index_col=0)

In [10]:
X.shape

(10000, 13)

In [9]:
X.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Fragrances suggest hay, crushed tomato vine an...",Kirchleiten,90,30.0,Northeastern Italy,Alto Adige,,Kerin O’Keefe,@kerinokeefe,Tiefenbrunner 2012 Kirchleiten Sauvignon (Alto...,Sauvignon,Tiefenbrunner
1,France,"Packed with fruit and crisp acidity, this is a...",,87,22.0,Loire Valley,Sancerre,,Roger Voss,@vossroger,Bernard Reverdy et Fils 2014 Rosé (Sancerre),Rosé,Bernard Reverdy et Fils
2,Italy,"This easy, ruby-red wine displays fresh berry ...",,86,,Tuscany,Chianti Classico,,,,Dievole 2009 Chianti Classico,Sangiovese,Dievole
3,US,Pretty in violet and rose petals this is a low...,Horseshoe Bend Vineyard,92,50.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Davis Family 2012 Horseshoe Bend Vineyard Pino...,Pinot Noir,Davis Family
4,US,This golden wine confounds in a mix of wet sto...,Dutton Ranch,93,38.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Dutton-Goldfield 2013 Dutton Ranch Chardonnay ...,Chardonnay,Dutton-Goldfield


### Remove rows with missing target, separate target from predictors

In [11]:
X.dropna(axis=0, subset=['points'], inplace=True)
y = X.points              
X.drop(['points'], axis=1, inplace=True)

In [12]:
X.shape, y.shape

((10000, 12), (10000,))

In [13]:
X.head()

Unnamed: 0,country,description,designation,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Fragrances suggest hay, crushed tomato vine an...",Kirchleiten,30.0,Northeastern Italy,Alto Adige,,Kerin O’Keefe,@kerinokeefe,Tiefenbrunner 2012 Kirchleiten Sauvignon (Alto...,Sauvignon,Tiefenbrunner
1,France,"Packed with fruit and crisp acidity, this is a...",,22.0,Loire Valley,Sancerre,,Roger Voss,@vossroger,Bernard Reverdy et Fils 2014 Rosé (Sancerre),Rosé,Bernard Reverdy et Fils
2,Italy,"This easy, ruby-red wine displays fresh berry ...",,,Tuscany,Chianti Classico,,,,Dievole 2009 Chianti Classico,Sangiovese,Dievole
3,US,Pretty in violet and rose petals this is a low...,Horseshoe Bend Vineyard,50.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Davis Family 2012 Horseshoe Bend Vineyard Pino...,Pinot Noir,Davis Family
4,US,This golden wine confounds in a mix of wet sto...,Dutton Ranch,38.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Dutton-Goldfield 2013 Dutton Ranch Chardonnay ...,Chardonnay,Dutton-Goldfield


In [14]:
y.head()

0    90
1    87
2    86
3    92
4    93
Name: points, dtype: int64

### Break off validation set from training data

In [15]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [16]:
X_train_full.shape, X_valid_full.shape, y_train.shape, y_valid.shape

((8000, 12), (2000, 12), (8000,), (2000,))

### Drop high-cardinality and uncecessary features

High-cardinality features (number of unique values) can lead to poor model performance and memory issues when trying to fit the model because encoding can result in extremely large matrices of values.

In [30]:
# Select categorical columns with relatively low cardinality
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 1000 and 
                        X_train_full[cname].dtype == "object"]

Doing some research into the factors that influence wine quality, a large consensus centers around enviromental factors (i.e. climate, temperature, sunlight, soil etc...), so features about geographical location (i.e. country, province, region etc...) make sense.

In [31]:
low_cardinality_cols

['country',
 'province',
 'region_1',
 'region_2',
 'taster_name',
 'taster_twitter_handle',
 'variety']

`taster_twitter_handle` provides no additional information on top of `taster_name` so we'll remove it.

In [34]:
low_cardinality_cols.remove('taster_twitter_handle')
low_cardinality_cols

['country', 'province', 'region_1', 'region_2', 'taster_name', 'variety']

### Select numeric columns

In [35]:
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
numeric_cols

['price']

### Keep selected columns only

In [37]:
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [39]:
X_train.shape, X_valid.shape

((8000, 7), (2000, 7))

### One-hot encode the data and align the columns between the train and validation datasets

In [41]:
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)

In [42]:
X_train.shape, X_valid.shape

((8000, 1267), (2000, 1267))

In [43]:
X_train.columns, X_valid.columns

(Index(['price', 'country_Argentina', 'country_Australia', 'country_Austria',
        'country_Brazil', 'country_Bulgaria', 'country_Canada', 'country_Chile',
        'country_Croatia', 'country_Cyprus',
        ...
        'variety_Viognier-Marsanne', 'variety_Vitovska', 'variety_Viura',
        'variety_Viura-Chardonnay', 'variety_Weissburgunder',
        'variety_White Blend', 'variety_Xarel-lo', 'variety_Xinomavro',
        'variety_Zinfandel', 'variety_Zweigelt'],
       dtype='object', length=1267),
 Index(['price', 'country_Argentina', 'country_Australia', 'country_Austria',
        'country_Brazil', 'country_Bulgaria', 'country_Canada', 'country_Chile',
        'country_Croatia', 'country_Cyprus',
        ...
        'variety_Viognier-Marsanne', 'variety_Vitovska', 'variety_Viura',
        'variety_Viura-Chardonnay', 'variety_Weissburgunder',
        'variety_White Blend', 'variety_Xarel-lo', 'variety_Xinomavro',
        'variety_Zinfandel', 'variety_Zweigelt'],
       dtype='o

# Train Model

XGBoost is considered one of the best machine learning libraries for predicting tabular data.  (Explain a bit about the algo?)

We will use the `XGBRegressor` to predict wine points.

In [44]:
from xgboost import XGBRegressor

(explain hyperparameters, use notes)

(set `verbose=False` for production)

In [73]:
# Define the model
model_1 = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# Fit the model
model_1.fit(X_train, y_train, early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)], verbose=True)

[0]	validation_0-rmse:83.61224
[1]	validation_0-rmse:79.43694
[2]	validation_0-rmse:75.46937
[3]	validation_0-rmse:71.70147
[4]	validation_0-rmse:68.12328
[5]	validation_0-rmse:64.72417
[6]	validation_0-rmse:61.49339
[7]	validation_0-rmse:58.42688
[8]	validation_0-rmse:55.51317
[9]	validation_0-rmse:52.74454
[10]	validation_0-rmse:50.11608
[11]	validation_0-rmse:47.61934
[12]	validation_0-rmse:45.24560
[13]	validation_0-rmse:42.99186
[14]	validation_0-rmse:40.85226
[15]	validation_0-rmse:38.81837
[16]	validation_0-rmse:36.88747
[17]	validation_0-rmse:35.05310
[18]	validation_0-rmse:33.31047
[19]	validation_0-rmse:31.65555
[20]	validation_0-rmse:30.08478
[21]	validation_0-rmse:28.59279
[22]	validation_0-rmse:27.17541
[23]	validation_0-rmse:25.82993
[24]	validation_0-rmse:24.55192
[25]	validation_0-rmse:23.33959
[26]	validation_0-rmse:22.18844
[27]	validation_0-rmse:21.09504
[28]	validation_0-rmse:20.05732
[29]	validation_0-rmse:19.07234
[30]	validation_0-rmse:18.13786
[31]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluate Model

In [74]:
from sklearn.metrics import mean_absolute_error

# Get predictions
predictions_1 = model_1.predict(X_valid)

# Calculate MAE
mae_1 = mean_absolute_error(predictions_1, y_valid) # Your code here
print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 1.8436909980773926


In [75]:
compare_1 = pd.DataFrame({'actual': y_valid, 'predict': predictions_1})
compare_1.head(20)

Unnamed: 0,actual,predict
9394,90,89.47319
898,89,88.48243
2398,84,85.651123
5906,90,89.036766
2343,84,88.168869
8225,88,87.790184
5506,86,90.451157
6451,91,90.408325
2670,84,86.473618
3497,91,88.6063


In [79]:
compare_1.sort_values('predict').iloc[:20]

Unnamed: 0,actual,predict
4388,86,83.520111
9477,84,83.935944
8204,86,83.963394
6726,84,84.061501
5091,82,84.208778
196,83,84.329315
4138,84,84.349823
9549,86,84.393311
5838,85,84.393311
6450,85,84.447937
