# Cleaning regression datasets and training model

### Imports

In [351]:
import pandas as pd
import os
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [352]:
# Step up one directory at a time until "notebooks" no longer appears
# anywhere in the current working directory path.
while "notebooks" in os.getcwd():
    os.chdir(os.pardir)

In [353]:
# Toggle read by train_and_save_model: True trains and saves a new model,
# False loads the previously saved model from disk instead.
do_training = True

### California Housing Prices Dataset

In [354]:
# Directory and file names for the housing data and its derived CSV files.
DATA_DIR = Path("data") / "housing_data"
file_name = "housing.csv"
scaled_file_name = "housing_scaled.csv"
train_file_name = "train_housing_scaled.csv"
test_file_name = "test_housing_scaled.csv"
scaler_params_file = "housing_scaling_params.csv"

In [355]:
# Output directory for model files and the base name used for the housing model.
MODEL_PATH = Path("models")
housing_model_name = "housing"

In [356]:
# Load the raw housing CSV into a DataFrame.
df = pd.read_csv(DATA_DIR / file_name)

In [357]:
# Print column dtypes and non-null counts to spot non-numeric or incomplete columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [358]:
# Show the distinct values held in the ocean_proximity column.
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [359]:
# Remove the ocean_proximity column from the frame.
df = df.drop('ocean_proximity', axis='columns')

In [360]:
# Preview the target as a single-column DataFrame.
df[['median_house_value']]

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [363]:
def scale_split_df(_df: pd.DataFrame, prediction_column: str, test_size: float, data_dir: Path, 
                   scaled_df_name: str, train_df_name: str, test_df_name: str, scaler_params_name: str):
    """Standardise the feature columns of ``_df``, split it, and write the results as CSV.

    Every column except ``prediction_column`` is passed through a
    ``StandardScaler``; the prediction column is kept unscaled. The scaled
    frame, the train/test subsets and the fitted scaler parameters are all
    written into ``data_dir``.

    Args:
        _df: Input frame holding the feature columns and the prediction column.
        prediction_column: Name of the target column, which is left unscaled.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        data_dir: Directory that receives all four CSV files.
        scaled_df_name: File name for the full scaled frame.
        train_df_name: File name for the training subset.
        test_df_name: File name for the test subset.
        scaler_params_name: File name for the scaler's ``mean_`` / ``var_`` table.

    Returns:
        Tuple ``(scaled_df, train_df, test_df)``.

    Raises:
        KeyError: If ``prediction_column`` is not a column of ``_df``.
    """
    data_dir = Path(data_dir)

    # Work on the frame that was passed in, not on a notebook-level global.
    features_df = _df.drop(columns=[prediction_column])
    prediction_df = _df[[prediction_column]]

    # NOTE: the scaler is fitted on all rows before the split, so the saved
    # parameters describe the whole dataset, not only the training subset.
    scaler = StandardScaler()
    scaled_features = pd.DataFrame(
        scaler.fit_transform(features_df),
        columns=features_df.columns,
        # Keep the original index so concat(axis=1) pairs each scaled row
        # with its own target value even when _df has a non-default index.
        index=features_df.index,
    )
    scaled_df = pd.concat([scaled_features, prediction_df], axis=1)
    train_df, test_df = train_test_split(scaled_df, test_size=test_size)

    scaled_df.to_csv(data_dir / scaled_df_name, index=False)
    train_df.to_csv(data_dir / train_df_name, index=False)
    test_df.to_csv(data_dir / test_df_name, index=False)

    # One row per statistic ("mean", "variance"), one column per feature.
    normalization_params = {
        "mean": scaler.mean_,
        "variance": scaler.var_,
    }
    normalization_params_df = pd.DataFrame.from_dict(
        normalization_params, orient="index", columns=features_df.columns
    )
    # Written to the data_dir argument so it lands next to the other outputs.
    normalization_params_df.to_csv(data_dir / scaler_params_name)

    return scaled_df, train_df, test_df

In [364]:
# Scale the housing features, write the CSV outputs, and hold out 20% for testing.
scaled_df, train_df, test_df = scale_split_df(
    _df=df,
    prediction_column='median_house_value',
    test_size=0.2,
    data_dir=DATA_DIR,
    scaled_df_name=scaled_file_name,
    train_df_name=train_file_name,
    test_df_name=test_file_name,
    scaler_params_name=scaler_params_file,
)
scaled_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-8.526513e-15,-1.079584e-15,5.508083e-18,3.2015730000000005e-17,-7.233049000000001e-17,-1.101617e-17,6.885104000000001e-17,6.6097e-17,206855.816909
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,115395.615874
min,-2.385992,-1.447568,-2.19618,-1.207283,-1.274093,-1.256123,-1.303984,-1.774299,14999.0
25%,-1.113209,-0.7967887,-0.8453931,-0.5445698,-0.5740034,-0.5638089,-0.5742294,-0.6881186,119600.0
50%,0.5389137,-0.6422871,0.02864572,-0.2332104,-0.2441308,-0.2291318,-0.2368162,-0.1767951,179700.0
75%,0.7784964,0.9729566,0.6643103,0.2348028,0.2589843,0.2644949,0.2758427,0.4593063,264725.0
max,2.62528,2.958068,1.856182,16.81558,14.01871,30.25033,14.60152,5.858286,500001.0


In [365]:
# Separate features from the target using each frame's own column labels.
# A boolean mask built from another frame's columns is applied by position and
# would select the wrong columns whenever the two column orders differ.
X_train = train_df.drop(columns=['median_house_value'])
y_train = train_df[['median_house_value']]
X_test = test_df.drop(columns=['median_house_value'])
y_test = test_df[['median_house_value']]

#### Housing model training

In [366]:
def train_and_save_model(param: dict, dtrain: xgb.DMatrix, dtest: xgb.DMatrix, steps, model_path: Path, model_name: str) -> xgb.Booster:
    """Train an XGBoost booster and save it, or load a previously saved one.

    The module-level ``do_training`` flag selects the mode. When it is true,
    a booster is trained, dumped as text to ``<model_name>_dumped.txt`` and
    saved to ``<model_name>_saved.json`` under ``model_path``. When it is
    false, the booster is loaded from ``<model_name>_saved.json`` instead.

    Args:
        param: Booster parameters forwarded to ``xgb.train``.
        dtrain: Training data.
        dtest: Evaluation data; reported alongside ``dtrain`` during training.
        steps: Number of boosting rounds.
        model_path: Directory holding the model files.
        model_name: Base name used for the model files.

    Returns:
        The trained or loaded booster.
    """
    model_path = Path(model_path)
    saved_model_file = model_path / f"{model_name}_saved.json"

    if do_training:
        gbdt_model = xgb.train(param, dtrain,
                               num_boost_round=steps,
                               evals=[(dtest, 'test'), (dtrain, 'train')],
                               verbose_eval=50)
        # Make sure the output directory exists before writing the model files.
        model_path.mkdir(parents=True, exist_ok=True)
        gbdt_model.dump_model(model_path / f"{model_name}_dumped.txt", with_stats=True)
        gbdt_model.save_model(saved_model_file)
    else:
        gbdt_model = xgb.Booster()
        gbdt_model.load_model(saved_model_file)

    # Hand the booster back so callers can predict with it or inspect it.
    return gbdt_model

In [367]:
# Booster settings and number of boosting rounds for the housing model.
param = dict(
    eta=0.01,
    max_depth=8,
    objective='reg:squarederror',
    seed=42,
)
steps = 10000


In [368]:
# Wrap the train and test splits in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [369]:
%%time
train_and_save_model(
    param=param,
    dtrain=dtrain,
    dtest=dtest,
    steps=steps,
    model_path=MODEL_PATH,
    model_name=housing_model_name,
)

[0]	test-rmse:115957.03981	train-rmse:114197.18540
[50]	test-rmse:85619.59869	train-rmse:82474.10646
[100]	test-rmse:69749.51557	train-rmse:64696.92167
[150]	test-rmse:61476.58022	train-rmse:54453.02790
[200]	test-rmse:56643.74686	train-rmse:48119.80440
[250]	test-rmse:53833.27067	train-rmse:43964.56202
[300]	test-rmse:52199.30548	train-rmse:41049.34219
[350]	test-rmse:51010.34500	train-rmse:38866.55876
[400]	test-rmse:50208.14913	train-rmse:37186.38287
[450]	test-rmse:49417.29336	train-rmse:35502.25079
[500]	test-rmse:48838.24092	train-rmse:34137.65425
[550]	test-rmse:48429.91527	train-rmse:32980.47265
[600]	test-rmse:48133.98558	train-rmse:32055.87525
[650]	test-rmse:47930.02071	train-rmse:31243.93598
[700]	test-rmse:47802.24340	train-rmse:30548.13224
[750]	test-rmse:47661.86101	train-rmse:29896.13949
[800]	test-rmse:47589.33823	train-rmse:29332.07088
[850]	test-rmse:47520.74039	train-rmse:28751.42831
[900]	test-rmse:47463.49319	train-rmse:28185.80739
[950]	test-rmse:47396.98045	trai

### Red Wine Dataset

In [370]:
# Directory and file names for the red wine data, plus the model output location.
DATA_DIR = Path("data") / "wine_quality"
file_name = "winequality_red.csv"
scaled_file_name = "winequality_red_scaled.csv"
train_file_name = "train_winequality_red_scaled.csv"
test_file_name = "test_winequality_red_scaled.csv"
scaler_params_file = "winequality_red_scaling_params.csv"
MODEL_PATH = Path("models")
wine_model_name = "winequality_red"

In [371]:
# Load the red wine CSV, which uses ';' as the field separator, then print
# column dtypes and non-null counts.
df = pd.read_csv(DATA_DIR / file_name, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [372]:
# Scale the wine features, write the CSV outputs, and hold out 20% for testing.
scaled_df, train_df, test_df = scale_split_df(
    _df=df,
    prediction_column='quality',
    test_size=0.2,
    data_dir=DATA_DIR,
    scaled_df_name=scaled_file_name,
    train_df_name=train_file_name,
    test_df_name=test_file_name,
    scaler_params_name=scaler_params_file,
)
scaled_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,3.554936e-16,1.733031e-16,-8.887339000000001e-17,-1.244227e-16,3.732682e-16,-6.221137e-17,4.4436690000000005e-17,-3.473172e-14,2.861723e-15,6.754377e-16,1.066481e-16,5.636023
std,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,0.807569
min,-2.137045,-2.27828,-1.391472,-1.162696,-1.603945,-1.4225,-1.230584,-3.538731,-3.700401,-1.936507,-1.898919,3.0
25%,-0.7007187,-0.7699311,-0.9293181,-0.4532184,-0.371229,-0.8487156,-0.7440403,-0.6077557,-0.6551405,-0.6382196,-0.8663789,5.0
50%,-0.2410944,-0.04368911,-0.05636026,-0.240375,-0.1799455,-0.1793002,-0.2574968,0.001760083,-0.007212705,-0.2251281,-0.2093081,6.0
75%,0.5057952,0.6266881,0.7652471,0.04341614,0.05384542,0.4901152,0.4723184,0.5768249,0.5759223,0.4240158,0.6354971,6.0
max,4.355149,5.877976,3.743574,9.195681,11.12703,5.367284,7.375154,3.680055,4.528282,7.918677,4.202453,8.0


In [373]:
# Separate features from the target using each frame's own column labels.
# A boolean mask built from another frame's columns is applied by position and
# would select the wrong columns whenever the two column orders differ.
X_train = train_df.drop(columns=['quality'])
y_train = train_df[['quality']]
X_test = test_df.drop(columns=['quality'])
y_test = test_df[['quality']]

#### Wine model training

In [374]:
# Booster settings and number of boosting rounds for the wine model.
param = dict(
    eta=0.01,
    max_depth=8,
    objective='reg:squarederror',
    seed=42,
)
steps = 3000

# Wrap the train and test splits in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [375]:
train_and_save_model(
    param=param,
    dtrain=dtrain,
    dtest=dtest,
    steps=steps,
    model_path=MODEL_PATH,
    model_name=wine_model_name,
)

[0]	test-rmse:0.80256	train-rmse:0.80277
[50]	test-rmse:0.69013	train-rmse:0.60238
[100]	test-rmse:0.63072	train-rmse:0.47271
[150]	test-rmse:0.59414	train-rmse:0.38414
[200]	test-rmse:0.57838	train-rmse:0.32799
[250]	test-rmse:0.57031	train-rmse:0.28522
[300]	test-rmse:0.56352	train-rmse:0.25594
[350]	test-rmse:0.56089	train-rmse:0.22577
[400]	test-rmse:0.55953	train-rmse:0.20020
[450]	test-rmse:0.55883	train-rmse:0.18484
[500]	test-rmse:0.55856	train-rmse:0.17342
[550]	test-rmse:0.55812	train-rmse:0.16502
[600]	test-rmse:0.55758	train-rmse:0.16028
[650]	test-rmse:0.55631	train-rmse:0.15243
[700]	test-rmse:0.55649	train-rmse:0.14586
[750]	test-rmse:0.55596	train-rmse:0.13853
[800]	test-rmse:0.55602	train-rmse:0.12807
[850]	test-rmse:0.55585	train-rmse:0.12022
[900]	test-rmse:0.55609	train-rmse:0.11250
[950]	test-rmse:0.55568	train-rmse:0.10571
[1000]	test-rmse:0.55578	train-rmse:0.09890
[1050]	test-rmse:0.55615	train-rmse:0.09261
[1100]	test-rmse:0.55634	train-rmse:0.08823
[1150]	test