https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn

# STEP 1&2 Envritonemtn and Importing Packages

In [1]:
import numpy as np
np.set_printoptions(suppress=True) # this is to get rid of scientific notation such as 1.16664562e-16. 

import pandas as pd
from sklearn.model_selection import train_test_split

# this is for scaline the dataset
from sklearn import preprocessing 

families?? we will come back to this at STEP 7

In [2]:
#  import the random forest family
from sklearn.ensemble import RandomForestRegressor

# tools to help us perform cross-validation.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# some metrics we can use to evaluate our model performance later
from sklearn.metrics import mean_squared_error, r2_score

#mport a way to persist our model for future use
import joblib

# STEP 3: Loeading Data 

In [3]:
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [4]:
print(data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [5]:
######### it looks like the CSV file is actually using semicolons to separate the data. 
######### to fix this issue...

In [6]:
data = pd.read_csv(dataset_url, sep=';')
 
print( data.head() )

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [7]:
data.shape

(1599, 12)

In [8]:
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [10]:
####### All of the features are numeric, which is convenient. 
####### However, they have some very different scales, so let’s make a mental note to standardize the data later.

# STEP4: split datasets

### First, let’s separate our target (y) features from our input (X) features:

In [11]:
y = data.quality
X = data.drop('quality', axis=1)

In [12]:
# Scikit-Learn’s useful train_test_split function:  
### this is a fucntion of sklaen. it splits the original array and creates a new array from the original array
# we’ll set aside 20% of the data as a test set for evaluating our model
# We also set an arbitrary “random state” (a.k.a. seed) so that we can reproduce our results

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)



# STEP5: Decleate Data Preprocessing Steps

## scaling method 1 : NOTE THAT WE DO NOT USE THIS METHOD

In [13]:
print(X_train)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
691             9.2             0.920         0.24             2.6      0.087   
1475            5.3             0.470         0.11             2.2      0.048   
1065            7.7             0.610         0.18             2.4      0.083   
1159           10.2             0.410         0.43             2.2      0.110   
227             9.0             0.820         0.14             2.6      0.089   
...             ...               ...          ...             ...        ...   
463             8.1             0.660         0.70             2.2      0.098   
558            10.9             0.530         0.49             4.6      0.118   
1094            6.6             0.725         0.09             5.5      0.117   
792             7.1             0.610         0.02             2.5      0.081   
381            13.7             0.415         0.68             2.9      0.085   

      free sulfur dioxide  

In [14]:
# Using "preprocessing", scaling a dataset
X_train_scaled = preprocessing.scale(X_train)
print( X_train_scaled )

[[ 0.51358886  2.19680282 -0.164433   ...  1.08415147 -0.69866131
  -0.58608178]
 [-1.73698885 -0.31792985 -0.82867679 ...  1.46964764  1.2491516
   2.97009781]
 [-0.35201795  0.46443143 -0.47100705 ... -0.13658641 -0.35492962
  -0.20843439]
 ...
 [-0.98679628  1.10708533 -0.93086814 ...  0.24890976 -0.98510439
   0.35803669]
 [-0.69826067  0.46443143 -1.28853787 ...  1.08415147 -0.35492962
  -0.68049363]
 [ 3.1104093  -0.62528606  2.08377675 ... -1.61432173  0.79084268
  -0.39725809]]


In [15]:
# confirming that the dataset is standardised / scaled 

print( X_train_scaled.mean(axis=0) ) 
#### this may print you with sicentific notation. 
#### In such cases, you set np.set_printoptions(suppress=True) 

print( X_train_scaled.std(axis=0) )

[ 0. -0. -0. -0.  0. -0. -0. -0. -0. -0. -0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## Scaling Method2 : 

In [16]:
#### we are using this method so that we can make the performance of model more realistic

Here’s what that process looks like:

  1. Fit the transformer on the training set (saving the means and standard deviations)
  2. Apply the transformer to the training set (scaling the training data)
  3. Apply the transformer to the test set (using the same means and standard deviations)

In [17]:
# 1.saving the means and standard deviations
scaler = preprocessing.StandardScaler().fit(X_train)

In [18]:
## confrimating means and standard deviations are saved 
X_train_scaled = scaler.transform(X_train)

print( X_train_scaled.mean(axis=0))
print( X_train_scaled.std(axis=0))

[ 0. -0. -0. -0.  0. -0. -0. -0. -0. -0. -0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [19]:
# 2. scaling the training data

X_test_scaled = scaler.transform(X_test)

print( X_test_scaled.mean(axis=0))
print( X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [20]:
##### Notice that the result is not perfectly centred at yero (the reuslt is not showing 0)
##### This is because we’re transforming the test set using the means from the training set, not from the test set itself.

In [21]:
##### In practice, when we set up the cross-validation pipeline, we won’t even need to manually fit the Transformer API. 
##### Instead, we’ll simply declare the class object, like so

pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100,
                                               random_state=123))

##### a modeling pipeline that first transforms the data using StandardScaler() and then fits a model using a random forest regressor

# STEP6: Declare hzperparameters

There are two types of parameters we need to worry about: 
1. model parameters --- Models parameters can be learned directly from the data (i.e. regression coefficients)
2. hyperparameters --- while hyperparameters cannot. 
   Hyperparameters express “higher-level” structural information about the model, and they are typically set before training the model.

In [22]:
# declaring hzperparameters

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

# STEP7: Tune model using a cross-validation pipeline

Cross-validation is a process for reliably estimating the performance of a method for building a model by training and evaluating your model multiple times using the same method.

Steps for cross validatoin (CV)

1. Split your data into k equal parts, or “folds” (typically k=10).
2. Train your model on k-1 folds (e.g. the first 9 folds).
3. Evaluate it on the remaining “hold-out” fold (e.g. the 10th fold).
4. Perform steps (2) and (3) k times, each time holding out a different fold.
5. Aggregate the performance across all k folds. This is your performance metric.


In [27]:
# Scikit Learn has the predefined method to do this in just on line of code
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

40 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HayateSato\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HayateSato\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HayateSato\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\HayateSato\anaconda3\Lib\site-packages\sk

In [26]:
print( clf.best_params_ )

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}
