## Regression with GridSearchCV Concrete Data

In [40]:
import pandas as pd
df = pd.read_csv('../Cases/Concrete Strength/Concrete_Data.csv')
df.head(1)

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99


In [41]:
X, y = df.drop(columns=['Strength']), df['Strength']

#### With scaler as StandardScaler

In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
knn = KNeighborsRegressor()
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
pipe = Pipeline(
    [
        ('SCL', scaler),
        ('KNN', knn)
    ]
    
)
pipe.get_params()

{'memory': None,
 'steps': [('SCL', StandardScaler()), ('KNN', KNeighborsRegressor())],
 'verbose': False,
 'SCL': StandardScaler(),
 'KNN': KNeighborsRegressor(),
 'SCL__copy': True,
 'SCL__with_mean': True,
 'SCL__with_std': True,
 'KNN__algorithm': 'auto',
 'KNN__leaf_size': 30,
 'KNN__metric': 'minkowski',
 'KNN__metric_params': None,
 'KNN__n_jobs': None,
 'KNN__n_neighbors': 5,
 'KNN__p': 2,
 'KNN__weights': 'uniform'}

In [43]:
params = {'KNN__n_neighbors': [1,2,3,4,5,6,7,8]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2')
gcv_knn.fit(X, y)

In [44]:
print(gcv_knn.best_params_)
print(gcv_knn.best_score_)

{'KNN__n_neighbors': 3}
0.7162924205032816


#### With scaler as MinMaxScaler

In [45]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

scaler = MinMaxScaler()
knn = KNeighborsRegressor()
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
pipe = Pipeline(
    [
        ('SCL', scaler),
        ('KNN', knn)
    ]
    
)
params = {'KNN__n_neighbors': [1,2,3,4,5,6,7,8]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2')
gcv_knn.fit(X, y)
print(gcv_knn.best_params_)
print(gcv_knn.best_score_)

{'KNN__n_neighbors': 5}
0.6938854701543569


#### Without Scaling

In [46]:
params = {'n_neighbors': [1,2,3,4,5,6,7,8]}
knn = KNeighborsRegressor()
gcv_knn = GridSearchCV(knn, param_grid=params, cv=kfold, scoring='r2')
gcv_knn.fit(X, y)
print(gcv_knn.best_params_)
print(gcv_knn.best_score_)

{'n_neighbors': 3}
0.7046900509877506


#### Combining Scalers with the GridSearch

In [15]:
scl_std, scl_mm = StandardScaler(), MinMaxScaler()
knn = KNeighborsRegressor()

pipe = Pipeline(
    [
        ('SCL', None),
        ('KNN', knn)
    ]
    
)

params = {'KNN__n_neighbors': [1,2,3,4,5,6,7,8],
         'SCL' : [None, scl_std, scl_mm]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2')
gcv_knn.fit(X, y)
print(gcv_knn.best_params_)
print(gcv_knn.best_score_)

{'KNN__n_neighbors': 3, 'SCL': StandardScaler()}
0.7162924205032816


#### Standard Scaler is giving better r2 score, so we will use it for inferencing

In [18]:
test = pd.read_csv('../Cases/Concrete Strength/testConcrete.csv')
test.head(1)

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age
0,495,120,0,155,5,866,884,75


In [39]:
y_pred = gcv_knn.predict(test)
test['Strength'] = y_pred
# test

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Al
- Ba
- Ca
- Fe
- K
- ...
Feature names seen at fit time, yet now missing:
- Age
- Blast
- Cement
- Coarse
- Fine
- ...


In [None]:
df_cv = pd.DataFrame(gcv_lr.cv_results_)
df_cv

## Classification with GridSearchCV Glass Identification

In [47]:
import warnings
warnings.filterwarnings('ignore')

In [48]:
import pandas as pd
df = pd.read_csv('../Cases/Glass Identification/Glass.csv')
df.head(1)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed


In [49]:
X, y = df.drop('Type', axis=1), df['Type']

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline

scl_std, scl_mm = StandardScaler(), MinMaxScaler()
lr = LogisticRegression()

pipe = Pipeline(
    [
        ('SCL', None),
        ('LR', lr)
    ]
    
)
pipe.get_params()

{'memory': None,
 'steps': [('SCL', None), ('LR', LogisticRegression())],
 'verbose': False,
 'SCL': None,
 'LR': LogisticRegression(),
 'LR__C': 1.0,
 'LR__class_weight': None,
 'LR__dual': False,
 'LR__fit_intercept': True,
 'LR__intercept_scaling': 1,
 'LR__l1_ratio': None,
 'LR__max_iter': 100,
 'LR__multi_class': 'auto',
 'LR__n_jobs': None,
 'LR__penalty': 'l2',
 'LR__random_state': None,
 'LR__solver': 'lbfgs',
 'LR__tol': 0.0001,
 'LR__verbose': 0,
 'LR__warm_start': False}

In [51]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
l1_ratio = np.linspace(0.001, 1, 5)

params = {'LR__penalty': ['l1', None, 'l2', 'elasticNet'],
          'LR__multi_class': ['ovr', 'multinomial'],
          'LR__l1_ratio': l1_ratio,
          'LR__solver': ['lbfgs', 'sag', 'saga', 'liblinear', 'newton-cg', 'newton-cholesky'],
         'SCL' : [None, scl_std, scl_mm]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
gcv_lr = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='accuracy')
gcv_lr.fit(X, y)
print(gcv_lr.best_params_)
print(gcv_lr.best_score_)

{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ovr', 'LR__penalty': None, 'LR__solver': 'newton-cg', 'SCL': None}
0.6637873754152824


In [52]:
test = pd.read_csv('../Cases/Glass Identification/tst_Glass.csv')
y_pred = gcv_lr.predict(test)
test['Type'] = y_pred
test

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0,building_windows_non_float_processed
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44,building_windows_float_processed
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22,building_windows_float_processed
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1,building_windows_float_processed
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001,building_windows_float_processed
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89,building_windows_float_processed


In [53]:
df_cv = pd.DataFrame(gcv_lr.cv_results_)
df_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LR__l1_ratio,param_LR__multi_class,param_LR__penalty,param_LR__solver,param_SCL,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000999,5.135693e-07,0.0,0.0,0.001,ovr,l1,lbfgs,,"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ov...",,,,,,,,331
1,0.002798,3.994704e-04,0.0,0.0,0.001,ovr,l1,lbfgs,StandardScaler(),"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ov...",,,,,,,,331
2,0.002199,3.999711e-04,0.0,0.0,0.001,ovr,l1,lbfgs,MinMaxScaler(),"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ov...",,,,,,,,331
3,0.000800,3.997803e-04,0.0,0.0,0.001,ovr,l1,sag,,"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ov...",,,,,,,,331
4,0.002586,4.797498e-04,0.0,0.0,0.001,ovr,l1,sag,StandardScaler(),"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'ov...",,,,,,,,331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.002598,4.894536e-04,0.0,0.0,1.0,multinomial,elasticNet,newton-cg,StandardScaler(),"{'LR__l1_ratio': 1.0, 'LR__multi_class': 'mult...",,,,,,,,331
716,0.002598,4.894537e-04,0.0,0.0,1.0,multinomial,elasticNet,newton-cg,MinMaxScaler(),"{'LR__l1_ratio': 1.0, 'LR__multi_class': 'mult...",,,,,,,,331
717,0.000999,2.336015e-07,0.0,0.0,1.0,multinomial,elasticNet,newton-cholesky,,"{'LR__l1_ratio': 1.0, 'LR__multi_class': 'mult...",,,,,,,,331
718,0.002798,3.995657e-04,0.0,0.0,1.0,multinomial,elasticNet,newton-cholesky,StandardScaler(),"{'LR__l1_ratio': 1.0, 'LR__multi_class': 'mult...",,,,,,,,331
