In [84]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [85]:
arabica = pd.read_csv('../data/train/arabica_train.csv')
arabica.drop(columns=['Unnamed: 0'], inplace=True)
arabica

Unnamed: 0,Country.of.Origin,Variety,Processing.Method,Moisture,Harvest.Year,Color,altitude_mean_meters,Category.One.Defects,Category.Two.Defects,Total.Cup.Points
0,6,0,4,0.10,2017,1,1700.000000,0,2,82.42
1,11,0,4,0.11,2012,1,900.000000,0,20,68.33
2,7,2,0,0.10,2013,1,1450.000000,0,5,83.17
3,0,16,0,0.00,2016,1,1.000000,0,0,82.42
4,15,15,4,0.10,2016,0,968.000000,0,4,84.08
...,...,...,...,...,...,...,...,...,...,...
711,16,0,4,0.12,2014,1,1550.000000,0,4,82.67
712,6,3,4,0.11,2016,1,340.380128,0,6,84.17
713,15,15,4,0.09,2014,1,550.000000,0,0,80.92
714,1,1,4,0.10,2015,1,1100.000000,0,4,83.58


In [86]:
X = arabica.drop(columns=['Total.Cup.Points'])
y = arabica['Total.Cup.Points']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state= 5)

In [88]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(572, 9)
(144, 9)
(572,)
(144,)


In [92]:
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ('pca', PCA(8)),
                       ('regressor', LinearRegression())])

linear_params = {'scaler' : [StandardScaler()],
                 'regressor': [LinearRegression()]}

en_params = {'scaler' : [StandardScaler()],
             'regressor': [ElasticNet()]}

dt_params = {'scaler' : [StandardScaler(), None],
             'regressor': [DecisionTreeRegressor()]}

knn_params = {'scaler' : [StandardScaler()],
              'regressor': [KNeighborsRegressor()],
              'regressor__n_neighbors': [3,9,15]}

svr_params = {'scaler' : [StandardScaler()],
              'regressor': [SVR()],
              'regressor__C': [1]}

In [93]:
search_space = [linear_params, en_params, dt_params, knn_params, knn_params, svr_params]

reg = GridSearchCV(estimator = pipe,
                   param_grid = search_space,
                   cv=3,
                   scoring='r2',
                   n_jobs=-1)

In [94]:
reg.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA(n_components=8)),
                                       ('regressor', LinearRegression())]),
             n_jobs=-1,
             param_grid=[{'regressor': [LinearRegression()],
                          'scaler': [StandardScaler()]},
                         {'regressor': [ElasticNet()],
                          'scaler': [StandardScaler()]},
                         {'regressor': [DecisionTreeRegressor()],
                          'scaler': [StandardScaler(), None]},
                         {'regressor': [KNeighborsRegressor()],
                          'regressor__n_neighbors': [3, 9, 15],
                          'scaler': [StandardScaler()]},
                         {'regressor': [KNeighborsRegressor()],
                          'regressor__n_neighbors': [3, 9, 15],
                          'scaler': [StandardScaler()]},
              

In [78]:
reg.best_estimator_


Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=8)),
                ('regressor', SVR(C=1))])

In [79]:
reg.best_params_

{'regressor': SVR(C=1), 'regressor__C': 1, 'scaler': StandardScaler()}

In [80]:
best_model = reg.best_estimator_

In [81]:
predictions = best_model.predict(X_test)

In [95]:
X.columns

Index(['Country.of.Origin', 'Variety', 'Processing.Method', 'Moisture',
       'Harvest.Year', 'Color', 'altitude_mean_meters', 'Category.One.Defects',
       'Category.Two.Defects'],
      dtype='object')

In [60]:
from sklearn import metrics

In [61]:
def train_test_evaluate(model_name, model, X_train, y_train, X_test, y_test):  

    model.fit(X_train, y_train)
    
    y_pred= model.predict(X_test)
    
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2 = metrics.r2_score(y_test, y_pred)
    y_test_mean= y_test.mean()
    mae_ratio= mae/y_test_mean
    rmse_ratio= rmse/y_test_mean
    
    result_df = pd.DataFrame(
        data=[[model_name, mae, mse, rmse, r2, mae_ratio, rmse_ratio]], 
        columns=["Model", 'MAE', 'MSE', 'RMSE', 'R2 Score', "MAE Ratio", "RMSE Ratio"])
    return result_df

In [82]:
train_test_evaluate("pipeline", pipe, X_train, y_train, X_test, y_test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,MAE Ratio,RMSE Ratio
0,pipeline,1.608917,5.403098,2.324457,-0.038751,0.019557,0.028255
