### The Goal of this notebook is to train and optimize models.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# prepare data

In [3]:
input_data = pd.read_csv("../data/processed/Graduate - IRISES dataset (2019-06)_READY_TO_TRAIN.csv", sep="|")

In [4]:
target_values = input_data["Species"]
predictors = input_data.drop("Species", axis=1)

In [5]:
# split data into train and test sets

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target_values, test_size=0.30, random_state=101)

### Models training, evaluation and save

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [9]:
import joblib

### To predict flower species couple of ML algorithms are considered:
    1) K-Neighbors Classifier
    2) Random Fores
    3) Supporting Vector Machines
    4) Logistic Regression
    5) XGBoost

#### 1) K-Neighbors Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# In the case of KNN at first, odd numbers are generated in a range from 0 to 15.

In [12]:
k_numbers = []
for k in range(15):
    if k % 2 != 0:
        k_numbers.append(k)

In [13]:
# Searching for optimal parameters

In [14]:
knn_parameters = {"n_neighbors": k_numbers, "p": [1, 2]}
knn = KNeighborsClassifier()
cv = GridSearchCV(knn, knn_parameters, cv=5, iid=False, n_jobs=-1)
cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid=False, n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13], 'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
knn_best_params = cv.best_params_ # best hyperparameters
knn_best_estimator = cv.best_estimator_ # best model

#### 2) Random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rff_parameters = {"n_estimators": (100, 1000)}
rff = RandomForestClassifier(random_state=101)
cv = GridSearchCV(rff, rff_parameters, cv=5, iid=False, n_jobs=-1)
cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=101,
                                 

In [18]:
rff_best_params = cv.best_params_ # best hyperparameters
rff_best_estimator = cv.best_estimator_ # best model

#### 3) Supporting Vector Machines

In [19]:
from sklearn.svm import SVC

In [20]:
svm_parameters = {"C": (0.1, 10), "kernel": ("rbf", "poly", "sigmoid"), "degree": (2, 5), "gamma": ("scale", "auto")}
svm = SVC(random_state=101, probability=True)
cv = GridSearchCV(svm, svm_parameters, n_jobs=-1, iid=False, cv=5)
cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=True, random_state=101, shrinking=True,
                           tol=0.001, verbose=False),
             iid=False, n_jobs=-1,
             param_grid={'C': (0.1, 10), 'degree': (2, 5),
                         'gamma': ('scale', 'auto'),
                         'kernel': ('rbf', 'poly', 'sigmoid')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [21]:
svm_best_params = cv.best_params_ # best hyperparameters
svm_best_estimator = cv.best_estimator_ # best model

#### 4) Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
l1_range = np.arange(0, 1, 0.1)
c_range = np.arange(0.1, 10)
lr_params = {"l1_ratio": l1_range, "C": c_range}
lr = LogisticRegression(penalty="elasticnet", random_state=101, solver="saga", max_iter=1000, l1_ratio=0.5,
                        multi_class="auto")
cv = GridSearchCV(lr, lr_params, n_jobs=-1, iid=False, cv=5)
cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=0.5,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='elasticnet',
                                          random_state=101, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid=False, n_jobs=-1,
             param_grid={'C': array([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1]),
                         'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [24]:
lr_best_params = cv.best_params_ # best hyperparameters
lr_best_estimator = cv.best_estimator_ # best model

#### 5) XGBoost

In [25]:
from xgboost.sklearn import XGBClassifier

In [26]:
xgbc_parameters = {"n_estimators": (100, 1000)}
xgbc = XGBClassifier(random_state=101)
cv = GridSearchCV(xgbc, xgbc_parameters, cv=5, iid=False, n_jobs=-1)
cv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=101, reg_alpha=0,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid=False, n_jobs=-1, param_grid={'n_estimators': (100, 1000)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, ve

In [27]:
xgbc_best_params = cv.best_params_ # best hyperparameters
xgbc_best_estimator = cv.best_estimator_ # best model

In [28]:
# Drop models to ../models/[model_name]

In [29]:
models = {"KNN": knn_best_estimator, 
          "RFF": rff_best_estimator, 
          "SVM": svm_best_estimator, 
          "LR": lr_best_estimator, 
          "XGBC": xgbc_best_estimator}

In [30]:
for model_name in models.keys():
    path = "../models/{}".format(model_name)
    obj_model = models.get(model_name)
    joblib.dump(obj_model, path)