In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score
from tensorflow import keras
from tensorflow.keras import layers
import logging

In [5]:
logging.basicConfig(filename="Model_selection.log", level=logging.INFO, format='%(asctime)s %(message)s',
					datefmt="%Y-%m-%d %H:%M:%S")

In [7]:
logging.info("Reading training dataset.........")
x_train =  pd.read_csv("x_train.csv")
y_train =  pd.read_csv("y_train.csv")
logging.info("Read the training data successfully.")

In [8]:
logging.info("Reading testing dataset........")
x_test =  pd.read_csv("x_test.csv")
y_test =  pd.read_csv("y_test.csv")
logging.info("Read the testing data successfully.")


In [9]:
logging.info("Reading validation dataset........")
x_valid =  pd.read_csv("x_valid.csv")
y_valid =  pd.read_csv("y_valid.csv")
logging.info("Read the validation data successfully.")

In [10]:
logging.info("Creating the pipelines.....")

In [11]:
# Creating pipelies.
pipe1 = Pipeline([("minmax_scalar", MinMaxScaler()), ("logistic_regression", LogisticRegression())])

In [12]:
pipe2 = Pipeline([("minmax_scalar", MinMaxScaler()), ("KNN", KNeighborsClassifier())])

In [13]:
pipe3 = Pipeline([("minmax_scalar", MinMaxScaler()), ("svm", SVC())])

In [14]:
pipe4 = Pipeline([("minmax_scalar", MinMaxScaler()), ("XGboost", XGBClassifier())])

In [15]:
pipe5 = Pipeline([("minmax_scalar", MinMaxScaler()), ("decision_tree", DecisionTreeClassifier())])

In [16]:
pipe6 = Pipeline([("minmax_scalar", MinMaxScaler()), ("random_forest", RandomForestClassifier())])
logging.info("Pipelines created.")

In [17]:
logging.info("Builiding an ANN model....")
def build_ann():
    
    model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[54]),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')])
    
    model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"])
    
    return model

In [18]:
ann_model = build_ann()
logging.info("ANN built and compiled.")

In [19]:
logging.info("Fitting the pipelines and ANN")
# Fitting the pipelines
pipelines = [pipe1, pipe2, pipe3, pipe4, pipe5, pipe6]

In [20]:
for pipe in pipelines:
    pipe.fit(x_train, y_train)
logging.info("All the pipelines fitted.") 

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [21]:
callback = keras.callbacks.EarlyStopping(monitor = "val_binary_accuracy", patience=3, restore_best_weights=True)

In [22]:
history = ann_model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=100, 
    callbacks = [callback])
logging.info("ANN trained and validated")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [23]:
logging.info("Predicting on validation data")
# Predicting
pred1 = pipe1.predict(x_valid)
pred2 = pipe2.predict(x_valid)
pred3 = pipe3.predict(x_valid)
pred4 = pipe4.predict(x_valid)
pred5 = pipe5.predict(x_valid)
pred6 = pipe6.predict(x_valid)
logging.info("Prediction done.")

In [24]:
logging.info("Displaying the performance metrics of all the model.......")
# Comparing the result of each pipeline and selecting the best pipeline. 
logging.info("Accuracy of Logistic_Regression {}" .format(round(accuracy_score(y_valid, pred1)*100, 2)))
logging.info("Recall of Logistic_Regression {}" .format(round(recall_score(y_valid, pred1),2)))
logging.info("===================================================================")
logging.info("Accuracy of KNN {}" .format(round(accuracy_score(y_valid, pred2)*100, 2)))
logging.info("Recall of KNN {}" .format(round(recall_score(y_valid, pred2),2)))
logging.info("===================================================================")
logging.info("Accuracy of SVC {}" .format(round(accuracy_score(y_valid, pred3)*100,2)))
logging.info("Recall of SVC {}" .format(round(recall_score(y_valid, pred3),2)))
logging.info("===================================================================")
logging.info("Accuracy of xgboost {}" .format(round(accuracy_score(y_valid, pred4)*100,2)))
logging.info("Recall of xgboost {}" .format(round(recall_score(y_valid, pred4),2)))
logging.info("===================================================================")
logging.info("Accuracy of decision_tree {}" .format(round(accuracy_score(y_valid, pred5)*100,2)))
logging.info("Recall of decision_tree {}" .format(round(recall_score(y_valid, pred5),2)))
logging.info("===================================================================")
logging.info("Accuracy of Random_forest {}" .format(round(accuracy_score(y_valid, pred6)*100,2)))
logging.info("Recall of Random_forest {}" .format(round(recall_score(y_valid, pred6),2)))
logging.info("Select the best performing model and Model Selection phase is completed.")