<div style="text-align:center">
    <img src="../../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 13: AL-CNT-SiNT</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
data = pd.read_csv("data.csv")

##### Remove unnecessary columns

In [None]:
# Remove the "Run Order" column. Rebinding `data` (instead of inplace=True)
# combined with errors="ignore" keeps this cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
data = data.drop(columns="Run Order", errors="ignore")

#### EDA

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.nunique()

In [None]:
data.describe()

In [None]:
data.isnull().sum()

In [None]:
data.groupby('C').count()

In [None]:
data.groupby('M').count()

#### Storytelling - Visualization

In [None]:
sns.lineplot(x='T', y='E', hue='M', data=data)

In [None]:
sns.lineplot(x='S', y='E', hue='M',data=data)

In [None]:
sns.lineplot(x='C', y='E', hue= 'M',data=data)

In [None]:
# Correlation matrix over the numeric columns only. 'C' and 'M' are still
# non-numeric at this point (they are encoded further down), and pandas >= 2.0
# raises on such columns instead of silently skipping them as older versions did.
corr = data.select_dtypes(include="number").corr()

# Draw on an explicitly created axes rather than relying on pyplot's
# "current axes" state.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, annot=True, square=True, ax=ax)
plt.show()

In [None]:
corr

##### Convert non-numeric values (Encoding the independent variables)

In [None]:
# One-hot encode the categorical columns 'C' and 'M'.
# NOTE(review): `data` is rebound to the encoded frame, so re-running this cell
# without reloading the CSV fails because 'C' and 'M' no longer exist.
categorical_cols = ['C', 'M']
data = pd.get_dummies(data, columns=categorical_cols)

In [None]:
data.head()

#### Train and test (Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

##### Global functions

In [None]:
def plot_test_prediction(y_test, y_pred):
    """Scatter predicted values against actual values, colored by the actual value.

    Parameters
    ----------
    y_test : array-like
        True target values; used for the x-axis and for the color scale.
    y_pred : array-like
        Predictions aligned element-wise with ``y_test``; used for the y-axis.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Named `cmap` rather than `map` so the builtin `map` is not shadowed.
    cmap = sns.cubehelix_palette(as_cmap=True)
    fig, ax = plt.subplots()
    points = ax.scatter(y_test, y_pred, c=y_test, cmap=cmap)
    fig.colorbar(points, ax=ax)
    # Label the scatter axes directly instead of whichever axes pyplot
    # currently considers "active".
    ax.set_xlabel("Y Test")
    ax.set_ylabel("Predicted Y")
    plt.show()

In [None]:
from prettytable import PrettyTable

def print_metrics_table(y_test, y_pred):
    """Print R2, MAE, MSE and RMSE of a set of predictions as an ASCII table.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values aligned with ``y_test``.

    Returns
    -------
    None
        The table is printed by ``generate_ascii_table``.
    """
    # Compute the MSE once and derive the RMSE from it instead of scoring twice.
    mse = metrics.mean_squared_error(y_test, y_pred)
    # `metric_rows` (not `data`) so the notebook-level DataFrame name is not reused.
    metric_rows = [[
        metrics.r2_score(y_test, y_pred),
        metrics.mean_absolute_error(y_test, y_pred),
        mse,
        np.sqrt(mse),
    ]]
    df = pd.DataFrame(
        metric_rows,
        columns=['R2 Score', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error'],
    )

    generate_ascii_table(df)

def generate_ascii_table(df):
    """Print a DataFrame as a PrettyTable and hand the table back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels become the table header and whose rows
        become the table rows.

    Returns
    -------
    PrettyTable
        The populated table that was printed.
    """
    table = PrettyTable()
    header = df.columns.tolist()
    table.field_names = header
    for record in df.values:
        table.add_row(record)
    print(table)
    return table

In [None]:
# Target is column 'E'; features are every remaining column except 'UTS'.
# NOTE(review): 'UTS' looks like a second measured response rather than an
# input, which would explain excluding it from the features — confirm.
y = data['E']
X = data.drop(['E', 'UTS'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
X_train

In [None]:
len(X_train)

In [None]:
y_train

In [None]:
X_test

In [None]:
len(X_test)

#### Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
# NOTE: X_train / X_test are rebound to the scaler's output here, so later
# cells see the transformed values rather than the original frames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train

In [None]:
X_test

#### Regression models

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
def PolynomialRegression(degree=2, **kwargs):
    """Build a pipeline that expands features polynomially, then fits a linear model.

    Parameters
    ----------
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``PolynomialFeatures`` followed by ``LinearRegression``.
    """
    feature_expansion = PolynomialFeatures(degree)
    linear_model = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, linear_model)

In [None]:
# Candidate models. `names`, `regressors` and `parameters` are parallel lists:
# entry i of each one describes the same model.
names = [
    'Multiple-linear',
    'Polynomial',
    'Gradient boosting',
    'Lasso',
    'Ridge',
    'Random forest',
    'SVR',
    'Bayesian Ridge',
    'Decision Tree',
    'XGBoost',
    'MLP'
]

regressors = [
    LinearRegression(),
    PolynomialRegression(),
    GradientBoostingRegressor(),
    Lasso(),
    Ridge(),
    RandomForestRegressor(),
    SVR(),
    BayesianRidge(),
    DecisionTreeRegressor(),
    XGBRegressor(),
    MLPRegressor()
]

# Hyper-parameter grids, one per regressor (an empty dict means "defaults only").
parameters = [
    {
    },
    {
        # 'linearregression__normalize' is intentionally not searched: the
        # parameter was removed from LinearRegression in scikit-learn 1.2 and
        # makes the grid search fail there. Its default was False, and the
        # inputs are already standardized before this cell.
        'polynomialfeatures__degree': [2, 3, 4],
        'linearregression__fit_intercept': [True, False]
    },
    {
        'learning_rate': [0.01,0.02,0.03,0.04],
        'subsample'    : [0.9, 0.5, 0.2, 0.1],
        'n_estimators' : [100,500,1000, 1500],
        'max_depth'    : [4,6,8,10]
    },
    {
        'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    },
    {
        'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    },
    {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # 1.0 means "use all features", which is what 'auto' meant for
        # regressors before that value was removed in scikit-learn 1.3.
        'max_features': [1.0, 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [5]
    },
    {
        'kernel' : ['rbf'],
        'C': [1, 10, 100, 1000],
        'epsilon': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.0001, 0.001, 0.01, 0.1, 1]
    },
    {
    },
    {
        'random_state': [0]
    },
    {
        'n_estimators': [1000],
        'learning_rate': [0.08],
        'subsample': [0.75],
        'colsample_bytree': [1],
        'max_depth': [7],
        'gamma': [0],
    },
    {
        # (100,) with the trailing comma is a one-element tuple; the previous
        # (100) was just the integer 100.
        'hidden_layer_sizes': [(100,), (100, 50), (100, 50, 25)],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'random_state': [1],
        'max_iter': [10000]
    }
]

# zip() silently truncates to the shortest list, so fail loudly if the three
# parallel lists ever get out of sync.
assert len(names) == len(regressors) == len(parameters), \
    "names, regressors and parameters must have the same length"

results = []

for name, regressor, params in zip(names, regressors, parameters):
    # Grid search with GridSearchCV's default cross-validation and scoring.
    gsearch = GridSearchCV(regressor, param_grid=params, n_jobs=-1)
    gsearch.fit(X_train, y_train)
    y_pred = gsearch.predict(X_test)
    # Score of the refitted best estimator on the held-out test split.
    score = gsearch.score(X_test, y_test)

    results.append({
        'Name' : name,
        'Model': gsearch,
        'Parameters': gsearch.best_params_,
        'Score': score,
        # Mean cross-validated score of the best parameter set, kept next to
        # the test score so the two are not confused.
        'CV Score': gsearch.best_score_,
        'Predictions': y_pred
    })

    print(f"{name} training finished.")

In [None]:
# Rank the models in place, highest 'Score' first. 'Score' is the value
# returned by GridSearchCV.score on the test split in the training loop.
results.sort(key = lambda x: x['Score'], reverse = True)

In [None]:
# Report each model: chosen parameters, score, residual histogram,
# predicted-vs-actual scatter and the metrics table.
for result in results:
    print(f"Model: {result['Name']}")
    print(f"Parameters: {result['Parameters']}")
    # 'Score' comes from scoring on (X_test, y_test), so it is the test-set
    # R2, not a cross-validation score; the label now says so.
    print(f"Test R2 Score: {result['Score']}")
    sns.histplot(y_test - result['Predictions'])
    plot_test_prediction(y_test, result['Predictions'])
    print_metrics_table(y_test, result['Predictions'])
    print("*" * 50)

#### Best predicted output

In [None]:
# Collect the distinct levels of every input factor.
# 'T' and 'S' are read from the data frame.
T_uniq_values = data['T'].unique()
S_uniq_values = data['S'].unique()
# 'C' and 'M' are hard-coded, presumably because the original columns were
# replaced by dummy columns during encoding.
# NOTE(review): keep these lists in sync with the categories in the CSV.
C_uniq_values = ['(3,3)', '(4,4)', '(5,5)']
M_uniq_values = ['A', 'B']

print(T_uniq_values)
print(S_uniq_values)
print(C_uniq_values)
print(M_uniq_values)

In [None]:
import random

import itertools

# Enumerate every (T, S, C, M) combination exactly once. This replaces the
# earlier approach of drawing random tuples and de-duplicating through a set
# until all combinations happened to appear, which was slow and produced a
# different row order on every run.
rows = list(
    itertools.product(T_uniq_values, S_uniq_values, C_uniq_values, M_uniq_values)
)
all_combinations_count = len(rows)
print(all_combinations_count)

# The name `all_rand_data` is kept because later cells refer to it; the rows
# are now in deterministic product order rather than random order.
all_rand_data = pd.DataFrame(rows, columns=['T', 'S', 'C', 'M'])

all_rand_data.head()

In [None]:
# One-hot encode the candidate grid the same way as the training data.
categorical_cols = ['C', 'M']
all_rand_data_dummies = pd.get_dummies(all_rand_data, columns=categorical_cols)

# Put the columns in exactly the order of the training features `X`, since
# the scaler and the models expect that layout. Selecting by `X.columns`
# raises a KeyError if an expected feature is missing from the grid, rather
# than continuing with misaligned columns.
all_rand_data_dummies = all_rand_data_dummies[X.columns]

all_rand_data_dummies

In [None]:
all_rand_data_transformed = scaler.transform(all_rand_data_dummies)

##### Max E value

In [None]:
# Take the first entry of `results` as the best model.
# NOTE: this is the top-scoring model only if the cell that sorts `results`
# by 'Score' (descending) has been run before this one.
best_model = results[0]['Model']
best_score = results[0]['Score']
best_model_name = results[0]['Name']
best_params = results[0]['Parameters']

In [None]:
# Predict E for every candidate input combination.
predicted_E_values = best_model.predict(all_rand_data_transformed)

# Find the winning row once and reuse the index, instead of recomputing
# argmax and calling predict() a second time on that single row.
best_idx = np.argmax(predicted_E_values)
# Index with a list so the result stays a length-1 array, matching what the
# extra predict() call on a single row used to return (and print).
predicted_E_max = predicted_E_values[[best_idx]]

print(f"Best Model: {best_model_name}")
print(f"Best R2 Score: {best_score}")
print(f"Best Parameters: {best_params}")
print("\n")

print("Input values:")
print(all_rand_data_dummies.iloc[best_idx])

print("\n")

print("Expected max E:")
print(predicted_E_max)