In [13]:
# Importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Functions import (Basic_info_func, Remove_outliers_with_lof, Select_k_best_features, Adjusted_r2_score,
                       Evaluation_results)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.activations import tanh
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
import xgboost as xgb
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.model_selection import train_test_split

# Notebook-wide configuration.
# Lift pandas' display limits so every row and column of a frame is rendered.
# Caution: displaying a large frame will then print it in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Autoreload mode 2 re-imports modules before each cell executes.
# NOTE(review): presumably enabled so edits to the local `Functions` module are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Path = /OneDrive/Desktop/MS-AAi/Course_500_Probability/Project_AAI500-A1 

In [14]:
# Read the pre-split training and test sets from the local Data directory
train_df = pd.read_csv('./Data/train_df.csv')
test_df = pd.read_csv('./Data/test_df.csv')

# Separate the predictors from the regression target (`critical_temp`)
X_train, y_train = train_df.drop(columns='critical_temp'), train_df['critical_temp']
X_test, y_test = test_df.drop(columns='critical_temp'), test_df['critical_temp']

In [15]:
# Filter outliers out of the training split only; the test split is not passed to the helper.
# NOTE(review): the helper presumably wraps sklearn's LocalOutlierFactor - confirm in Functions.py.
new_train_X, new_train_y = Remove_outliers_with_lof(
    X_train,
    y_train,
    contamination=0.1,
)

Shape before outlier removal:
(18073, 82)

Shape after outlier removal:
(16265, 82)


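#### Feature Selection

In the data analysis part we observed that the dataset contains many highly collinear features, which causes multicollinearity. To reduce it, the features are standardized and projected onto a smaller set of uncorrelated principal components (PCA) before any model is fitted.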


In [16]:
# Standardize the features, then project them onto 30 principal components.
# Both transformers are fitted on the outlier-filtered training data only and are
# then applied unchanged to the test data, so the test split never influences the fit.
scaler = StandardScaler()
# random_state only has an effect when scikit-learn selects a randomized SVD solver;
# pinning it keeps the projection reproducible across runs in that case.
pca = PCA(n_components=30, random_state=42)

# 1. Fit the scaler and the PCA on the training data
train_X_scaled = scaler.fit_transform(new_train_X)
pca.fit(train_X_scaled)

# 2. Project training and test data with the same fitted transformers.
#    (The original cell called fit_transform and then immediately overwrote the
#    result with transform; fitting once and transforming once is sufficient.)
train_X_pca = pca.transform(train_X_scaled)
test_X_scaled = scaler.transform(X_test)
test_X_pca = pca.transform(test_X_scaled)



In [17]:
# Baseline model: ordinary least-squares regression on the PCA components.
simple_linear_regression = LinearRegression()
simple_linear_regression.fit(train_X_pca, new_train_y)

# Number of predictors, forwarded to Evaluation_results
# (presumably for the adjusted R2 score it reports - confirm in Functions.py).
num_features = train_X_pca.shape[1]

print('Linear Regression Results')

# Training results
train_preds = simple_linear_regression.predict(train_X_pca)
train_metrics = Evaluation_results(new_train_y, train_preds, objective='train', num_features=num_features)

# Testing results
test_preds = simple_linear_regression.predict(test_X_pca)
test_metrics = Evaluation_results(y_test, test_preds, objective='test', num_features=num_features)
test_metrics


Linear Regression Results

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Training results:
Training RMSE: 18.89155
Training MAE: 14.69187
Training R2 score: 0.69736
Training Adjusted R2 score: 0.69680

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Testing results:
Testing RMSE: 19.90200
Testing MAE: 15.57628
Testing R2 score: 0.66114
Testing Adjusted R2 score: 0.65792


In [18]:
# Feed-forward neural network regressor trained on the PCA components.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(42)

# Define the neural network architecture: five tanh hidden layers of decreasing
# width, with dropout after the three widest ones.
nn_model = Sequential([
    Dense(512, input_shape=(train_X_pca.shape[1],), activation=tanh),
    Dropout(0.4),
    Dense(256, activation=tanh),
    Dropout(0.3),
    Dense(128, activation=tanh),
    Dropout(0.2),
    Dense(64, activation=tanh),
    Dense(32, activation=tanh),
    Dense(1)  # Single linear output unit for the regression target
])

# Compile with MSE as the training loss; MAE and MSE are also tracked as metrics.
# NOTE(review): learning_rate=0.05 is 50x Adam's Keras default (0.001), and the logged
# val_loss stays around 490-530 from epoch 2 to epoch 47 - consider a smaller rate.
nn_model.compile(optimizer=Adam(learning_rate=0.05),
                 loss='mean_squared_error',
                 metrics=['mae', 'mse'])

# Train the model; the last 20% of the training rows are held out for validation
history = nn_model.fit(train_X_pca, new_train_y, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

num_features = train_X_pca.shape[1]

print('Neuaral Network Results')

# Training results.
# The original cell predicted with an undefined name `model`, which raises NameError
# on a fresh kernel (or silently uses a stale model left over in the session).
NN_train_preds = nn_model.predict(train_X_pca)
train_metrics = Evaluation_results(new_train_y, NN_train_preds, objective='train', num_features=num_features)

# Testing results (kept in test_metrics so the training metrics are not overwritten)
NN_test_preds = nn_model.predict(test_X_pca)
test_metrics = Evaluation_results(y_test, NN_test_preds, objective='test', num_features=num_features)
test_metrics


Epoch 1/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 849.1191 - mae: 21.1562 - mse: 849.1191 - val_loss: 436.0328 - val_mae: 14.6757 - val_mse: 436.0328
Epoch 2/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 489.3618 - mae: 16.1159 - mse: 489.3618 - val_loss: 529.7430 - val_mae: 17.6841 - val_mse: 529.7430
Epoch 3/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 518.5899 - mae: 16.8951 - mse: 518.5899 - val_loss: 509.0418 - val_mae: 16.2764 - val_mse: 509.0418
Epoch 4/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 514.1879 - mae: 16.8569 - mse: 514.1879 - val_loss: 501.2987 - val_mae: 15.5664 - val_mse: 501.2987
Epoch 5/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 506.4456 - mae: 16.6379 - mse: 506.4456 - val_loss: 502.4236 - val_mae: 16.5817 - val_mse: 502.4236
Epoch 6/50
[1m407/407[0m [3

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 492.4550 - mae: 16.1404 - mse: 492.4550 - val_loss: 501.2061 - val_mae: 15.9886 - val_mse: 501.2061
Epoch 44/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 493.9358 - mae: 16.1363 - mse: 493.9358 - val_loss: 497.0743 - val_mae: 15.9989 - val_mse: 497.0743
Epoch 45/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 496.6648 - mae: 16.0591 - mse: 496.6648 - val_loss: 490.9183 - val_mae: 16.1636 - val_mse: 490.9183
Epoch 46/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 478.2502 - mae: 15.9725 - mse: 478.2502 - val_loss: 489.7637 - val_mae: 16.0426 - val_mse: 489.7637
Epoch 47/50
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 487.8615 - mae: 16.1093 - mse: 487.8615 - val_loss: 489.6772 - val_mae: 15.9470 - val_mse: 489.6772
Epoch 48/50
[1m407/407[0m [32m━━━━

In [19]:
# Gradient-boosted tree regressor trained on the PCA components.
# Best parameters reported by the earlier randomized hyper-parameter search:
#   {'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
# NOTE(review): the search result lists subsample=0.6, but the model below passes
# reg_lambda=0.6 and leaves subsample unset - confirm which one was intended.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, max_depth=10,
                             reg_lambda=0.6, random_state=42)

# Fit on the PCA-projected training data
xgb_model.fit(train_X_pca, new_train_y)

num_features = train_X_pca.shape[1]

print('XGBoost Results')

# Training results
xgb_train_preds = xgb_model.predict(train_X_pca)
train_metrics = Evaluation_results(new_train_y, xgb_train_preds, objective='train', num_features=num_features)

# Testing results (kept in test_metrics so the training metrics are not overwritten)
xgb_test_preds = xgb_model.predict(test_X_pca)
test_metrics = Evaluation_results(y_test, xgb_test_preds, objective='test', num_features=num_features)
test_metrics

XGBoost Results

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Training results:
Training RMSE: 4.70945
Training MAE: 2.48831
Training R2 score: 0.98119
Training Adjusted R2 score: 0.98116

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Testing results:
Testing RMSE: 11.27645
Testing MAE: 6.26300
Testing R2 score: 0.89121
Testing Adjusted R2 score: 0.89018
