In [12]:
# !pip install tensorflow

In [26]:
# main.py
from Functions import Basic_info_func, Remove_outliers_with_lof, Select_k_best_features

# importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Path = /OneDrive/Desktop/MS-AAi/Course_500_Probability/Project_AAI500-A1 

In [27]:
# Read the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./Data/train.csv')

In [28]:
# Separate the regression target from the predictor columns.
target_variable = df['critical_temp']
independent_variables = df.drop(columns='critical_temp')

# 80/20 shuffled hold-out split with a fixed seed so the split is repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    independent_variables,
    target_variable,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)


### Outlier Detection and removal

In [None]:
# Number of feature COLUMNS (len(train_X) would give the number of rows, which
# made the "hide empty subplots" slice below always empty).
num_features = train_X.shape[1]

# Size the grid from the feature count so no feature is silently dropped by zip().
n_cols = 9
n_rows = max(1, int(np.ceil(num_features / n_cols)))

# Create the subplots; squeeze=False keeps `ax` 2-D even for a single row
sns.set_style('darkgrid')
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)

# Flatten the axes array for easy iteration
ax_flat = ax.flatten()

# One KDE per feature, drawn from the training split only so the plot matches
# the data that outlier removal is applied to (the test rows stay unseen).
for property_name, axis in zip(train_X.columns, ax_flat):
    sns.kdeplot(data=train_X, x=property_name, ax=axis)

# Hide any axes left over when the grid has more cells than features
for axis in ax_flat[num_features:]:
    axis.axis('off')

plt.tight_layout()
plt.suptitle('Distribution of independent features', fontsize=16, y=1.05)
plt.show()


Notice that some features appear to have extreme values, such as wtd_range_FusionHeat and mean_Density. To handle these extreme points, we can use a machine learning approach called Local Outlier Factor (LOF), which helps us identify outliers so that they can be removed.

In [None]:
# Only the training split is passed in; the test split is left untouched.
# NOTE(review): Remove_outliers_with_lof comes from the local Functions module —
# presumably `contamination` is the expected outlier fraction (as in sklearn's
# LocalOutlierFactor); confirm against its implementation.
new_train_X, new_train_y = Remove_outliers_with_lof(
    train_X,
    train_y,
    contamination=0.05,
)

#### Feature Selection

In the data analysis section, we observed that the dataset contains many highly collinear features, which causes multicollinearity.


### Linear Model 

In [None]:
# Learn mean/variance from the (outlier-filtered) training features only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(new_train_X)
scaled_train_X = scaler.transform(new_train_X)
scaled_test_X = scaler.transform(test_X)

In [None]:
# Baseline: ordinary least squares fitted on the standardized training features.
simple_linear_regression = LinearRegression().fit(scaled_train_X, new_train_y)

# --- In-sample performance ---
train_preds = simple_linear_regression.predict(scaled_train_X)
RMSE, MAE, R2_score = (
    np.sqrt(mean_squared_error(new_train_y, train_preds)),
    mean_absolute_error(new_train_y, train_preds),
    r2_score(new_train_y, train_preds),
)

print('Training results', '\n', '- '*20)
print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')


# --- Hold-out performance ---
test_preds = simple_linear_regression.predict(scaled_test_X)
RMSE_test, MAE_test, R2_score_test = (
    np.sqrt(mean_squared_error(test_y, test_preds)),
    mean_absolute_error(test_y, test_preds),
    r2_score(test_y, test_preds),
)

print('Testing results', '\n', '- '*20)
print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')


#### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Gradient-boosted trees with a fixed seed, fitted on the standardized training features.
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(scaled_train_X, new_train_y)

# --- In-sample performance ---
train_preds = gb_model.predict(scaled_train_X)
RMSE, MAE, R2_score = (
    np.sqrt(mean_squared_error(new_train_y, train_preds)),
    mean_absolute_error(new_train_y, train_preds),
    r2_score(new_train_y, train_preds),
)

print('Training results', '\n', '- '*20)
print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')

# --- Hold-out performance ---
test_preds = gb_model.predict(scaled_test_X)
RMSE_test, MAE_test, R2_score_test = (
    np.sqrt(mean_squared_error(test_y, test_preds)),
    mean_absolute_error(test_y, test_preds),
    r2_score(test_y, test_preds),
)

print('Testing results', '\n', '- '*20)
print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')


### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# XGBoost regressor with squared-error objective and a fixed seed.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.9,
    max_depth=3,
    random_state=42,
)
xgb_model.fit(scaled_train_X, new_train_y)

# --- In-sample performance ---
train_preds = xgb_model.predict(scaled_train_X)
RMSE, MAE, R2_score = (
    np.sqrt(mean_squared_error(new_train_y, train_preds)),
    mean_absolute_error(new_train_y, train_preds),
    r2_score(new_train_y, train_preds),
)

print('Training results', '\n', '- '*20)
print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')

# --- Hold-out performance ---
test_preds = xgb_model.predict(scaled_test_X)
RMSE_test, MAE_test, R2_score_test = (
    np.sqrt(mean_squared_error(test_y, test_preds)),
    mean_absolute_error(test_y, test_preds),
    r2_score(test_y, test_preds),
)

print('Testing results', '\n', '- '*20)
print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')


### NN

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable (every other model in this notebook is seeded too).
tf.keras.utils.set_random_seed(42)

# Fully connected regression network: four hidden ReLU layers of decreasing
# width, with dropout after the first three for regularisation.
model = Sequential([
    Dense(256, activation='relu', input_shape=(scaled_train_X.shape[1],)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # Single linear output unit for the regression target
])

# Optimise mean squared error with Adam; also track MAE and MSE per epoch
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mae', 'mse'])

# Print the model summary to understand the architecture and number of parameters
model.summary()

# Train the model; 20% of the training rows are held out by Keras for validation
history = model.fit(scaled_train_X, new_train_y, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate on training set. predict() returns shape (n, 1); flatten to (n,) so
# predictions line up one-to-one with the 1-D target.
train_preds = model.predict(scaled_train_X).ravel()
train_rmse = np.sqrt(mean_squared_error(new_train_y, train_preds))
train_mae = mean_absolute_error(new_train_y, train_preds)
train_r2 = r2_score(new_train_y, train_preds)

print('Training results:')
print(f'Training RMSE: {train_rmse:.5f}')
print(f'Training MAE: {train_mae:.5f}')
print(f'Training R2_score: {train_r2:.5f}')

# Evaluate on test set
test_preds = model.predict(scaled_test_X).ravel()
test_rmse = np.sqrt(mean_squared_error(test_y, test_preds))
test_mae = mean_absolute_error(test_y, test_preds)
test_r2 = r2_score(test_y, test_preds)

print('\nTesting results:')
print(f'Testing RMSE: {test_rmse:.5f}')
print(f'Testing MAE: {test_mae:.5f}')
print(f'Testing R2_score: {test_r2:.5f}')


In [None]:
!pip install lightgbm

In [None]:
# LightGBM is imported here because `lgb` was never brought into scope elsewhere
import lightgbm as lgb
from lightgbm import early_stopping

# Carve a validation set out of the training data for early stopping, so the
# test set is never used to choose the number of boosting rounds.
fit_X, valid_X, fit_y, valid_y = train_test_split(
    scaled_train_X, new_train_y, test_size=0.2, random_state=0, shuffle=True
)

train_data = lgb.Dataset(fit_X, label=fit_y)
valid_data = lgb.Dataset(valid_X, label=valid_y, reference=train_data)

# Baseline settings (everything else is left at LightGBM's defaults).
# NOTE(review): the original cell used an undefined `params`; tune as needed.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 42,
    'verbose': -1,
}

# Train with early stopping: stop when the validation metric has not improved
# for 100 consecutive rounds (at most 1000 rounds).
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[early_stopping(stopping_rounds=100)]
)

# Predict with the best iteration found by early stopping
train_preds = model.predict(scaled_train_X, num_iteration=model.best_iteration)
test_preds = model.predict(scaled_test_X, num_iteration=model.best_iteration)

# Evaluate the model. RMSE is computed as sqrt(MSE) because the `squared=False`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
train_rmse = np.sqrt(mean_squared_error(new_train_y, train_preds))
train_mae = mean_absolute_error(new_train_y, train_preds)
train_r2 = r2_score(new_train_y, train_preds)

print('Training results:')
print(f'Training RMSE: {train_rmse:.5f}')
print(f'Training MAE: {train_mae:.5f}')
print(f'Training R2_score: {train_r2:.5f}')

test_rmse = np.sqrt(mean_squared_error(test_y, test_preds))
test_mae = mean_absolute_error(test_y, test_preds)
test_r2 = r2_score(test_y, test_preds)

print('\nTesting results:')
print(f'Testing RMSE: {test_rmse:.5f}')
print(f'Testing MAE: {test_mae:.5f}')
print(f'Testing R2_score: {test_r2:.5f}')


In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values sampled by the randomized search
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 6, 8, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200, 500, 1000]
}

# Base model; `random_state` matches the seeding style used for the earlier XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized search: 50 sampled configurations, 5-fold CV, scored by negative MSE
grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                 scoring='neg_mean_squared_error', n_iter=50,
                                 cv=5, verbose=1, n_jobs=-1, random_state=42)

# Fit the search on the training data
grid_search.fit(scaled_train_X, new_train_y)

# Best parameters
print(f"Best parameters: {grid_search.best_params_}")

# The search already refits the best configuration on the full training data
# (refit=True is the default), so reuse that estimator instead of training again.
best_params = grid_search.best_params_
model = grid_search.best_estimator_

# Make predictions
train_preds = model.predict(scaled_train_X)
test_preds = model.predict(scaled_test_X)

# Evaluate the model. RMSE is computed as sqrt(MSE) because the `squared=False`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
train_rmse = np.sqrt(mean_squared_error(new_train_y, train_preds))
train_mae = mean_absolute_error(new_train_y, train_preds)
train_r2 = r2_score(new_train_y, train_preds)

print('Training results:')
print(f'Training RMSE: {train_rmse:.5f}')
print(f'Training MAE: {train_mae:.5f}')
print(f'Training R2_score: {train_r2:.5f}')

test_rmse = np.sqrt(mean_squared_error(test_y, test_preds))
test_mae = mean_absolute_error(test_y, test_preds)
test_r2 = r2_score(test_y, test_preds)

print('\nTesting results:')
print(f'Testing RMSE: {test_rmse:.5f}')
print(f'Testing MAE: {test_mae:.5f}')
print(f'Testing R2_score: {test_r2:.5f}')
