In [20]:
# importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Functions import Basic_info_func, Remove_outliers_with_lof, Select_k_best_features, Apply_pca
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Path = /OneDrive/Desktop/MS-AAi/Course_500_Probability/Project_AAI500-A1 

In [10]:
#loading dataset 
df = pd.read_csv('./Data/train.csv')

In [11]:
independent_variables = df.drop('critical_temp', axis = 1)
target_variable = df['critical_temp']
    
train_X, test_X, train_y, test_y = train_test_split(independent_variables, target_variable, 
                                                    test_size=0.2, random_state=0, shuffle=True)


### Outlier Detection and removal

In [12]:
# # Determine the number of features and calculate the number of subplots needed
# num_features = len(train_X)

# # Create the subplots
# sns.set_style('darkgrid')
# fig, ax = plt.subplots(9, 9, figsize=(15, 10))

# # Flatten the axes array for easy iteration
# ax_flat = ax.flatten()

# # Iterate over each element property and corresponding axis
# for property_name, axis in zip(train_X, ax_flat):
#     sns.kdeplot(data=df, x=property_name , ax=axis)

# # Hide empty subplots if any
# for axis in ax_flat[num_features:]:
#     axis.axis('off')

# plt.tight_layout()
# plt.suptitle('Distribution of independent features', fontsize=16, y=1.05)
# plt.show()


Notice that we have features that seem to have some extereme values, such as wtd_range_FusionHeat and mean_Density. In order to tackle these extremet points we can use a Machine lerning approch named local outlier factor that can help us predicting outliers and removing them. 

In [13]:
new_train_X, new_train_y  = Remove_outliers_with_lof(train_X, train_y, contamination = 0.05)

Shape before outlier removal:
(17010, 82)

Shape after outlier removal:
(16159, 82)


#### Features Selection

In the data analysis part we observed that our entire data has many highly colinear features that causes multi colinearity. 


In [26]:
# Assuming train_X and train_y are already defined and contain the training data

k = 30  # Number of features to select
X_new, scores = Select_k_best_features(new_train_X, new_train_y, k=k, score_func=mutual_info_regression)

print(f"Selected top {k} features:")




Scores of top 30 features in descending order:
std_fie                        0.930362
gmean_Density                  0.924645
entropy_atomic_mass            0.911777
std_ThermalConductivity        0.910415
range_ElectronAffinity         0.905139
range_fie                      0.896992
entropy_atomic_radius          0.889684
wtd_gmean_Valence              0.873904
entropy_Density                0.872597
wtd_mean_Valence               0.870127
range_Density                  0.869347
std_ElectronAffinity           0.860537
entropy_FusionHeat             0.860131
entropy_ElectronAffinity       0.858371
mean_ThermalConductivity       0.851045
gmean_FusionHeat               0.850557
wtd_gmean_Density              0.828712
std_atomic_radius              0.824413
gmean_ThermalConductivity      0.823578
gmean_ElectronAffinity         0.821363
entropy_Valence                0.820703
range_atomic_mass              0.812493
range_atomic_radius            0.810315
mean_FusionHeat                0.

In [28]:
feature_set = X_new.columns
test_X = test_X[feature_set]

In [29]:
scaler = StandardScaler()
scaled_train_X = scaler.fit_transform(X_new)
scaled_test_X = scaler.transform(test_X)

### Linear Model

In [30]:
# Step 2: Initialize the linear regression model
simple_linear_regression = LinearRegression()

# Step 3: Fit the model on the scaled training data
simple_linear_regression.fit(scaled_train_X, new_train_y)

# Step 4: Predict on the training set
train_preds = simple_linear_regression.predict(scaled_train_X)

# Training evaluation
print('Training results', '\n', '- '*20)
RMSE = np.sqrt(mean_squared_error(new_train_y, train_preds))
MAE = mean_absolute_error(new_train_y, train_preds)
R2_score = r2_score(new_train_y, train_preds)

print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')


# Testing Results
test_preds = simple_linear_regression.predict(scaled_test_X)

#Testing evaluation
print('Testing results', '\n', '- '*20)
RMSE_test = np.sqrt(mean_squared_error(test_y, test_preds))
MAE_test = mean_absolute_error(test_y, test_preds)
R2_score_test = r2_score(test_y, test_preds)

print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')

Training results 
 - - - - - - - - - - - - - - - - - - - - 
Training RMSE: 19.79507
Training MAE: 15.08588
Training R2_score: 0.66478
Testing results 
 - - - - - - - - - - - - - - - - - - - - 
Testing RMSE: 20.13932
Testing MAE: 15.46959
Testing R2_score: 0.65152


### SVR

In [31]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 2: Initialize the Gradient Boosting Regressor model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Step 3: Fit the model on the scaled training data
gb_model.fit(scaled_train_X, new_train_y)

# Step 4: Predict on the training set
train_preds = gb_model.predict(scaled_train_X)

# Training evaluation
print('Training results', '\n', '- '*20)
RMSE = np.sqrt(mean_squared_error(new_train_y, train_preds))
MAE = mean_absolute_error(new_train_y, train_preds)
R2_score = r2_score(new_train_y, train_preds)

print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')

# Step 5: Predict on the testing set
test_preds = gb_model.predict(scaled_test_X)

# Testing evaluation
print('Testing results', '\n', '- '*20)
RMSE_test = np.sqrt(mean_squared_error(test_y, test_preds))
MAE_test = mean_absolute_error(test_y, test_preds)
R2_score_test = r2_score(test_y, test_preds)

print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')

Training results 
 - - - - - - - - - - - - - - - - - - - - 
Training RMSE: 9.61332
Training MAE: 6.44550
Training R2_score: 0.92094
Testing results 
 - - - - - - - - - - - - - - - - - - - - 
Testing RMSE: 11.60691
Testing MAE: 7.54370
Testing R2_score: 0.88425


### XGBoost

In [32]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 2: Initialize the XGBoost Regressor model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.9, max_depth=3, random_state=42)

# Step 3: Fit the model on the scaled training data
xgb_model.fit(scaled_train_X, new_train_y)

# Step 4: Predict on the training set
train_preds = xgb_model.predict(scaled_train_X)

# Training evaluation
print('Training results', '\n', '- '*20)
RMSE = np.sqrt(mean_squared_error(new_train_y, train_preds))
MAE = mean_absolute_error(new_train_y, train_preds)
R2_score = r2_score(new_train_y, train_preds)

print(f'Training RMSE: {RMSE:.5f}')
print(f'Training MAE: {MAE:.5f}')
print(f'Training R2_score: {R2_score:.5f}')

# Step 5: Predict on the testing set
test_preds = xgb_model.predict(scaled_test_X)

# Testing evaluation
print('Testing results', '\n', '- '*20)
RMSE_test = np.sqrt(mean_squared_error(test_y, test_preds))
MAE_test = mean_absolute_error(test_y, test_preds)
R2_score_test = r2_score(test_y, test_preds)

print(f'Testing RMSE: {RMSE_test:.5f}')
print(f'Testing MAE: {MAE_test:.5f}')
print(f'Testing R2_score: {R2_score_test:.5f}')

Training results 
 - - - - - - - - - - - - - - - - - - - - 
Training RMSE: 6.76219
Training MAE: 4.27993
Training R2_score: 0.96088
Testing results 
 - - - - - - - - - - - - - - - - - - - - 
Testing RMSE: 11.67695
Testing MAE: 6.82261
Testing R2_score: 0.88285


### LightGBM

In [42]:
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Prepare the dataset for LightGBM
train_data = lgb.Dataset(scaled_train_X, label=new_train_y)
test_data = lgb.Dataset(scaled_test_X, label=test_y, reference=train_data)

# Define the parameters
params = {
    'objective': 'regression',  # correct objective for regression
    'metric': ['rmse', 'mae'],  # appropriate metrics
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.01,
    'verbose': -1
}

# Train the model with early stopping callback
model = lgb.train(
    params, 
    train_data, 
    num_boost_round=1000, 
    valid_sets=[train_data, test_data], 
    callbacks=[early_stopping(stopping_rounds=100)]
)

# Make predictions
train_preds = model.predict(scaled_train_X, num_iteration=model.best_iteration)
test_preds = model.predict(scaled_test_X, num_iteration=model.best_iteration)

# Evaluate the model
train_rmse = mean_squared_error(new_train_y, train_preds, squared=False)
train_mae = mean_absolute_error(new_train_y, train_preds)
train_r2 = r2_score(new_train_y, train_preds)

print('Training results:')
print(f'Training RMSE: {train_rmse:.5f}')
print(f'Training MAE: {train_mae:.5f}')
print(f'Training R2_score: {train_r2:.5f}')

test_rmse = mean_squared_error(test_y, test_preds, squared=False)
test_mae = mean_absolute_error(test_y, test_preds)
test_r2 = r2_score(test_y, test_preds)

print('\nTesting results:')
print(f'Testing RMSE: {test_rmse:.5f}')
print(f'Testing MAE: {test_mae:.5f}')
print(f'Testing R2_score: {test_r2:.5f}')


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 9.00926	training's l1: 5.91684	valid_1's rmse: 11.15	valid_1's l1: 7.10398
Training results:
Training RMSE: 9.00926
Training MAE: 5.91684
Training R2_score: 0.93056

Testing results:
Testing RMSE: 11.15004
Testing MAE: 7.10398
Testing R2_score: 0.89318


In [35]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 6, 8, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200, 500, 1000]
}

# Create a base model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', seed=42)

# Instantiate the grid search model
grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, 
                                 scoring='neg_mean_squared_error', n_iter=50, 
                                 cv=5, verbose=1, n_jobs=-1, random_state=42)

# Fit the grid search to the data
grid_search.fit(scaled_train_X, new_train_y)

# Best parameters
print(f"Best parameters: {grid_search.best_params_}")

# Train with the best parameters
best_params = grid_search.best_params_
model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', seed=42)
model.fit(scaled_train_X, new_train_y)

# Make predictions
train_preds = model.predict(scaled_train_X)
test_preds = model.predict(scaled_test_X)

# Evaluate the model
train_rmse = mean_squared_error(new_train_y, train_preds, squared=False)
train_mae = mean_absolute_error(new_train_y, train_preds)
train_r2 = r2_score(new_train_y, train_preds)

print('Training results:')
print(f'Training RMSE: {train_rmse:.5f}')
print(f'Training MAE: {train_mae:.5f}')
print(f'Training R2_score: {train_r2:.5f}')

test_rmse = mean_squared_error(test_y, test_preds, squared=False)
test_mae = mean_absolute_error(test_y, test_preds)
test_r2 = r2_score(test_y, test_preds)

print('\nTesting results:')
print(f'Testing RMSE: {test_rmse:.5f}')
print(f'Testing MAE: {test_mae:.5f}')
print(f'Testing R2_score: {test_r2:.5f}')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 0.6, 'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.03, 'colsample_bytree': 0.6}
Training results:
Training RMSE: 5.14040
Training MAE: 2.96495
Training R2_score: 0.97739

Testing results:
Testing RMSE: 10.10572
Testing MAE: 5.71245
Testing R2_score: 0.91226
