# Data Loading and Preprocessing

In [None]:
import pandas as pd

# Path to the source workbook (note the space before ".xlsx" in the file name literal).
file_path = 'Project data set 1 (for reports 1 and 3) .xlsx'

# How the workbook is split: the first sheets form the training set,
# the sheets immediately after them form the test set.
N_TRAIN_SHEETS = 12
N_TEST_SHEETS = 5

# Open the workbook in a context manager so the underlying file handle is
# released as soon as every sheet has been read (it was never closed before).
with pd.ExcelFile(file_path) as excel_file:
    sheet_names = excel_file.sheet_names

    # One DataFrame per sheet for combined-1 (training)
    combined_1_data = [
        pd.read_excel(excel_file, sheet_name=sheet)
        for sheet in sheet_names[:N_TRAIN_SHEETS]
    ]

    # One DataFrame per sheet for combined-2 (test)
    combined_2_data = [
        pd.read_excel(excel_file, sheet_name=sheet)
        for sheet in sheet_names[N_TRAIN_SHEETS:N_TRAIN_SHEETS + N_TEST_SHEETS]
    ]

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the workbook has too few sheets.
if not combined_1_data or not combined_2_data:
    raise ValueError(
        f"Expected more than {N_TRAIN_SHEETS} sheets in {file_path!r}, "
        f"found {len(sheet_names)}"
    )

# Concatenate data into two DataFrames. ignore_index=True gives each combined
# frame a fresh 0..n-1 index instead of repeating every sheet's own row labels.
combined_Train = pd.concat(combined_1_data, ignore_index=True)
combined_Test = pd.concat(combined_2_data, ignore_index=True)

# Save the combined data into two new Excel files
combined_Train.to_excel('combined_Train.xlsx', index=False)
combined_Test.to_excel('combined_Test.xlsx', index=False)

# Display the first few rows of both datasets
print("combined_Train Data:")
print(combined_Train.head())

print("\ncombined_Test Data:")
print(combined_Test.head())

# Summary Statistics

In [None]:
# Descriptive statistics of the training data
train_summary = combined_Train.describe()
print(train_summary)

In [None]:
# Descriptive statistics of the test data
test_summary = combined_Test.describe()
print(test_summary)

# Correlation Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlations between the training-data columns, drawn as an
# annotated heatmap with values shown to two decimals.
train_corr = combined_Train.corr()
sns.heatmap(train_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Feature Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Separate the target column ('BIS') from the predictors in both datasets
y_train = combined_Train['BIS']
x_train = combined_Train.drop(columns='BIS')
y_test = combined_Test['BIS']
x_test = combined_Test.drop(columns='BIS')

# Standardize the predictors: statistics are learned on the training data
# only and then applied unchanged to the test data.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Fit a random forest on all standardized predictors to rank them
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train_scaled, y_train)
rf_importances = rf.feature_importances_

# Table pairing every predictor name with its random-forest importance
features = pd.DataFrame(
    {
        'Feature': x_train.columns,
        'RandomForest Importance': rf_importances,
    }
)

# Show the importance table
print(features)

# Random Forest Feature Importance Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# The 'LinearRegression Coefficients' column is dropped only if it is present
# (errors='ignore'): the `features` table built in the feature-selection cell
# contains just 'Feature' and 'RandomForest Importance', so an unconditional
# drop raises KeyError and stops a top-to-bottom run of the notebook.
features_rf = features.drop(columns='LinearRegression Coefficients', errors='ignore')

# Most important feature first, so it appears at the top of the bar chart
features_rf = features_rf.sort_values(by='RandomForest Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='RandomForest Importance', y='Feature', data=features_rf)
plt.title('Feature Importances from Random Forest')
plt.show()

# Model Training and Prediction

# Random Forest and Linear Regression

In [None]:
# Random Forest trained on a subset of the standardized feature columns
rf_feature_idx = [0, 3, 4, 6]  # positional column indices used for modelling
rf_train_subset = x_train_scaled[:, rf_feature_idx]
rf_test_subset = x_test_scaled[:, rf_feature_idx]

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(rf_train_subset, y_train)
rf_predictions = rf_model.predict(rf_test_subset)

In [None]:
# Linear Regression fitted on four of the standardized feature columns
lr_feature_idx = [0, 3, 4, 6]  # positional column indices used for modelling
lr_model = LinearRegression()
lr_model.fit(x_train_scaled[:, lr_feature_idx], y_train)
lr_predictions = lr_model.predict(x_test_scaled[:, lr_feature_idx])

# Fitted parameters of the linear model
intercept = lr_model.intercept_
coefficients = lr_model.coef_

# Table pairing each selected feature name with its fitted coefficient
linearRegression_model = pd.DataFrame(
    {
        'Feature': x_train.columns[lr_feature_idx],
        'Coefficients': coefficients,
    }
)

print(linearRegression_model)

# Final Model

# Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr

# Random Forest on the test set: mean squared error, R^2 and Pearson correlation
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
rf_pearson_result = pearsonr(y_test, rf_predictions)
rf_pearson = rf_pearson_result[0]  # first element is the correlation coefficient

# One metric per line, in the order MSE, R^2, Pearson r
print(rf_mse, rf_r2, rf_pearson, sep='\n')

In [None]:
# Linear Regression on the test set: mean squared error, R^2 and Pearson correlation
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)
lr_pearson_result = pearsonr(y_test, lr_predictions)
lr_pearson = lr_pearson_result[0]  # first element is the correlation coefficient

# One metric per line, in the order MSE, R^2, Pearson r
print(lr_mse, lr_r2, lr_pearson, sep='\n')

# Scatter Plots of Predictions vs. Actual Values

In [None]:
# Predicted vs. actual values for the Random Forest model
plt.figure(figsize=(12, 6))
plt.scatter(y_test, rf_predictions, label='Random Forest', alpha=0.5, color='blue')

# Dashed identity line across the observed range of y_test
rf_line_ends = [y_test.min(), y_test.max()]
plt.plot(rf_line_ends, rf_line_ends, 'k--', lw=2)

plt.title('BIS Index vs. New Index')
plt.xlabel('BIS Index')
plt.ylabel('New Index')
plt.legend()
plt.show()

In [None]:
# Predicted vs. actual values for the Linear Regression model
plt.figure(figsize=(12, 6))
plt.scatter(y_test, lr_predictions, label='Linear Regression', alpha=0.5, color='green')

# Dashed identity line across the observed range of y_test
lr_line_ends = [y_test.min(), y_test.max()]
plt.plot(lr_line_ends, lr_line_ends, 'k--', lw=2)

plt.title('BIS Index vs. New Index')
plt.xlabel('BIS Index')
plt.ylabel('New Index')
plt.legend()
plt.show()

# Bland-Altman Plots

In [None]:
import numpy as np
def bland_altman_plot(data1, data2, title):
    """Draw a Bland-Altman (difference vs. mean) plot for paired measurements.

    Each point is one pair: its x value is the mean of the two measurements
    and its y value is their difference (``data1 - data2``). Horizontal lines
    mark the mean difference (gray) and the mean difference +/- 1.96 standard
    deviations of the differences (red).

    Parameters
    ----------
    data1, data2 : array-like
        Paired measurements with the same number of elements. Pairing is
        purely positional: pandas index labels are ignored.
    title : str
        Title of the figure.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Coerce to flat float arrays so the subtraction below is positional.
    # With two pandas Series, `data1 - data2` aligns on index labels, which
    # yields NaNs / a different length than the positional mean when the
    # labels differ.
    first = np.asarray(data1, dtype=float).ravel()
    second = np.asarray(data2, dtype=float).ravel()
    if first.shape != second.shape:
        raise ValueError(
            f"data1 and data2 must have the same number of elements, "
            f"got {first.size} and {second.size}"
        )

    mean = np.mean([first, second], axis=0)
    diff = first - second
    md = np.mean(diff)
    sd = np.std(diff)  # numpy default ddof=0 (population standard deviation)

    plt.figure(figsize=(10, 5))
    plt.scatter(mean, diff, color='blue')
    plt.axhline(md, color='gray', linestyle='--')
    plt.axhline(md + 1.96*sd, color='red', linestyle='--')
    plt.axhline(md - 1.96*sd, color='red', linestyle='--')
    plt.title(title)
    plt.xlabel('Mean of Two Measurements')
    plt.ylabel('Difference Between Two Measurements')
    plt.show()

In [None]:
# Agreement between the measured values (y_test) and the Random Forest predictions
bland_altman_plot(
    data1=y_test,
    data2=rf_predictions,
    title='Bland-Altman Plot for Random Forest',
)

In [None]:
# Agreement between the measured values (y_test) and the Linear Regression predictions
bland_altman_plot(
    data1=y_test,
    data2=lr_predictions,
    title='Bland-Altman Plot for Linear Regression',
)