In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [32]:
# Load the data
def load_data(file_path):
    data = pd.read_excel(file_path)
    return data

# Detecting outliers using Z-scores
def detect_outliers_zscore(data, columns, threshold=3):
    outliers = {}
    for col in columns:
        z_scores = np.abs((data[col] - data[col].mean()) / data[col].std())
        outliers[col] = z_scores > threshold
    return outliers

# Preprocess the data
def preprocess_data(data):
    # Handle missing values
    data['Nc'] = data['Nc'].replace('*****', '0').astype('float')

    # Robust Scaling (less sensitive to outliers)
    scaler = RobustScaler()
    data[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']] = scaler.fit_transform(data[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']])

    # Drop unnecessary columns
    data = data.drop(columns=['string_external_id', 'Uniprot_ID', 'CBI'], axis=1)

    return data

# Train the model with polynomial features
def train_model(X, y):
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()
    model.fit(X_poly, y)

    return model

if __name__ == "__main__":
    file_path = r"C:\Users\shraj\OneDrive\Desktop\freelance-20231007T095553Z-001\freelance\eleg_Final_complied.xls"
    data = load_data(file_path)
    data = preprocess_data(data)

    X = data.drop('abundance', axis=1)
    y = data['abundance']

    model = train_model(X, y)

    # Transform the entire dataset with polynomial features
    X_poly = PolynomialFeatures(degree=2).fit_transform(X)

    # Predict on the entire dataset
    y_pred = model.predict(X_poly)

    mse = mean_squared_error(y, y_pred)
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mse)

    print("R-squared (R²):", r_squared)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)


R-squared (R²): 0.9291390185046546
Mean Absolute Error (MAE): 62.248802380869705
Root Mean Squared Error (RMSE): 89.5050455456519


# Top 10 percent

In [33]:
data1 = pd.read_excel(r"C:\Users\shraj\OneDrive\Desktop\freelance-20231007T095553Z-001\freelance\eleg_Final_complied.xls")
sorted_data = data1.sort_values(by='abundance', ascending=False)
Top10 = sorted_data.head(888)

# Model Building

In [35]:
# Detecting outliers using Z-scores
def detect_outliers_zscore(Top10, columns, threshold=3):
    outliers = {}
    for col in columns:
        z_scores = np.abs((Top10[col] - Top10[col].mean()) / Top10[col].std())
        outliers[col] = z_scores > threshold
    return outliers

# Preprocess the data
def preprocess_data(Top10):
    # Handle missing values
    Top10['Nc'] = Top10['Nc'].replace('*****', '0').astype('float')

    # Robust Scaling (less sensitive to outliers)
    scaler = RobustScaler()
    Top10[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']] = scaler.fit_transform(Top10[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']])

    # Drop unnecessary columns
    Top10 = Top10.drop(columns=['string_external_id', 'Uniprot_ID', 'CBI'], axis=1)

    return Top10

# Train the model with polynomial features
def train_model(X, y):
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()
    model.fit(X_poly, y)

    return model

if __name__ == "__main__":
    Top10 = preprocess_data(Top10) 

    X = Top10.drop('abundance', axis=1)
    y = Top10['abundance']

    model = train_model(X, y)

    # Transform the entire dataset with polynomial features
    X_poly = PolynomialFeatures(degree=2).fit_transform(X)

    # Predict on the entire dataset
    y_pred = model.predict(X_poly)

    mse = mean_squared_error(y, y_pred)
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mse)

    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R²):", r_squared)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)


Mean Squared Error (MSE): 6.929506209477764e-21
R-squared (R²): 1.0
Mean Absolute Error (MAE): 6.583773766406749e-11
Root Mean Squared Error (RMSE): 8.324365567103455e-11


# Bottom 10 percent

In [36]:
data2 = pd.read_excel(r"C:\Users\shraj\OneDrive\Desktop\freelance-20231007T095553Z-001\freelance\eleg_Final_complied.xls")
sorted_data1 = data2.sort_values(by='abundance', ascending=False)
bot10 = sorted_data1.head(888)

In [37]:
# Detecting outliers using Z-scores
def detect_outliers_zscore(bot10, columns, threshold=3):
    outliers = {}
    for col in columns:
        z_scores = np.abs((bot10[col] - bot10[col].mean()) / bot10[col].std())
        outliers[col] = z_scores > threshold
    return outliers

# Preprocess the data
def preprocess_data(bot10):
    # Handle missing values
    bot10['Nc'] = bot10['Nc'].replace('*****', '0').astype('float')

    # Robust Scaling (less sensitive to outliers)
    scaler = RobustScaler()
    bot10[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']] = scaler.fit_transform(bot10[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']])

    # Drop unnecessary columns
    bot10 = bot10.drop(columns=['string_external_id', 'Uniprot_ID', 'CBI'], axis=1)

    return bot10

# Train the model with polynomial features
def train_model(X, y):
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()
    model.fit(X_poly, y)

    return model

if __name__ == "__main__":
    bot10 = preprocess_data(bot10)  

    X = bot10.drop('abundance', axis=1)
    y = bot10['abundance']

    model = train_model(X, y)

    # Transform the entire dataset with polynomial features
    X_poly = PolynomialFeatures(degree=2).fit_transform(X)

    # Predict on the entire dataset
    y_pred = model.predict(X_poly)

    mse = mean_squared_error(y, y_pred)
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mse)

    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R²):", r_squared)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)


Mean Squared Error (MSE): 6.929506209477764e-21
R-squared (R²): 1.0
Mean Absolute Error (MAE): 6.583773766406749e-11
Root Mean Squared Error (RMSE): 8.324365567103455e-11


# TEST DATA

In [38]:
test_data = pd.read_excel(r"C:\Users\shraj\OneDrive\Desktop\Rubail_DM_ML.xlsx")

In [39]:
droplist = []

for i in test_data.columns:
    if i not in data.columns:
        droplist.append(i)
        print(i)
        
        
test_data = test_data.drop(droplist, axis=1)

#internal_id
string_external_id
Uniprot
NCBIi_Ids
CBI


In [42]:
# Detecting outliers using Z-scores
def detect_outliers_zscore(test_data, columns, threshold=3):
    outliers = {}
    for col in columns:
        z_scores = np.abs((test_data[col] - test_data[col].mean()) / test_data[col].std())
        outliers[col] = z_scores > threshold
    return outliers

# Preprocess the data
def preprocess_data(test_data):
    # Handle missing values
    test_data['Nc'] = test_data['Nc'].replace('*****', '0').astype('float')

    # Robust Scaling (less sensitive to outliers)
    scaler = RobustScaler()
    test_data[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']] = scaler.fit_transform(test_data[['ENC', "ENC' | all (bg)", 'Gravy', 'L_aa', 'L_sym']])

    return test_data

# Train the model with polynomial features
def train_model(X, y):
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()
    model.fit(X_poly, y)

    return model

if __name__ == "__main__":
    test_data = preprocess_data(test_data)

    X = test_data.drop('abundance', axis=1)
    y = test_data['abundance']

    model = train_model(X, y)

    # Transform the entire dataset with polynomial features
    X_poly = PolynomialFeatures(degree=2).fit_transform(X)

    # Predict on the entire dataset
    y_pred = model.predict(X_poly)

    mse = mean_squared_error(y, y_pred)
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mse)

    print("R-squared (R²):", r_squared)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)


R-squared (R²): 0.8866754247384357
Mean Absolute Error (MAE): 90.5217911727157
Root Mean Squared Error (RMSE): 133.16970540617461
