In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import \
                accuracy_score, \
                classification_report, \
                cohen_kappa_score, \
                matthews_corrcoef, \
                confusion_matrix, \
                roc_auc_score

                
import xgboost as xgb

In [2]:
def eval_metrics(y_true, y_pred):
    """Summarise a binary prediction with kappa, TPR, TNR and precision.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        dict: keys ``'kappa'``, ``'TPR'``, ``'TNR'`` and ``'precision'``.

    Note:
        The confusion matrix is unpacked into exactly four cells, so the
        inputs must contain exactly two classes between them.
    """
    # The import cell binds the individual sklearn.metrics functions, not a
    # `metrics` module, so the former `metrics.<fn>` calls raised NameError.
    kappa = cohen_kappa_score(y_true, y_pred)

    # For two classes, ravel() on the 2x2 matrix yields tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    TPR = tp / (tp + fn)          # true positive rate
    TNR = tn / (tn + fp)          # true negative rate
    precision = tp / (tp + fp)

    return {'kappa':kappa,'TPR':TPR,'TNR':TNR,'precision':precision}

def evaluate_performance(y_true,y_pred,pred_prob='NA'):
    """Collect per-class and overall metrics for a binary classifier.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        pred_prob: Scores handed to ``roc_auc_score``. Leave as the ``'NA'``
            sentinel (or pass ``None``) to skip the ROC-AUC computation.

    Returns:
        dict with precision / recall / F1 of the first (``*_0``) and second
        (``*_1``) class row of the classification report, ``accuracy``,
        ``balanced_accuracy`` (mean of the two recalls), ``cohen_kappa``,
        ``matthews_corrcoef_score``, ``roc_auc_score`` (the unchanged
        ``pred_prob`` value when ROC-AUC is skipped) and the confusion-matrix
        cells ``tn``, ``fp``, ``fn``, ``tp``.
    """
    # Ask for the report as a dict instead of parsing its text form: the text
    # report rounds every value to two decimals, which also degraded the
    # balanced accuracy derived from the recalls.
    report = classification_report(y_true, y_pred, output_dict=True)

    # The per-class entries come first in the report, ahead of the averages.
    class_0, class_1 = list(report.values())[:2]

    precision_0 = float(class_0['precision'])
    recall_0 = float(class_0['recall'])
    f1_0 = float(class_0['f1-score'])

    precision_1 = float(class_1['precision'])
    recall_1 = float(class_1['recall'])
    f1_1 = float(class_1['f1-score'])

    balanced_accuracy = (recall_0 + recall_1)/2
    acc_score = accuracy_score(y_true,y_pred)
    cohen_kappa = cohen_kappa_score(y_true,y_pred)
    matthews_corrcoef_score = matthews_corrcoef(y_true,y_pred)

    # Test for the sentinel explicitly; `pred_prob != 'NA'` is ambiguous when
    # pred_prob is a NumPy array (element-wise comparison).
    no_scores = pred_prob is None or (isinstance(pred_prob, str) and pred_prob == 'NA')
    if no_scores:
        roc_auc = pred_prob
    else:
        roc_auc = roc_auc_score(y_true,pred_prob)

    # Unpacking into four names requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    temp = dict(
        precision_0 = precision_0,
        precision_1 = precision_1,
        recall_0 = recall_0,
        recall_1 = recall_1,
        f1_0 = f1_0,
        f1_1 = f1_1,
        accuracy = acc_score,
        balanced_accuracy = balanced_accuracy,
        cohen_kappa = cohen_kappa,
        matthews_corrcoef_score = matthews_corrcoef_score,
        roc_auc_score = roc_auc,

        tn = tn,
        fp = fp,
        fn = fn,
        tp = tp
    )

    return temp


## Data Loading

In [3]:
# Seed that makes the shuffle below reproducible.
seed = 42

# Read the fingerprint table and shuffle every row (frac=1) in one step.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the file (e.g. '/path/to/fingerprint.csv') before running.
df = pd.read_csv(
    r"E:\HILLUL\Nagamani Sir\TAPOS-KU-INTERN\COMPILING-MTUNE-REQUIREMENTS\1 Cell-Line Dataset ghost VS mtune\TGF-B Train FP\AtomPairs2DCount.csv"
).sample(frac=1, random_state=seed)

# Number of rows per value of the 'Name' label column.
df['Name'].value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '/path/to/fingerprint.csv'

# Train - Test split

### 20 percent test set

In [None]:
# Hold out 20% of the rows as the test set, stratified on the 'Name' label and
# seeded with the same `seed` used for the shuffle.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Name'], random_state=seed)

val_counts = train_df['Name'].value_counts()

# value_counts() is sorted by descending frequency, so the two ends of its
# index are the majority and minority labels. Unlike idxmax()/idxmin(), this
# still gives two distinct labels when both classes have the same count.
majority_class_label = val_counts.index[0]
minority_class_label = val_counts.index[-1]

X_train, y_train = train_df.drop('Name', axis=1), train_df['Name']
X_test, y_test = test_df.drop('Name', axis=1), test_df['Name']

# Report the minority / majority row counts of each split.
for split_name, split_labels in (('X_train', y_train), ('X_test', y_test)):
    n_minority = int((split_labels == minority_class_label).sum())
    n_majority = int((split_labels == majority_class_label).sum())
    print(f'{split_name} ---> {minority_class_label} :', n_minority, end='\t')
    print(f'{majority_class_label} :', n_majority)

### Model Training

In [None]:
# XGBoost classifier with library defaults; only the random seed is pinned.
# (A tuned configuration would be unpacked here as **best_params.)
xgb_model = xgb.XGBClassifier(random_state=42)

# Fit on the training split without per-round logging.
xgb_model.fit(X=X_train, y=y_train, verbose=0)

## Standard Threshold thres = 0.5

In [None]:
# Empty containers: one for the per-method metric columns, one for the
# per-row training predictions. Later cells fill them in.
train_performance_df, train_pred_df = pd.DataFrame(), pd.DataFrame()

### Training Set Performance

In [None]:
# Locate the predict_proba column that belongs to the minority class by
# looking the label up in classes_, rather than using the label value itself
# as a positional index (which only works when labels happen to be 0..n-1).
minority_col = list(xgb_model.classes_).index(minority_class_label)

pred_prob = xgb_model.predict_proba(X_train)[:, minority_col].tolist()
y_true = y_train.tolist()

# Standard decision rule: minority class when its probability reaches 0.5.
y_pred = [minority_class_label if p >= 0.5 else majority_class_label for p in pred_prob]

train_pred_df['y_true'] = y_true
train_pred_df['pred_prob'] = pred_prob
train_pred_df['standard'] = y_pred

# Threshold stored as a float, consistent with the M-Tune and external-set rows.
vals = dict(method='Standard', thres=0.5)
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals.update(temp)

# Append the metrics as the next integer-named column of the summary table.
train_performance_df[len(train_performance_df.columns)] = vals
train_performance_df

### Testing Set Performance

In [None]:
# Empty containers: one for the per-method metric columns, one for the
# per-row test predictions. Later cells fill them in.
test_performance_df, test_pred_df = pd.DataFrame(), pd.DataFrame()

In [None]:
# Locate the predict_proba column that belongs to the minority class by
# looking the label up in classes_, rather than using the label value itself
# as a positional index (which only works when labels happen to be 0..n-1).
minority_col = list(xgb_model.classes_).index(minority_class_label)

pred_prob = xgb_model.predict_proba(X_test)[:, minority_col].tolist()
y_true = y_test.tolist()

# Standard decision rule: minority class when its probability reaches 0.5.
y_pred = [minority_class_label if p >= 0.5 else majority_class_label for p in pred_prob]

test_pred_df['y_true'] = y_true
test_pred_df['pred_prob'] = pred_prob
test_pred_df['standard'] = y_pred

# Threshold stored as a float, consistent with the M-Tune and external-set rows.
vals = dict(method='Standard', thres=0.5)
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals.update(temp)

# Append the metrics as the next integer-named column of the summary table.
test_performance_df[len(test_performance_df.columns)] = vals
test_performance_df

# M-Tune

In [None]:
# M-Tune cut-off: the mean of the probabilities predicted on the training set.
mean_thres = train_pred_df.loc[:, 'pred_prob'].mean()
mean_thres

In [None]:
# Preview the first five rows of the training predictions.
train_pred_df.head(n=5)

### Training set Performance

In [None]:
# Re-label the training rows with the M-Tune cut-off instead of 0.5.
pred_prob = train_pred_df['pred_prob'].tolist()
train_pred_df['M-Tune'] = [
    minority_class_label if prob >= mean_thres else majority_class_label
    for prob in pred_prob
]

y_true = train_pred_df['y_true'].tolist()
y_pred = train_pred_df['M-Tune'].tolist()

# Score the M-Tune labels and append them as the next summary column.
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals = {'method': 'M-Tune', 'thres': mean_thres, **temp}
train_performance_df[train_performance_df.shape[1]] = vals
train_performance_df

### Testing Set Performance

In [None]:
# Re-label the test rows with the M-Tune cut-off instead of 0.5.
pred_prob = test_pred_df['pred_prob'].tolist()
test_pred_df['M-Tune'] = [
    minority_class_label if prob >= mean_thres else majority_class_label
    for prob in pred_prob
]

y_true = test_pred_df['y_true'].tolist()
y_pred = test_pred_df['M-Tune'].tolist()

# Score the M-Tune labels and append them as the next summary column.
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals = {'method': 'M-Tune', 'thres': mean_thres, **temp}
test_performance_df[test_performance_df.shape[1]] = vals
test_performance_df

# Visualization

In [None]:
# All training-set predicted probabilities, plus the same column split by the
# true label (rows with y_true == 0 and rows with y_true == 1).
pred_proba = train_pred_df['pred_prob']
data0, data1 = (
    train_pred_df.loc[train_pred_df['y_true'] == true_label, 'pred_prob']
    for true_label in (0, 1)
)

In [None]:
# Seaborn dark-grid theme for the figures.
sns.set(style="darkgrid")

# Histogram with a KDE overlay of every training-set predicted probability.
ax = sns.histplot(pred_proba, kde=True)

# Axis labels and title, then switch the grid lines off.
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Distribution Plot for NumPy Array')
ax.grid(False)

# Write the figure to disk before displaying it.
ax.figure.savefig('TGF-B-AtomPairs2DCount - Pred-Prob.png', dpi=300)
plt.show()

## Zooming in on Scale

In [None]:
# Upper bound of the zoomed x-range. (An earlier 0.015 assignment was
# overwritten immediately and has been dropped.)
upper_limit = 0.07

# Keep only probabilities inside [0, upper_limit], both ends inclusive.
filtered_data = pred_proba[pred_proba.between(0, upper_limit)]

# Histogram (counts) with KDE overlay of the zoomed range.
plt.figure(figsize=(10, 6), facecolor='w')
sns.histplot(filtered_data, bins=100, kde=True, stat='count')
plt.title(f'Distribution of Values Between 0 and {upper_limit}')
plt.xlabel('Probability')
plt.ylabel('Frequency')

# Dashed vertical line at the M-Tune threshold.
plt.axvline(x=mean_thres, color='b', linestyle='--', linewidth=2)
plt.grid(False)

plt.savefig('TGF-B-AtomPairs2DCount - Zoomed-Pred-Prob.png', dpi=300)
plt.show()

In [None]:
lower_limit = 0
upper_limit = 0.07

# Keep only probabilities inside [lower_limit, upper_limit] for each true class.
filtered_data0 = data0[(data0 >= lower_limit) & (data0 <= upper_limit)]
filtered_data1 = data1[(data1 >= lower_limit) & (data1 <= upper_limit)]

plt.figure(figsize=(10, 6), facecolor='white')

# One density histogram (with KDE) per true class, overlaid on the same axes.
for class_probs, class_colour, class_name in (
    (filtered_data0, 'blue', 'Label 0'),
    (filtered_data1, 'red', 'Label 1'),
):
    sns.histplot(
        class_probs,
        bins=100, kde=True, color=class_colour, label=class_name, stat="density"
    )

plt.xlabel('Probability')
plt.ylabel('Density')
plt.title('Distribution Plot for Two Arrays')

# Dashed vertical line at the M-Tune threshold, labelled for the legend.
plt.axvline(
    x=mean_thres,
    color='g',
    linestyle='--',
    linewidth=2,
    label=f"Mean - {round(mean_thres,4)}"
)

# Build the legend once, after every labelled artist has been drawn.
plt.legend()
plt.grid(False)
plt.savefig('TGF-B-AtomPairs2DCount - Zoomed Class-wise-Prediction Probabality.png', dpi = 300 )
plt.show()

## TP / TN / FP / FN - Frequency

In [None]:
# Module-level value; plot_distribution does not read it because its own
# `upper_limit` parameter shadows this name.
upper_limit = 0.03

def plot_distribution(data, title,figname,upper_limit=1, threshold=None):
    """Plot, save and show a histogram of probabilities with a threshold line.

    Args:
        data: Probabilities to plot; must support boolean-mask indexing
            (``data[(data >= 0) & (data <= upper_limit)]``).
        title: Figure title.
        figname: Output file name without extension; ``'.png'`` is appended.
        upper_limit: Values outside ``[0, upper_limit]`` are dropped before
            plotting. Defaults to 1.
        threshold: x-position of the dashed vertical line. When ``None`` the
            module-level ``mean_thres`` is used, as before.

    Returns:
        None. The figure is written to ``figname + '.png'`` and displayed.
    """
    # Fall back to the notebook-level threshold so existing calls are unchanged.
    if threshold is None:
        threshold = mean_thres

    filtered_data = data[(data >= 0) & (data <= upper_limit)]

    plt.figure(figsize=(10, 6))
    sns.histplot(filtered_data, bins=100, kde=True, stat = 'count')
    plt.title(title)
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.axvline(x=threshold, color='b', linestyle='--', linewidth=2)
    plt.grid(False)
    plt.savefig(figname+'.png',dpi=300)
    plt.show()

In [None]:
# Confusion matrix of the M-Tune labels on the test set.
confusion_matrix(
    *(test_pred_df[column].tolist() for column in ('y_true', 'M-Tune'))
)

In [None]:
# True positives: test rows whose true label is 1 and whose M-Tune label
# agrees with the true label.
tp = test_pred_df.loc[
    test_pred_df['y_true'].eq(1)
    & test_pred_df['M-Tune'].eq(test_pred_df['y_true'])
]

In [None]:
# Number of true positives, then the histogram of their probabilities.
print(tp.shape[0])
plot_distribution(
    tp['pred_prob'],
    'Distribution of True Positives',
    'TGF-B-AtomPairs2DCount_TP',
)

In [None]:
# True negatives: test rows whose true label is 0 and whose M-Tune label
# agrees with the true label.
tn = test_pred_df.loc[
    test_pred_df['y_true'].eq(0)
    & test_pred_df['M-Tune'].eq(test_pred_df['y_true'])
]

In [None]:
# Number of true negatives, then the histogram of their probabilities,
# restricted to the range [0, 0.010].
print(tn.shape[0])
plot_distribution(
    tn['pred_prob'],
    'Distribution of True Negatives',
    'TGF-B-AtomPairs2DCount_TN',
    upper_limit=0.010,
)

In [None]:
# False positives: test rows whose true label is 0 but whose M-Tune label
# differs from the true label.
fp = test_pred_df.loc[
    test_pred_df['y_true'].eq(0)
    & test_pred_df['M-Tune'].ne(test_pred_df['y_true'])
]

In [None]:
# Number of false positives, then the histogram of their probabilities.
print(fp.shape[0])
plot_distribution(
    fp['pred_prob'],
    'Distribution of False Positives',
    'TGF-B-AtomPairs2DCount_FP',
)

In [None]:
# False negatives: test rows whose true label is 1 but whose M-Tune label
# differs from the true label.
fn = test_pred_df.loc[
    test_pred_df['y_true'].eq(1)
    & test_pred_df['M-Tune'].ne(test_pred_df['y_true'])
]

In [None]:
# Number of false negatives, then the histogram of their probabilities.
print(fn.shape[0])
plot_distribution(
    fn['pred_prob'],
    'Distribution of False Negatives',
    'TGF-B-AtomPairs2DCount_FN',
)

## External Dataset Evaluation

In [None]:
# Read the external fingerprint table.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the file (e.g. '/path/to/fingerprint.csv') before running.
ext_df = pd.read_csv(
    filepath_or_buffer=r'E:\HILLUL\Nagamani Sir\TAPOS-KU-INTERN\COMPILING-MTUNE-REQUIREMENTS\1 Cell-Line Dataset ghost VS mtune\TGF-B External FP\External_AtomPairs2DCount.csv'
)
ext_df

In [None]:
# Split the external table into the feature columns and the 'Name' label.
ext_X, ext_y = ext_df.drop(columns='Name'), ext_df['Name']

In [None]:
# Locate the predict_proba column that belongs to the minority class by
# looking the label up in classes_, rather than using the label value itself
# as a positional index (which only works when labels happen to be 0..n-1).
minority_col = list(xgb_model.classes_).index(minority_class_label)

ext_pred_df = pd.DataFrame()
ext_pred_df['y_true'] = ext_y.tolist()
ext_pred_df['pred_prob'] = xgb_model.predict_proba(ext_X)[:, minority_col].tolist()

# Label every external row twice: with the standard 0.5 cut-off and with the
# M-Tune cut-off (mean_thres).
for column_name, cut_off in (('Standard', 0.5), ('M-Tune', mean_thres)):
    ext_pred_df[column_name] = [
        minority_class_label if p >= cut_off else majority_class_label
        for p in ext_pred_df['pred_prob'].tolist()
    ]
ext_pred_df

In [None]:
# Empty summary table for the external-set metrics; later cells add one
# column per method. (The variable name's spelling is kept because the
# following cells refer to it.)
ext_preformance_df = pd.DataFrame(data=None)

In [None]:
# Pull the true labels, the Standard (0.5) labels and the probabilities.
y_true, y_pred, pred_prob = (
    ext_pred_df[column].tolist() for column in ('y_true', 'Standard', 'pred_prob')
)

# Score them and append the result as the next summary column.
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals = {'method': 'Standard', 'thres': 0.5, **temp}
ext_preformance_df[ext_preformance_df.shape[1]] = vals
ext_preformance_df

In [None]:
# Pull the true labels, the M-Tune labels and the probabilities.
y_true, y_pred, pred_prob = (
    ext_pred_df[column].tolist() for column in ('y_true', 'M-Tune', 'pred_prob')
)

# Score them and append the result as the next summary column.
temp = evaluate_performance(y_true, y_pred, pred_prob)
vals = {'method': 'M-Tune', 'thres': mean_thres, **temp}
ext_preformance_df[ext_preformance_df.shape[1]] = vals
ext_preformance_df