In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import(
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

In [None]:
# Load the training split of the Liver Patient Dataset (LPD) and preview it.
# The same file is read again further down with encoding='unicode_escape'; the
# same encoding is used here so both reads decode the file identically and this
# first read does not depend on the default UTF-8 decoding.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact location exists.
df=pd.read_csv('D:/AI & ML/ML project/liver disecs prediction/dataset/Liver Patient Dataset (LPD)_train.csv',encoding='unicode_escape')
df.head()

In [None]:
# One hand-entered patient record, later wrapped in a single-row DataFrame.
# The variable names are the short column names this notebook assigns when it
# renames the CSV headers (AGE, GEN, TB, ...).
# NOTE(review): GEN=1 presumably follows the label encoding applied to the
# training data further down — confirm which gender maps to 1.
AGE=65
GEN=1
TB=0.9
DB=0.3
AAP=202
SGPT=22
SGOT=19
TP=6.8
ALBA=4.1
AGR=1.2

# Column order must match the order of the values packed into the array below.
columns=['AGE','GEN','TB','DB','AAP','SGPT','SGOT','TP','ALBA','AGR']
# NOTE(review): this name shadows the built-in input(); kept because it is a notebook-level variable.
input=np.array([AGE,GEN,TB,DB,AAP,SGPT,SGOT,TP,ALBA,AGR])
# Add a leading axis so the ten values form one row with ten columns.
input_df=pd.DataFrame(input[np.newaxis,:],columns=columns)

In [None]:
# Re-read the training CSV with an explicit encoding, then print the raw CSV
# headers next to the short column names of the sample record for comparison.
csv_path='D:/AI & ML/ML project/liver disecs prediction/dataset/Liver Patient Dataset (LPD)_train.csv'
df=pd.read_csv(csv_path,encoding='unicode_escape')
for frame in (df,input_df):
    print(frame.columns)

In [None]:
# Mapping from the verbose CSV headers to the short column names used in the
# rest of the notebook.
# NOTE(review): 'Total Bilrirubin' is spelled differently from 'Direct Bilirubin'
# on the next line — confirm each key matches its CSV header character for
# character, because a key that matches no column is skipped by rename.
r={
    'Age of the patient':'AGE',
    'Gender of the patient':'GEN',
    'Total Bilrirubin':'TB',
    'Direct Bilirubin':'DB',
    'Alkphos Alkaline Phosphotase':'AAP',
    'Sgpt Alamine Aminotransferase':'SGPT',
    'Sgot Aspartate Aminotransferase':'SGOT',
    'Total Protiens':'TP',
    'ALB Albumin':'ALBA',
    'A/G Ratio Albumin and Globulin Ratio':'AGR',
    'Result':'Result',
}

In [None]:
# Replace the verbose CSV headers with the short names defined in `r` (in place).
# NOTE(review): DataFrame.rename skips mapping keys that are not present in the
# frame without raising, so a header that differs from its key in `r` stays
# unrenamed — compare against the printed df.columns.
df.rename(columns=r,inplace=True)

In [None]:
# Preview the first ten rows after renaming.
df.head(10)

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Recode the Result column: 1 stays 1, 2 becomes 0.
# NOTE(review): which clinical outcome each original code stands for is not
# visible in this notebook — confirm against the dataset description.
d_target={1:1,2:0}
df.replace(to_replace={'Result':d_target},inplace=True)
# Class counts after recoding.
df['Result'].value_counts()

In [None]:
# Summary statistics of the frame after the Result recoding.
df.describe()

In [None]:
# Split the columns by dtype: numeric ones versus object (string) ones.
df_num=df.select_dtypes(include=[np.number])
df_cat=df.select_dtypes(include=[object])

# Drop the last numeric column from the feature list.
# NOTE(review): this assumes the target (Result) is the final numeric column — confirm.
num_cols=df_num.columns[:-1].to_numpy()
cat_cols=df_cat.columns.to_numpy()

for col_group in (num_cols,cat_cols):
    print(col_group)

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Missing-value count per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Remove exact duplicate rows (mutates df in place).
df.drop_duplicates(inplace=True)

In [None]:
# Summary statistics after de-duplication.
df.describe()

In [None]:
# Missing-value count per column after de-duplication.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Same information as a percentage of rows per column.
df.isna().mean().mul(100).sort_values(ascending=False)

In [None]:
# Mean-impute every numeric feature whose share of missing values is below 15%.
# Columns at or above that share are left untouched here.
for col in num_cols:
    missing_pct=df[col].isna().mean()*100
    if not (missing_pct<15):
        continue
    cur_mean=df[col].mean()  # Series.mean skips NaN by default
    df[col]=df[col].fillna(cur_mean)

In [None]:
# Percentage of missing values per column after mean imputation.
df.isna().mean().mul(100).sort_values(ascending=False)

In [None]:
# Absolute missing-value counts after mean imputation.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Drop every row that still contains a missing value (mutates df in place).
df.dropna(inplace=True)

In [None]:
# Summary statistics after dropping incomplete rows.
df.describe()

In [None]:
# Both missing-value views should now report zero for every column.
df.isna().mean().mul(100).sort_values(ascending=False)

In [None]:
df.isna().sum().sort_values(ascending=False)

In [None]:
# Preview the cleaned rows.
df.head(10)

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Class counts after cleaning.
df.Result.value_counts()

In [None]:
# Label-encode GEN: fit the encoder on the column, then overwrite the column
# with the resulting integer codes.
enc=LabelEncoder()
enc.fit(df['GEN'])
df['GEN']=enc.transform(df['GEN'])

In [None]:
# Print the number of distinct values held by each column.
for feature in df.columns:
    distinct_values=df[feature].unique()
    print(feature,":",len(distinct_values))

In [None]:
# Show the distinct values GEN holds after label encoding.
print(df['GEN'].unique())

In [None]:


# Report the quartiles, IQR and 1.5*IQR box-plot fences of every numeric feature.
# This cell only prints; it does not modify df.
for column in num_cols:
    # Both quartiles in one quantile call
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1

    # Fences sit 1.5 * IQR outside the quartiles
    box_min = q1 - (1.5 * iqr)
    box_max = q3 + (1.5 * iqr)

    # One print call, one report line per entry
    report_lines = (
        f"Column: {column}",
        f"Q1 (First Quartile): {q1}",
        f"Q3 (Third Quartile): {q3}",
        f"IQR (Interquartile Range): {iqr}",
        f"Box Plot Minimum: {box_min}",
        f"Box Plot Maximum: {box_max}",
        "--------------------------",
    )
    print(*report_lines, sep="\n")


In [None]:
# Blank out IQR outliers: for every numeric feature, any value outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is replaced with NaN. Both fences are computed
# before either assignment, so the second mask uses the same bounds as the first.
for col in num_cols:
    Q1, Q3 = df.loc[:,col].quantile([0.25,0.75]).values
    IQR = Q3-Q1
    box_max = Q3+(1.5*IQR)
    box_min = Q1-(1.5*IQR)
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    # and raises AttributeError there.
    df.loc[df[col]<box_min, col] = np.nan
    df.loc[df[col]>box_max, col] = np.nan

In [None]:
# Number of values per column that were blanked out as outliers.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Drop every row in which at least one feature was flagged as an outlier.
df.dropna(inplace=True)

In [None]:
# Should report zero missing values for every column.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Summary statistics after outlier removal.
df.describe()

In [None]:
# Class counts after outlier removal.
df.Result.value_counts()

In [None]:
# (rows, columns) after outlier removal.
df.shape

In [None]:
# Bar chart of the class balance of the Result column.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(6, 4))  # Adjust the figure size if needed
sns.countplot(x='Result', data=df, palette='pastel', ax=ax)
ax.set_title('Distribution of Result')
ax.set_xlabel('Result')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Preview the final cleaned frame.
df.head(10)

In [None]:
# Container types of the two column-name collections built earlier.
print(type(num_cols))
print(type(cat_cols))

In [None]:
# Container type of the working frame.
print(type(df))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the target column (Result) from the feature matrix.
y=df['Result']
X=df.drop(columns=['Result'])

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# 5-fold stratified splitter with a fixed seed so the folds are reproducible.
skf=StratifiedKFold(n_splits=5, shuffle=True,random_state=15)

In [None]:
# Single scaler instance; it is re-fitted on the training part of every fold.
scaler=StandardScaler()

In [None]:
# Result table: one row per metric; each model later adds one column.
# The row labels (including the 'Precission' spelling) show up in printed output and plots.
metrics=pd.DataFrame(index=['Accuracy','Precission','Recall','F1_score','ROC_AUC'])

In [None]:
# Flat list later filled with (true labels, predictions) pairs, two entries per
# model, and read back by position in the confusion-matrix cells.
confussion_matrix_list=[]

In [None]:
# Hyper-parameters and score recorded for the random forest.
# NOTE(review): the tuning run that produced these values is not part of this
# notebook, and a later cell rebinds random_forest_best_params to a smaller
# configuration before the model is built — confirm which set is intended.
# The key is spelled 'bootstrap': the previous 'boostrap' is not a
# RandomForestClassifier argument and would raise TypeError when unpacked.
random_forest_best_params={'bootstrap':True,'criterion':'log_loss',
                          'max_depth':70,'max_features':'log2',
                          'n_estimators':500}
random_forest_best_score=0.9997873910277993

In [None]:
# Configuration actually used to build the random forest.
random_forest_best_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'bootstrap': True,  # Ensure this is spelled correctly
    # Add other parameters as needed
}

# Fix the forest's seed (same value as the CV splitter) so the reported scores
# are reproducible across runs; without it every run grows different trees.
random_forest = RandomForestClassifier(**random_forest_best_params, random_state=15)

# Per-fold scores; the cross-validation loop appends one value per fold.
Accuracy = []
Precission = []
Recall = []
F1_score = []
ROC_AUC = []


In [None]:
# Perform stratified k-fold cross-validation for the random forest.

# Start from empty per-fold score lists so re-running this cell does not
# accumulate scores from an earlier run.
Accuracy = []
Precission = []
Recall = []
F1_score = []
ROC_AUC = []

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply StandardScaler to numerical columns: fit on the training part only,
    # then reuse the fitted statistics on the held-out part (no leakage).
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

    # GEN is carried over unscaled. The copies above already contain it; these
    # assignments only make that explicit.
    # NOTE(review): presumably GEN is not in num_cols because num_cols was built
    # before GEN was label-encoded — confirm.
    X_train_scaled['GEN'] = X_train['GEN']
    X_test_scaled['GEN'] = X_test['GEN']

    # Train classifier
    random_forest.fit(X_train_scaled, y_train)

    # Predict and evaluate: hard labels for the threshold metrics,
    # positive-class probability for ROC AUC.
    random_forest_y_pred = random_forest.predict(X_test_scaled)
    random_forest_y_score = random_forest.predict_proba(X_test_scaled)[:, 1]

    Accuracy.append(accuracy_score(y_test, random_forest_y_pred))
    Precission.append(precision_score(y_test, random_forest_y_pred))
    Recall.append(recall_score(y_test, random_forest_y_pred))
    F1_score.append(f1_score(y_test, random_forest_y_pred))
    ROC_AUC.append(roc_auc_score(y_test, random_forest_y_score))
    print("-----------------------------------------------------------------------------------------------------")

# Keep the LAST fold's labels and predictions for the confusion-matrix cell, which
# reads them from positions 0 and 1. Slice assignment fills those positions on a
# first run (same result as two appends on an empty list) and overwrites them on
# a re-run, so the positions of the other models' entries never shift.
confussion_matrix_list[0:2] = [y_test, random_forest_y_pred]


In [None]:
# Store the mean of each score list as the "Random Forest" column, in the same
# order as the rows of `metrics`.
# NOTE(review): the means cover every value currently held in these shared
# lists, i.e. everything appended since they were last reset.
metrics["Random Forest"] = [
    np.mean(fold_scores)
    for fold_scores in (Accuracy, Precission, Recall, F1_score, ROC_AUC)
]

In [None]:
# Show the averaged scores stored for the random forest.
print(metrics["Random Forest"])

In [None]:
# Hyper-parameters and score recorded for the SVM.
# NOTE(review): the search that produced them is not part of this notebook.
svm_best_params = dict(C=15, kernel='rbf')
svm_best_score = 0.9000298490572322


In [None]:
# SVM
# probability=True enables predict_proba, which the ROC AUC computation needs.
# The probability calibration shuffles the data internally, so the seed is fixed
# (same value as the CV splitter) to make the probabilities reproducible.
svm = SVC(**svm_best_params, probability=True, random_state=15)
# Same configuration and seed as the splitter used for the random forest.
skf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

In [None]:
# Perform stratified k-fold cross-validation for the SVM.

# Reset the shared per-fold score lists. Without this they still hold the scores
# appended for the previous model, and the averages stored under "SVM" would mix
# in those folds instead of describing the SVM alone.
Accuracy = []
Precission = []
Recall = []
F1_score = []
ROC_AUC = []

for train_index, test_index in skf1.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply StandardScaler to numerical columns: fit on the training part only,
    # then reuse the fitted statistics on the held-out part (no leakage).
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

    # GEN is carried over unscaled. The copies above already contain it; these
    # assignments only make that explicit.
    X_train_scaled['GEN'] = X_train['GEN']
    X_test_scaled['GEN'] = X_test['GEN']

    # Train classifier
    svm.fit(X_train_scaled, y_train)

    # Predict and evaluate: hard labels for the threshold metrics,
    # positive-class probability for ROC AUC.
    svm_y_pred = svm.predict(X_test_scaled)
    svm_y_score = svm.predict_proba(X_test_scaled)[:, 1]

    Accuracy.append(accuracy_score(y_test, svm_y_pred))
    Precission.append(precision_score(y_test, svm_y_pred))
    Recall.append(recall_score(y_test, svm_y_pred))
    F1_score.append(f1_score(y_test, svm_y_pred))
    ROC_AUC.append(roc_auc_score(y_test, svm_y_score))
    print("-----------------------------------------------------------------------------------------------------")

# Keep the LAST fold's labels and predictions for the confusion-matrix cell, which
# reads them from positions 2 and 3. Slice assignment lands there when the list
# already holds the two random-forest entries and overwrites in place on a re-run;
# on a shorter list it behaves like two appends.
confussion_matrix_list[2:4] = [y_test, svm_y_pred]


In [None]:
# Store the mean of each score list as the "SVM" column, in the same order as
# the rows of `metrics`.
# NOTE(review): the means cover every value currently held in these shared
# lists, i.e. everything appended since they were last reset.
metrics["SVM"] = [
    np.mean(fold_scores)
    for fold_scores in (Accuracy, Precission, Recall, F1_score, ROC_AUC)
]

In [None]:
# Show the averaged scores stored for the SVM.
print(metrics["SVM"])

In [None]:
# Hyper-parameters and score recorded for logistic regression.
# NOTE(review): the search that produced them is not part of this notebook.
lr_best_params = {
    'C': 1000.0,
    'l1_ratio': 0.1,
    'max_iter': 850,
    'penalty': 'elasticnet',
    'solver': 'saga',
}
lr_best_score = 0.6462995456766315


In [None]:
# Logistic Regression
# class_weight="balanced" re-weights the classes by inverse frequency.
# The saga solver shuffles the data, so the seed is fixed (same value as the CV
# splitter) to make the fitted coefficients reproducible.
lr = LogisticRegression(**lr_best_params, class_weight="balanced", random_state=15)
skf3 = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# Reset the shared per-fold score lists. Without this they still hold the scores
# appended for the previous models, and the averages stored under
# "Logistic Regression" would mix in those folds.
Accuracy = []
Precission = []
Recall = []
F1_score = []
ROC_AUC = []

for train_index, test_index in skf3.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Apply StandardScaler to numerical columns: fit on the training part only,
    # then reuse the fitted statistics on the held-out part (no leakage).
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

    # GEN is carried over unscaled. The copies above already contain it; these
    # assignments only make that explicit.
    X_train_scaled['GEN'] = X_train['GEN']
    X_test_scaled['GEN'] = X_test['GEN']

    # Train classifier
    lr.fit(X_train_scaled, y_train)

    # Hard labels for the threshold metrics, positive-class probability for ROC AUC.
    lr_y_pred = lr.predict(X_test_scaled)
    lr_y_score = lr.predict_proba(X_test_scaled)[:, 1]

    Accuracy.append(accuracy_score(y_test, lr_y_pred))
    Precission.append(precision_score(y_test, lr_y_pred))
    Recall.append(recall_score(y_test, lr_y_pred))
    F1_score.append(f1_score(y_test, lr_y_pred))
    ROC_AUC.append(roc_auc_score(y_test, lr_y_score))
    print("-----------------------------------------------------------------------------------------------------")

# Keep the LAST fold's labels and predictions for the confusion-matrix cell, which
# reads them from positions 4 and 5. Slice assignment lands there when the list
# already holds the four earlier entries and overwrites in place on a re-run;
# on a shorter list it behaves like two appends.
confussion_matrix_list[4:6] = [y_test, lr_y_pred]
    

In [None]:
# Store the mean of each score list as the "Logistic Regression" column, in the
# same order as the rows of `metrics`.
# NOTE(review): the means cover every value currently held in these shared
# lists, i.e. everything appended since they were last reset.
metrics["Logistic Regression"] = [
    np.mean(fold_scores)
    for fold_scores in (Accuracy, Precission, Recall, F1_score, ROC_AUC)
]

In [None]:
# Show the averaged scores stored for logistic regression.
print(metrics["Logistic Regression"])

In [None]:
# Full comparison table: one row per metric, one column per model.
print(metrics)

In [None]:

# `metrics` is built with one row per metric and one column per model. Flip it
# once so each model becomes a group of bars. The guard keeps a re-run of this
# cell from flipping the table back.
if 'Accuracy' in metrics.index:
    metrics = metrics.transpose()

# Plot the bar chart. DataFrame.plot puts the index (models) on the x-axis and
# draws one bar series per column (metrics), so the axis label and legend title
# are named accordingly.
ax = metrics.plot(kind='bar', figsize=(10, 6))
ax.set_title('Model Performance Metrics')
ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.legend(title='Metrics')
ax.tick_params(axis='x', rotation=0)  # Keep the model names horizontal
# Save BEFORE plt.show(): in a notebook the figure is released once it has been
# shown, so a later plt.savefig would write an empty canvas.
ax.figure.savefig('performance_nonum.png')
plt.show()


In [None]:
# Same bar chart as the previous cell, with each bar labelled with its score in percent.
# DataFrame.plot creates its own figure when given figsize, so no separate
# plt.figure() call is made (it would only leave an empty extra figure behind).
ax = metrics.plot(kind='bar', figsize=(15, 8))

for bar in ax.patches:
    # Label centred horizontally on the bar and lifted 9 points above its top edge.
    ax.annotate(f"{bar.get_height()*100:.1f}",
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points')

ax.set_title('Model Performance Metrics')
ax.set_ylabel('Score')
ax.tick_params(axis='x', rotation=0)
# Save BEFORE plt.show(): in a notebook the figure is released once it has been
# shown, so a later plt.savefig would write an empty canvas.
ax.figure.savefig('performance_num.png')
plt.show()

In [None]:
# Confusion matrix for the random forest, built from the (labels, predictions)
# pair stored at positions 0 and 1 of confussion_matrix_list.
fig, ax = plt.subplots(figsize=(8, 4))
rf_cm = confusion_matrix(confussion_matrix_list[0], confussion_matrix_list[1])
rf_cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(rf_cm, annot=True, fmt="d", cmap=rf_cmap, ax=ax)
plt.title("Confusion Matrix Random Forest")

In [None]:
# Confusion matrix for the SVM, built from the (labels, predictions) pair
# stored at positions 2 and 3 of confussion_matrix_list.
fig, ax = plt.subplots(figsize=(8, 4))
svm_cm = confusion_matrix(confussion_matrix_list[2], confussion_matrix_list[3])
svm_cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(svm_cm, annot=True, fmt="d", cmap=svm_cmap, ax=ax)
plt.title("Confusion Matrix SVM")

In [None]:
# Confusion matrix for logistic regression, built from the (labels, predictions)
# pair stored at positions 4 and 5 of confussion_matrix_list.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(confusion_matrix(confussion_matrix_list[4], confussion_matrix_list[5]), 
            cmap=sns.diverging_palette(220, 10, as_cmap=True), fmt="d", ax=ax, annot=True)
# Title fixed: a space was missing between "Matrix" and the model name.
plt.title("Confusion Matrix Logistic Regression")

In [None]:
from sklearn.metrics import RocCurveDisplay

# Overlay the ROC curves of the three fitted models on one set of axes.
# X_test_scaled / y_test are whatever the most recently executed
# cross-validation cell left behind (its final fold).
disp=RocCurveDisplay.from_estimator(svm, X_test_scaled, y_test)
for fitted_model in (random_forest, lr):
    RocCurveDisplay.from_estimator(fitted_model, X_test_scaled, y_test, ax=disp.ax_)

# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.savefig('Auc_Roc.png')


In [None]:
# Select the random forest as the model to carry forward.
# NOTE(review): this is the estimator as last fitted inside the CV loop (training
# part of the final fold only), not a model refit on the full data — confirm
# that is intended.
best_model = random_forest