In [None]:
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Read the thoracic-surgery data from a CSV file.
# The location defaults to the original author's local path; set the
# THORACIC_SURGERY_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "THORACIC_SURGERY_CSV",
    "C:\\Users\\91982\\Desktop\\Projects\\LIFE EXPECTANCY PREDICTION FOR POST THORACIC SURGERY - AI\\thoracic+surgery+data\\Thoracic_surgery_survival_prediction\\Data Preprocessing\\thoracic_surgery.csv",
)

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Summary statistics for the DataFrame's columns
df.describe()

In [None]:
# Split the patients by 1-year outcome: Death_In_1yr == 0 -> live, == 1 -> death.
live = df[df['Death_In_1yr'] == 0]
death = df[df['Death_In_1yr'] == 1]

# Attributes whose group means are compared between the two outcomes.
condition = ['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis', 'Dyspnoea', 'Cough', 'Weakness',
             'Tumor_Size', 'Diabetes_Mellitus', 'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Age']

# Mean of every attribute within each outcome group.
l = [np.mean(live[c]) for c in condition]
d = [np.mean(death[c]) for c in condition]

# Side-by-side table of the group means, indexed by attribute name.
ld = pd.DataFrame(
    data={'Attribute': condition, 'Live 1yr Mean': l, 'Death 1yr Mean': d})
ld = ld.set_index('Attribute')

print('Death: {:d}, Live: {:d}'.format(len(death), len(live)))
# The cohort size is taken from the data rather than hard-coded, so the
# message stays correct if the input file changes.
print("1 year death: {:.2f}% out of {:d} patients".format(
    np.mean(df.Death_In_1yr)*100, len(df)))


ld

In [None]:
# Count plots of Diagnosis, Tumor_Size and Performance, each split by 1-year outcome
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

# (column to count, seaborn palette) for each of the three stacked panels
panels = [('Diagnosis', 'Reds_d'), ('Tumor_Size', 'Greens_d'), ('Performance', 'Blues_d')]
for panel_ax, (column, palette) in zip(axes, panels):
    drawn = sns.countplot(x=column, hue='Death_In_1yr', data=df, palette=palette, ax=panel_ax)
    drawn.set_title(column, fontsize=18)

plt.tight_layout()

In [None]:

# Correlation coefficient between FVC and FEV1
# (off-diagonal entry of the 2x2 matrix returned by np.corrcoef)
np.corrcoef(df['FVC'], df['FEV1'])[0, 1]

In [None]:

# Correlation coefficient between Age and FVC
# (off-diagonal entry of the 2x2 matrix returned by np.corrcoef)
np.corrcoef(df['Age'], df['FVC'])[0, 1]

In [None]:

# Correlation coefficient between Age and FEV1
# (off-diagonal entry of the 2x2 matrix returned by np.corrcoef)
np.corrcoef(df['Age'], df['FEV1'])[0, 1]

In [None]:
# Pairwise correlation matrix of the DataFrame's columns
df.corr()

In [None]:
# Draw the correlation matrix as an annotated heat map so it is easier to read
heat_fig, heat_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, ax=heat_ax)
plt.show()

In [None]:
# Check the skewness of each column, i.e. how far it departs from a
# symmetric, bell-shaped (Gaussian-like) distribution:
#   negative = skewed to the left
#   positive = skewed to the right
#   closer to 0 = less skew
df.skew()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

<h2>Logistic Regression Model Creation</h2>

In [None]:
# Features: every column except the target and the MI_6mo / Asthma columns
X = df.drop(columns=['Death_In_1yr', 'MI_6mo', 'Asthma'])
# Target: the 1-year mortality flag
y = df['Death_In_1yr']

In [None]:
# Hold out 30% of the rows for testing; stratify keeps the class ratio
# the same in the train and test splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
    stratify=y,
)

# Baseline: logistic regression with scikit-learn's default settings
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)

In [None]:
# Predict the test-set labels, then round and flatten them into a 1-D array
y_pred = lr_model.predict(X_test).round().flatten()

In [None]:
# Confusion matrix of the test-set predictions (true labels vs. predicted labels)
cnf_matrix = confusion_matrix(Y_test, y_pred)
cnf_matrix

In [None]:
# Divide every row by its own total so each row sums to 1
row_totals = cnf_matrix.sum(axis=1, keepdims=True)
nor_cnf = cnf_matrix.astype('float') / row_totals
print("Normalized confusion matrix")
print(nor_cnf)

In [None]:
# Display names used for the two outcome labels in reports and plots
classes = ['live', 'death']
# Per-class precision / recall / F1 for the test-set predictions
print(classification_report(Y_test, y_pred, target_names=classes))

In [None]:


# Plot the normalized confusion matrix as a heat map with each cell's value overlaid.
plt.imshow(nor_cnf, interpolation='nearest', cmap=plt.cm.Blues)

for i in range(len(classes)):
    for j in range(len(classes)):
        # Matrix entry [i, j] is drawn at x=j, y=i so it lands on its own cell.
        plt.text(j, i, '{:.2f}'.format(
            nor_cnf[i, j]), horizontalalignment="center", color="black")


# The title names the model so this figure can be told apart from the
# later confusion-matrix plots (the previous title was misspelled and
# shared verbatim with the regularized model's plot).
plt.title('Logistic Regression (baseline)')
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Render explicitly so the cell does not echo the Text object returned by xlabel.
plt.show()

<h3>By analysing the confusion matrix we conclude that our model is overfitted, so we need feature scaling and regularization to overcome this error.</h3>

<h2>Logistic Regression Model Creation with Regularization, Feature Scaling and Pipelining</h2>

In [None]:
# Features: every column except the target and the MI_6mo / Asthma columns
X = df.drop(columns=['MI_6mo', 'Death_In_1yr', 'Asthma'])
# Target: the 1-year mortality flag
y = df['Death_In_1yr']

In [None]:
# Stratified 70/30 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.30, random_state=4, stratify=y
)

# Oversample with SMOTE; only the training split is resampled, the test
# split is left untouched.
smote = SMOTE(random_state=4)
x_train_resamp, y_train_resamp = smote.fit_resample(X_train, Y_train)



In [None]:
# Logistic regression with an explicit L2 penalty
lr_model = LogisticRegression(penalty='l2', C=1.0, random_state=4)

# Preprocessing steps: standardize the features with StandardScaler
preprocessing = [('scaler', StandardScaler())]
preprocessor = Pipeline(steps=preprocessing)

# Full model: the preprocessing sub-pipeline followed by the classifier
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', lr_model),
    ]
)

In [None]:
# Train on the SMOTE-resampled training data. Fitting on the raw
# (X_train, Y_train) split would leave the oversampling step unused and
# train the model on the original, imbalanced class distribution.
model.fit(x_train_resamp, y_train_resamp)

In [None]:
# Hard class predictions for the threshold-based metrics
y_pred = model.predict(X_test)
# Predicted probability of label 1. ROC AUC measures how well the scores
# rank the samples, so it is computed from probabilities rather than from
# the already-thresholded 0/1 predictions.
y_score = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
roc_auc = roc_auc_score(Y_test, y_score)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

In [None]:
# Confusion matrix of the pipeline's test-set predictions (true vs. predicted labels)
cnf_matrix = confusion_matrix(Y_test, y_pred)
cnf_matrix

In [None]:
# Divide every row by its own total so each row sums to 1
row_totals = cnf_matrix.sum(axis=1, keepdims=True)
nor_cnf = cnf_matrix.astype('float') / row_totals
print("Normalized confusion matrix")
print(nor_cnf)

In [None]:
# Display names used for the two outcome labels in reports and plots
classes = ['live', 'death']
# Per-class precision / recall / F1 for the test-set predictions
print(classification_report(Y_test, y_pred, target_names=classes))

In [None]:


# Plot the normalized confusion matrix as a heat map with each cell's value overlaid.
plt.imshow(nor_cnf, interpolation='nearest', cmap=plt.cm.Blues)

for i in range(len(classes)):
    for j in range(len(classes)):
        # Matrix entry [i, j] is drawn at x=j, y=i so it lands on its own cell.
        plt.text(j, i, '{:.2f}'.format(
            nor_cnf[i, j]), horizontalalignment="center", color="black")


# The title identifies the scaled, L2-regularized pipeline so this figure is
# distinguishable from the first logistic-regression plot (the previous
# title was misspelled and identical for both models).
plt.title('Logistic Regression (scaled, L2-regularized)')
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Render explicitly so the cell does not echo the Text object returned by xlabel.
plt.show()

In [None]:
from joblib import dump
# Persist the fitted pipeline (preprocessor + classifier) to a file in the
# current working directory.
dump(model, 'logisticregration_model.joblib')

<h1>Random Forest Classification</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier

<h2>Hypothesis Testing</h2>


In [None]:
def permutation_sample(data1, data2):
    """Pool two data sets, shuffle the pool, and split it back in two.

    Returns a pair of arrays: the first has ``len(data1)`` elements, the
    second holds the remaining elements of the shuffled pool.
    """
    split_at = len(data1)
    pooled = np.concatenate((data1, data2))
    shuffled = np.random.permutation(pooled)
    return shuffled[:split_at], shuffled[split_at:]


def draw_perm_reps(data_1, data_2, func, size=1):
    """Compute ``size`` permutation replicates of a two-sample statistic.

    Each replicate is ``func`` applied to a fresh permutation sample of the
    two data sets. Returns a NumPy array of length ``size``.
    """
    replicates = np.empty(size)
    for idx in range(size):
        replicates[idx] = func(*permutation_sample(data_1, data_2))
    return replicates


def diff_of_means(data_1, data_2):
    """Return the mean of ``data_1`` minus the mean of ``data_2``."""
    mean_1 = np.mean(data_1)
    mean_2 = np.mean(data_2)
    return mean_1 - mean_2

In [None]:
# Hypothesis testing with permutations of the data.
# For every attribute the test statistic is the difference of means
# (death group minus live group), compared against 10000 permutation replicates.
condition = ['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis', 'Dyspnoea', 'Cough', 'Weakness',
             'Tumor_Size', 'Diabetes_Mellitus', 'MI_6mo', 'PAD', 'Smoking', 'Asthma', 'Age']

# Seed NumPy's global RNG (used by np.random.permutation) so the p-values
# are reproducible from run to run.
np.random.seed(42)

p_values = []
for c in condition:
    empirical_diff_means = diff_of_means(death[c], live[c])
    perm_replicates = draw_perm_reps(
        death[c], live[c], diff_of_means, size=10000)
    # One-sided p-value in the direction of the observed difference: the
    # fraction of replicates at least as extreme as the empirical value.
    if empirical_diff_means > 0:
        at_least_as_extreme = perm_replicates >= empirical_diff_means
    else:
        at_least_as_extreme = perm_replicates <= empirical_diff_means
    p = np.sum(at_least_as_extreme) / len(perm_replicates)
    p_values.append(p)

print(list(zip(condition, p_values)))

In [None]:
# Restrict the feature matrix to five selected attributes
X = df.loc[:, ['Performance', 'Dyspnoea', 'Cough', 'Tumor_Size', 'Diabetes_Mellitus']]
# Target: the 1-year mortality flag
y = df.loc[:, 'Death_In_1yr']

In [None]:
# Stratified 70/30 split of the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1111, stratify=y)
# Keep the capitalised label names in sync with this split, so any cell that
# reads Y_train / Y_test sees labels belonging to the same rows as
# X_train / X_test. Labels left over from an earlier split have the same
# length but describe different patients, which silently corrupts training
# and evaluation.
Y_train, Y_test = y_train, y_test

rfc_model = RandomForestClassifier(class_weight='balanced', random_state=1111)
# Fit on the labels produced by the split above.
rfc_model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the threshold-based metrics
y_pred = rfc_model.predict(X_test)
# Predicted probability of label 1, used as the ranking score for ROC AUC
y_score = rfc_model.predict_proba(X_test)[:, 1]

# Model evaluation against y_test, the labels produced by the same split as
# X_test (a Y_test left over from an earlier split refers to different rows).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

In [None]:
# Confusion matrix of the random forest's predictions, computed against
# y_test (the labels from the same split as X_test).
cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cnf_matrix)

In [None]:
# Normalized confusion matrix: divide every row by its own total
row_totals = cnf_matrix.sum(axis=1, keepdims=True)
nor_cnf = cnf_matrix.astype('float') / row_totals
print("Normalized Confusion Matrix:")
print(nor_cnf)

In [None]:
# Classification report for the random forest, computed against y_test
# (the labels from the same split as X_test).
classes = ['live', 'death']
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=classes))

In [None]:
# Heat map of the normalized confusion matrix with each cell's value overlaid
plt.imshow(nor_cnf, interpolation='nearest', cmap=plt.cm.Blues)

n_classes = len(classes)
for row, col in np.ndindex(n_classes, n_classes):
    # Matrix entry [row, col] is drawn at x=col, y=row
    cell_text = format(nor_cnf[row, col], '.2f')
    plt.text(col, row, cell_text, horizontalalignment="center", color="black")

plt.title('Random Forest Classifier')
plt.colorbar()

# Label both axes with the class names
tick_marks = np.arange(n_classes)
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
from joblib import dump
# Persist the fitted random forest. `model` is the logistic-regression
# pipeline, so dumping it here would write the wrong estimator under the
# random-forest file name. The file name itself is left unchanged in case
# other code loads it by that name.
dump(rfc_model, 'rendomforest_model.joblib')

<h1>Model testing</h1>

In [None]:
import pandas as pd
from joblib import load

# Load the persisted logistic-regression pipeline.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
# NOTE(review): this absolute path is machine-specific, and it is not the same
# location the notebook's dump(...) call writes to (a relative file name) -
# confirm this is the intended, up-to-date artifact.
model = load("C:\\Users\\91982\\Desktop\\Projects\\LIFE EXPECTANCY PREDICTION FOR POST THORACIC SURGERY - AI\\thoracic+surgery+data\\logisticregration_model.joblib")

# A single patient record to score, one entry per feature
original_data = {
    'Diagnosis': 3,
    'FVC': 3.8,
    'FEV1': 2.8,
    'Performance': 0,
    'Pain': 0,
    'Haemoptysis': 0,
    'Dyspnoea': 0,
    'Cough': 0,
    'Weakness': 0,
    'Tumor_Size': 4,
    'Diabetes_Mellitus': 0,
    'PAD': 0,
    'Smoking': 0,
    'Age': 60
}

# Alternative record for manual experimentation (swap into original_data):
#     'Diagnosis': 1,
#     'FVC': 0,
#     'FEV1': 0,
#     'Performance': 0,
#     'Pain': 1,
#     'Haemoptysis': 1,
#     'Dyspnoea': 1,
#     'Cough': 1,
#     'Weakness': 1,
#     'Tumor_Size': 0,
#     'Diabetes_Mellitus': 1,
#     'PAD': 1,
#     'Smoking': 1,
#     'Age': 30

# One-row DataFrame whose columns are the feature names above
input_df = pd.DataFrame([original_data])

# Score the record with the loaded model
predictions = model.predict(input_df)

# Translate numeric labels to names: 0 -> 'live', anything else -> 'death'
predicted_class = ['death' if pred != 0 else 'live' for pred in predictions]

# Show the result
print("Predicted class for the input data:", predicted_class)