In [137]:

#Current BestFit


import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Load the dataset
Liver_df = pd.read_csv('ILPD_Set.csv')
Liver_df['Selector'] = Liver_df['Selector'] - 1

# Liver_df = pd.concat([Liver_df] * 4, ignore_index=True)

# Step 2: Fill missing values in 'A/G Ratio' based on correlation with 'ALB'
def fill_agr_missing(row):
    if pd.isnull(row['A/G Ratio']):
        return row['ALB'] * correlation
    return row['A/G Ratio']

non_null_df = Liver_df.dropna(subset=['ALB', 'A/G Ratio'])
correlation = non_null_df['ALB'].corr(non_null_df['A/G Ratio'])
Liver_df['A/G Ratio'] = Liver_df.apply(fill_agr_missing, axis=1)

# Step 3: Encode gender to numeric values
Liver_df["Gender"] = Liver_df["Gender"].map({'Male': 0, 'Female': 1})

# Step 4: Identify and remove outliers using the IQR method
Q1 = Liver_df.quantile(0.10)
Q3 = Liver_df.quantile(0.90)
IQR = Q3 - Q1
outlier_step = 1.5
outliers = ((Liver_df < (Q1 - outlier_step * IQR)) | (Liver_df > (Q3 + outlier_step * IQR))).DB
df_clean = Liver_df[~outliers]

# Step 5: Impute missing values with median
imputer = SimpleImputer(strategy='median')
df_filled = pd.DataFrame(imputer.fit_transform(df_clean), columns=df_clean.columns)

# Step 6: Split features and target variable
Liver_df_expanded = df_filled.drop(["DB"], axis=1)
Liver_df_expanded.rename(columns={'Selector': 'Outcome'}, inplace=True)
y = Liver_df_expanded['Outcome'].copy()
X = Liver_df_expanded.drop('Outcome', axis=1)

# Step 7: Normalization/Standardization
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 8 Balance classes using SMOTE
X_balanced, y_balanced = SMOTE().fit_resample(X, y)

# Split the data into training, validation, and test sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Build the Random Forest model within a pipeline with StandardScaler and RandomForestClassifier
model = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, random_state=42))
])

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

# Train the model on the entire training set
model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_val = model.predict(X_val)

# Validation accuracy
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Evaluate the model on the test set
y_pred_test = model.predict(X_test)

# Test accuracy
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)


Cross-Validation Mean Accuracy: 0.7715131578947367
Validation Accuracy: 0.7610062893081762
Test Accuracy: 0.81875


In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
Liver_df = pd.read_csv('ILPD_Set.csv')
Liver_df['Selector'] = Liver_df['Selector'] - 1

# Step 2: Fill missing values in 'A/G Ratio' based on correlation with 'ALB'
def fill_agr_missing(row):
    if pd.isnull(row['A/G Ratio']):
        return row['ALB'] * correlation
    return row['A/G Ratio']

non_null_df = Liver_df.dropna(subset=['ALB', 'A/G Ratio'])
correlation = non_null_df['ALB'].corr(non_null_df['A/G Ratio'])
Liver_df['A/G Ratio'] = Liver_df.apply(fill_agr_missing, axis=1)

# Step 3: Encode gender to numeric values
Liver_df["Gender"] = Liver_df["Gender"].map({'Male': 0, 'Female': 1})

# Step 4: Identify and remove outliers using the IQR method
Q1 = Liver_df.quantile(0.10)
Q3 = Liver_df.quantile(0.90)
IQR = Q3 - Q1
outlier_step = 1.5
outliers = ((Liver_df < (Q1 - outlier_step * IQR)) | (Liver_df > (Q3 + outlier_step * IQR))).DB
df_clean = Liver_df[~outliers]

# Step 5: Impute missing values with median
imputer = SimpleImputer(strategy='median')
df_filled = pd.DataFrame(imputer.fit_transform(df_clean), columns=df_clean.columns)

# Step 6: Split features and target variable
Liver_df_expanded = df_filled.drop(["DB"], axis=1)
Liver_df_expanded.rename(columns={'Selector': 'Outcome'}, inplace=True)
y = Liver_df_expanded['Outcome'].copy()
X = Liver_df_expanded.drop('Outcome', axis=1)

# Step 7: Normalization/Standardization
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 8 Balance classes using SMOTE
X_balanced, y_balanced = SMOTE().fit_resample(X, y)

# Split the data into training, validation, and test sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define models with hyperparameter grids
models = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [10, 20, 30],
            "model__min_samples_leaf": [1, 2, 4],
            "model__min_samples_split": [2, 5, 10]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 5, 7],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
            "model__learning_rate": [0.01, 0.1, 0.5]
        }
    }
}

# Train and evaluate each model with hyperparameter tuning
for name, model_info in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model_info["model"])
    ])
    clf = GridSearchCV(pipeline, model_info["params"], cv=5, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    
    # Print best parameters
    print(f"{name} Best Parameters:", clf.best_params_)

    # Evaluate on validation set
    y_pred_val = clf.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    print(f"{name} Validation Accuracy:", accuracy_val)

    # Evaluate on test set
    y_pred_test = clf.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    print(f"{name} Test Accuracy:", accuracy_test)


Random Forest Best Parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 50}
Random Forest Validation Accuracy: 0.7672955974842768
Random Forest Test Accuracy: 0.7625
Gradient Boosting Best Parameters: {'model__learning_rate': 0.5, 'model__max_depth': 7, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Gradient Boosting Validation Accuracy: 0.8176100628930818
Gradient Boosting Test Accuracy: 0.825


In [111]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
Liver_df = pd.read_csv('ILPD_Set.csv')
Liver_df['Selector'] = Liver_df['Selector'] - 1

# Step 2: Fill missing values in 'A/G Ratio' based on correlation with 'ALB'
def fill_agr_missing(row):
    if pd.isnull(row['A/G Ratio']):
        return row['ALB'] * correlation
    return row['A/G Ratio']

non_null_df = Liver_df.dropna(subset=['ALB', 'A/G Ratio'])
correlation = non_null_df['ALB'].corr(non_null_df['A/G Ratio'])
Liver_df['A/G Ratio'] = Liver_df.apply(fill_agr_missing, axis=1)

# Step 3: Encode gender to numeric values
Liver_df["Gender"] = Liver_df["Gender"].map({'Male': 0, 'Female': 1})

# Step 4: Identify and remove outliers using the IQR method
Q1 = Liver_df.quantile(0.10)
Q3 = Liver_df.quantile(0.90)
IQR = Q3 - Q1
outlier_step = 1.5
outliers = ((Liver_df < (Q1 - outlier_step * IQR)) | (Liver_df > (Q3 + outlier_step * IQR))).DB
df_clean = Liver_df[~outliers]

# Step 5: Impute missing values with median
imputer = SimpleImputer(strategy='median')
df_filled = pd.DataFrame(imputer.fit_transform(df_clean), columns=df_clean.columns)

# Step 6: Split features and target variable
Liver_df_expanded = df_filled.drop(["DB"], axis=1)
Liver_df_expanded.rename(columns={'Selector': 'Outcome'}, inplace=True)
y = Liver_df_expanded['Outcome'].copy()
X = Liver_df_expanded.drop('Outcome', axis=1)

# Step 7: Normalization/Standardization
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 8 Balance classes using SMOTE
X_balanced, y_balanced = SMOTE().fit_resample(X, y)

# Split the data into training, validation, and test sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define Gradient Boosting model with specified hyperparameters
model = GradientBoostingClassifier(learning_rate=0.5, max_depth=7, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42)

# Build the model within a pipeline with StandardScaler
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', model)
])

# Train the model on the entire training set
pipeline.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_val = pipeline.predict(X_val)

# Validation accuracy
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Evaluate the model on the test set
y_pred_test = pipeline.predict(X_test)

# Test accuracy
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.8113207547169812
Test Accuracy: 0.8375
