In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
#import matplotlib.pyplot as plt
import mlflow

%matplotlib inline

In [11]:
!pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [2]:
df = pd.read_csv('diabetes.csv')

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df['Glucose'] = df['Glucose'].replace(0, df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0, df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0, df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0, df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0, df['BMI'].mean())
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].replace(0, df['DiabetesPedigreeFunction'].mean())

In [4]:
df['BMI'] = df['BMI'].astype('int64')
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].astype('int64')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [6]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [7]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=.20, random_state=42)

In [8]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Scale the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Define parameter grid
param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 8, 10],
    'max_leaf_nodes': [10, 20, 30, 50, 70, 100, 120, 150],
    'max_features': [2, 3, 4, 5, 6, 7, 8]
}

# Start MLflow run
with mlflow.start_run(run_name="DecisionTree_GridSearch"):

    clf = DecisionTreeClassifier()
    grid = GridSearchCV(clf, param_grid=param, cv=10, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Get best estimator and evaluate
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    train_score = best_model.score(X_train, y_train)

    # Log all best parameters
    mlflow.log_params(grid.best_params_)

    # Log metrics
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("train_accuracy", train_score)
    mlflow.log_metric("best_cv_score", grid.best_score_)

    # Log the model
    mlflow.sklearn.log_model(best_model, artifact_path="decision_tree_model")

    print(f"Run ID: {mlflow.active_run().info.run_id}")
    print("Best Params:", grid.best_params_)
    print("Test Accuracy:", accuracy)




Run ID: 258d7de635d1401ca05b242689f53eb9
Best Params: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 6, 'max_leaf_nodes': 10}
Test Accuracy: 0.7207792207792207


In [16]:
import mlflow
import mlflow.sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Step 1: Split and scale the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# Step 3: Start MLflow run
with mlflow.start_run(run_name="SVM_GridSearch"):

    svm = SVC()
    svm.fit(X_train, y_train)

    best_model = svm.best_estimator_
    y_pred = svm.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    train_accuracy = best_model.score(X_train, y_train)
    cm = confusion_matrix(y_test, y_pred)


    # Log hyperparameters
    mlflow.log_params(grid.best_params_)

    # Log metrics
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("train_accuracy", train_accuracy)

    # Log model
    mlflow.sklearn.log_model(svm, artifact_path="svm_model")

    print(f"Run ID: {mlflow.active_run().info.run_id}")
    print("Best Params:", svm.best_params_)
    print("Test Accuracy:", accuracy)




Run ID: 4a27c2803971478cb27b3fdd85198f46
Best Params: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Test Accuracy: 0.7662337662337663
