In [8]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
#import plotly.express as px
from xgboost import XGBClassifier
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split, KFold


In [9]:
# Path to the raw student-performance CSV, relative to the notebook's working
# directory. The space before the final underscore is part of the literal
# (presumably matching the file name on disk -- do not "tidy" it).
source_data = "data/Student_performance_data _.csv"

In [10]:
def data_load(source_file, unique_maxcount):
    """Read a CSV into a DataFrame and cast its high-cardinality columns to float64.

    A column is treated as numeric when it holds more than ``unique_maxcount``
    distinct values (counted with ``Series.unique``). The first such column, in
    file order, is left untouched on the assumption that it is an identifier;
    every other numeric column is converted to ``float64``.

    Parameters
    ----------
    source_file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    unique_maxcount : int
        Cardinality threshold; columns with at most this many distinct values
        are considered categorical and are not converted.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the selected columns cast to ``float64``.
    """
    frame = pd.read_csv(source_file)

    # Columns whose number of distinct values exceeds the threshold, in file order.
    high_cardinality = [
        name
        for name in frame.columns
        if len(frame[name].unique()) > unique_maxcount
    ]

    # Skip the leading high-cardinality column (assumed to be an ID).
    to_float = high_cardinality[1:]

    frame[to_float] = frame[to_float].astype('float64')
    return frame

def features(df, target='GradeClass', drop_columns=('GPA', 'StudentID', 'Age'),
             test_size=0.2, random_state=42):
    """Split a DataFrame into stratified train/test feature matrices and targets.

    The defaults reproduce the original hard-coded behaviour: ``GradeClass`` is
    the target, and ``GPA``, ``StudentID`` and ``Age`` are removed from the
    feature matrix together with the target itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``target`` and every name in ``drop_columns``
        (``DataFrame.drop`` raises ``KeyError`` otherwise).
    target : str, default 'GradeClass'
        Name of the column to predict.
    drop_columns : iterable of str, default ('GPA', 'StudentID', 'Age')
        Additional columns excluded from the features. GPA is presumably
        excluded because the grade class is derived from it -- TODO confirm.
    test_size : float, default 0.2
        Fraction of rows held out for testing (80% train / 20% test by default).
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    excluded = [target, *drop_columns]
    X = df.drop(columns=excluded)
    y = df[target]

    # Stratify on the target so class proportions are kept in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


In [11]:

# !pip install mlflow


# !pip install mlflow[pipelines] 
# !conda install -c conda-forge mlflow-pipelines

# !pip install mlflow[extras]

# !pip install mlflow-skinny

In [12]:
import mlflow

In [13]:
def prepare_model(
    model_path="/workspaces/mlops-zoomcamp/env1/02-experiment-tracking/models/XBGClassifier.bin",
    saved_model_dir="my_mode3",
):
    """Train baseline classifiers, tune an SVC by grid search and track it in MLflow.

    Steps:
      1. Load the data from ``source_data`` and split it via ``features``.
      2. Fit XGBoost, RandomForest and SVC baselines and print their test accuracy.
      3. Pickle the fitted XGBoost baseline to ``model_path``.
      4. Inside one MLflow run: grid-search an RBF SVC with repeated stratified
         cross-validation, log the best score and hyper-parameters, attach the
         XGBoost pickle as an artifact and save the tuned SVC to
         ``saved_model_dir``.

    Parameters
    ----------
    model_path : str
        Destination of the pickled XGBoost baseline. Defaults to the path that
        was previously hard-coded; its parent directory is created if missing.
    saved_model_dir : str
        Directory handed to ``mlflow.sklearn.save_model`` for the tuned SVC.

    Returns
    -------
    None
    """
    import os  # local import so the block does not depend on the notebook's import cell

    X_train, X_test, y_train, y_test = features(data_load(source_data, 5))

    # Baselines share the same fit / score / report routine. RandomForest is
    # seeded so the printed accuracy is reproducible between runs.
    baselines = {
        "XGBoost": XGBClassifier(),
        "RandomForest": RandomForestClassifier(random_state=42),
        "Support Vector Machine": SVC(),
    }
    for label, estimator in baselines.items():
        estimator.fit(X_train, y_train)
        score = estimator.score(X_test, y_test)
        print(f"{label} accuracy: {score:.2f}")

    # Persist the fitted XGBoost baseline; make sure the target directory exists
    # so a fresh checkout does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    with open(model_path, "wb") as f_out:
        pickle.dump(baselines["XGBoost"], f_out)

    with mlflow.start_run():
        mlflow.set_tag("developer", "Kasi")
        # Log the file that was actually loaded rather than an unrelated literal.
        mlflow.log_param("train-data-path", os.path.abspath(source_data))

        # Evaluation scheme: 10-fold stratified CV repeated 3 times.
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

        # Search space for the RBF kernel.
        grid = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf'],
        }

        search = GridSearchCV(estimator=SVC(), param_grid=grid, cv=cv, scoring='accuracy', n_jobs=-1)
        result = search.fit(X_train, y_train)

        best_score = result.best_score_
        best_hyperparameters = result.best_params_

        print('Best Score:', best_score)
        print('Best Hyperparameters:', best_hyperparameters)

        mlflow.log_metric("best_score", float(best_score))
        for param_name, param_value in best_hyperparameters.items():
            mlflow.log_param(param_name, param_value)

        # NOTE: this artifact is the XGBoost baseline, while the logged
        # parameters and metric describe the SVC grid search.
        mlflow.log_artifact(local_path=model_path, artifact_path="model")

        # Save the tuned, refitted SVC. The original saved the unfitted SVC()
        # template, which cannot predict after loading.
        # NOTE: save_model refuses to write into an existing non-empty
        # directory, so ``saved_model_dir`` must be fresh for each run.
        mlflow.sklearn.save_model(result.best_estimator_, saved_model_dir)
    
    

In [14]:
# Set up MLflow: runs are stored in a local SQLite database next to the notebook.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Select the experiment by name (created on first use) and read its ID from the
# returned Experiment object. The previous hard-coded experiment_id="5" only
# exists in one particular tracking database: on a fresh database it makes
# set_experiment fail, and elsewhere it silently overrides the named experiment.
experiment = mlflow.set_experiment("student-perform-experiment1")
experiment_id = experiment.experiment_id

prepare_model()




XGBoost accuracy: 0.67
RandomForest accuracy: 0.71
Support Vector Machine accuracy: 0.72


Best Score: 0.7586651032577082
Best Hyperparameters: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
