In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
import numpy as np

In [18]:

# Load the cleaned data set from the notebook's working directory.
dt = pd.read_csv("./dataCleaned.csv")

# Chronological split on "season": 2015 up to (not including) 2022 is used for
# fitting, 2022 onwards is held out for evaluation.
train = dt.loc[(dt["season"] >= 2015) & (dt["season"] < 2022)]
test = dt.loc[dt["season"] >= 2022]

# "Result_code" is the target; it and "Unnamed: 0" are removed from the features.
y_train = train["Result_code"]
y_test = test["Result_code"]
x_train = train.drop(columns=["Result_code", "Unnamed: 0"])
x_test = test.drop(columns=["Result_code", "Unnamed: 0"])

## Random Forest

In [None]:

with mlflow.start_run():
    # Hyper-parameters live in one dict so the values handed to the estimator
    # and the values recorded in MLflow cannot drift apart.
    rf_params = {"n_estimators": 50, "min_samples_split": 10}

    # Fit on the training seasons, evaluate on the held-out seasons.
    rf = RandomForestClassifier(random_state=1, **rf_params)
    rf.fit(x_train, y_train)
    predict_rf = rf.predict(x_test)
    acc_rf = accuracy_score(y_test, predict_rf)
    print(classification_report(y_test, predict_rf))

    # Record the configuration and the hold-out accuracy for this run.
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("accuracy", acc_rf)


## SVM

In [None]:

with mlflow.start_run():
    # Keep the hyper-parameters in one place so the logged values always match
    # what the estimator was built with.
    svm_params = {"kernel": "linear", "C": 1.0}

    # Fit a linear-kernel SVM on the training seasons and score the held-out ones.
    svm = SVC(**svm_params)
    svm.fit(x_train, y_train)
    pred_svm = svm.predict(x_test)
    acc_svm = accuracy_score(y_test, pred_svm)
    print(classification_report(y_test, pred_svm))

    # Log the configuration alongside the metric; without the parameters the
    # run cannot be told apart from other SVM runs in the tracking UI.
    mlflow.log_params(svm_params)
    mlflow.log_metric("accuracy", acc_svm)


## XGBoost

In [None]:


with mlflow.start_run():
    # Exhaustive search over a small XGBoost grid with 3-fold cross-validation.
    model = XGBClassifier()
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'n_estimators': [50, 100, 200]
    }
    # The target is a class label, so candidates are ranked by classification
    # accuracy. A regression score such as negative MSE would treat the encoded
    # class ids as numeric distances and can select the wrong model.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1,
    )
    grid_search.fit(x_train, y_train)

    # Evaluate the refit best estimator on the held-out seasons.
    best_model = grid_search.best_estimator_
    test_predictions = best_model.predict(x_test)
    acc_xgb = accuracy_score(y_test, test_predictions)
    print(classification_report(y_test, test_predictions))

    # Record the winning configuration together with its cross-validated and
    # hold-out accuracy so this run is comparable with the other models.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("cv_accuracy", grid_search.best_score_)
    mlflow.log_metric("accuracy", acc_xgb)
   






## LightGBM

In [None]:
with mlflow.start_run():
    mlflow.autolog()

    # Booster configuration for a 3-class multiclass objective.
    params = dict(
        objective='multiclass',
        num_class=3,
        metric='multi_logloss',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
    )
    num_round = 100

    # Wrap the frames in LightGBM datasets; the held-out set is passed as the
    # validation set and shares the training set's reference.
    train_data = lgb.Dataset(x_train, label=y_train)
    test_data = lgb.Dataset(x_test, label=y_test, reference=train_data)
    model = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data])

    # NOTE(review): no early-stopping callback is passed to lgb.train above —
    # confirm that best_iteration is meaningful here.
    y_pred_prob = model.predict(x_test, num_iteration=model.best_iteration)
    # One probability column per class; the predicted label is the column index
    # with the highest probability.
    predGBM = y_pred_prob.argmax(axis=1)

    accuracy = accuracy_score(y_test, predGBM)
    print(classification_report(y_test, predGBM))

    # Explicitly record the two main knobs and the hold-out accuracy.
    mlflow.log_param("num_leaves", params["num_leaves"])
    mlflow.log_param("learning_rate", params["learning_rate"])
    mlflow.log_metric("accuracy", accuracy)

## CatBoost

In [None]:
with mlflow.start_run():
    mlflow.autolog()

    # The model is trained on string labels, so the held-out labels are cast
    # the same way before being compared with the predictions.
    y_test_str = y_test.astype(str)

    model = CatBoostClassifier(
        loss_function='MultiClass',
        iterations=100,
        depth=6,
        learning_rate=0.1,
    )
    model.fit(x_train, y_train.astype(str), verbose=10)
    predCat = model.predict(x_test)

    accuracy = accuracy_score(y_test_str, predCat)
    print(classification_report(y_test_str, predCat))

    # Record the hold-out accuracy for this run.
    mlflow.log_metric("accuracy", accuracy)