In [1]:
import pandas as pd
import snowflake.connector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
import optuna
import joblib

In [4]:

# Step 1: Connect to Snowflake and Read Data
def read_data_from_snowflake():
    """Load the whole ``sports_data_final`` table from Snowflake into a DataFrame.

    Credentials are taken from the environment instead of being stored in the
    notebook:

    * ``SNOWFLAKE_USER`` and ``SNOWFLAKE_PASSWORD`` are required.
    * ``SNOWFLAKE_ACCOUNT``, ``SNOWFLAKE_WAREHOUSE``, ``SNOWFLAKE_DATABASE`` and
      ``SNOWFLAKE_SCHEMA`` are optional and fall back to the values this
      notebook has always used.

    Returns:
        pandas.DataFrame: the result of ``SELECT * FROM sports_data_final``.

    Raises:
        RuntimeError: if a required credential variable is missing or empty.
    """
    import os  # local import so this cell does not depend on the import cell changing

    # SECURITY: a password used to be committed in this cell; it should be
    # rotated, since it remains visible in the notebook's history.
    required = ("SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing Snowflake credentials; set environment variable(s): "
            + ", ".join(missing)
        )

    conn = snowflake.connector.connect(
        user=os.environ["SNOWFLAKE_USER"],
        password=os.environ["SNOWFLAKE_PASSWORD"],
        account=os.environ.get("SNOWFLAKE_ACCOUNT", 'yw44584.ap-southeast-1'),
        warehouse=os.environ.get("SNOWFLAKE_WAREHOUSE", 'COMPUTE_WH'),
        database=os.environ.get("SNOWFLAKE_DATABASE", 'CAPSTONE_PROJECT'),
        schema=os.environ.get("SNOWFLAKE_SCHEMA", 'SPORTS'),
    )
    try:
        query = "SELECT * FROM sports_data_final;"
        sports_data = pd.read_sql(query, conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return sports_data




In [5]:
# Step 2: Prepare Data
def prepare_data(data, drop_score_columns=False):
    """Split the raw match table into model features and a ``Winner`` target.

    The target is derived from the two score columns:
    ``1`` when ``Team1_Score`` is greater, ``0`` when ``Team2_Score`` is greater,
    and ``2`` otherwise (a draw, or scores that cannot be ordered such as NaN).

    The input frame is left untouched, so re-running the cell is idempotent.

    Args:
        data: DataFrame containing at least ``MatchID``, ``Date``, ``Location``,
            ``PlayerName``, ``Team1_Score`` and ``Team2_Score``.
        drop_score_columns: when True, ``Team1_Score`` and ``Team2_Score`` are
            removed from the features as well. Defaults to False, which keeps
            the historical feature set.

    Returns:
        tuple: ``(features, target)`` where ``target`` is an int64 Series named
        ``Winner`` aligned with ``data.index``.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    # NOTE(review): Snowflake upper-cases unquoted identifiers; confirm the table
    # really exposes these mixed-case column names.
    team1_score = data['Team1_Score']
    team2_score = data['Team2_Score']

    # Vectorised labelling: start from "draw" and overwrite the decided matches.
    target = (
        pd.Series(2, index=data.index, name='Winner', dtype='int64')
        .mask(team1_score > team2_score, 1)
        .mask(team1_score < team2_score, 0)
    )

    # NOTE(review): the target is a deterministic function of the two score
    # columns, so leaving them in the features leaks the label. Pass
    # drop_score_columns=True to train on pre-match information only.
    dropped = ['MatchID', 'Date', 'Location', 'PlayerName']
    if drop_score_columns:
        dropped += ['Team1_Score', 'Team2_Score']

    features = (
        data.drop(columns=dropped)
        # A pre-existing 'Winner' column (e.g. from an earlier run) is a label, not a feature.
        .drop(columns=['Winner'], errors='ignore')
    )
    return features, target

In [6]:

# Step 3: Train and Optimize Models with Optuna
def _build_model(params, random_state=42):
    """Instantiate the (unfitted) classifier described by an Optuna parameter dict."""
    model_name = params["model"]

    if model_name == "LogisticRegression":
        return LogisticRegression(C=params["C"], max_iter=1000)

    if model_name == "RandomForest":
        return RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=random_state,
        )

    if model_name == "XGBoost":
        return XGBClassifier(
            n_estimators=params["n_estimators"],
            learning_rate=params["learning_rate"],
            use_label_encoder=False,
            eval_metric='mlogloss',
            random_state=random_state,
        )

    raise ValueError(f"Unknown model type: {model_name!r}")


def objective(trial, X_train, y_train, X_valid, y_valid, random_state=42):
    """Optuna objective: sample a model family and its hyperparameters, return validation accuracy.

    Args:
        trial: the Optuna trial used to sample the search space.
        X_train, y_train: data the candidate model is fitted on.
        X_valid, y_valid: data the candidate model is scored on.
        random_state: seed passed to the stochastic estimators so a trial's
            score can be reproduced when the same parameters are rebuilt.

    Returns:
        float: accuracy of the fitted candidate on the validation data.

    Raises:
        ValueError: if the sampled model name is not one of the supported families.
    """
    params = {
        "model": trial.suggest_categorical(
            "model", ["LogisticRegression", "RandomForest", "XGBoost"]
        )
    }

    if params["model"] == "LogisticRegression":
        params["C"] = trial.suggest_float("C", 0.01, 10.0)
    elif params["model"] == "RandomForest":
        params["n_estimators"] = trial.suggest_int("n_estimators", 10, 200)
        params["max_depth"] = trial.suggest_int("max_depth", 2, 20)
    elif params["model"] == "XGBoost":
        params["n_estimators"] = trial.suggest_int("n_estimators", 10, 200)
        params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3)

    model = _build_model(params, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    return accuracy


def train_optimize_models(X, y, n_trials=30, random_state=42):
    """Search model families with Optuna and refit the best configuration.

    Args:
        X: feature matrix.
        y: target labels.
        n_trials: number of Optuna trials to run (default 30, as before).
        random_state: seed shared by the train/validation split, the Optuna
            sampler and the stochastic estimators, so a rerun gives the same
            study and the refitted model matches the winning trial.

    Returns:
        tuple: ``(best_model, best_params)`` - the refitted estimator and the
        parameter dict reported by the study.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Seed the sampler; an unseeded study proposes different trials on every run.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(
        lambda trial: objective(
            trial, X_train, y_train, X_valid, y_valid, random_state=random_state
        ),
        n_trials=n_trials,
    )

    # Retrieve and train the best model
    best_params = study.best_params
    print("Best parameters found by Optuna:", best_params)

    best_model = _build_model(best_params, random_state=random_state)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_valid)

    # NOTE: this score is measured on the same split that guided the search, so
    # it is an optimistic estimate of performance on unseen data.
    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy of best model: {accuracy:.2f}")

    return best_model, best_params

In [None]:
# Step 4: Save the Model
def save_model(model, filename):
    """Serialize ``model`` to ``filename`` with joblib.

    When ``filename`` is a path, its parent directory is created first so that
    a missing folder does not discard a freshly trained model with a
    ``FileNotFoundError``.

    Args:
        model: the object to persist.
        filename: destination passed to ``joblib.dump``.
    """
    import os  # local import so this cell does not depend on the import cell changing

    # Only paths have a parent directory; anything else is handed to joblib untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, filename)
    print(f"Model saved as {filename}")


In [None]:
# Step 5: Point Prediction
def point_prediction(model, new_data):
    """Return whatever ``model.predict`` yields for ``new_data``."""
    return model.predict(new_data)

In [None]:
# Main Execution
if __name__ == "__main__":
    # 1. Pull the raw match table from Snowflake.
    sports_data = read_data_from_snowflake()

    # 2. Derive the feature matrix and the Winner labels.
    X, y = prepare_data(sports_data)

    # 3. Run the Optuna search and keep the refitted winner.
    best_model, best_params = train_optimize_models(X, y)

    # 4. Persist the winning model to disk.
    save_model(best_model, "models/best_winner_model.sav")

    # 5. Smoke-test inference on the first row; swap in real unseen data here.
    new_data = X.head(1)
    prediction = point_prediction(best_model, new_data)
    print(f"Predicted Winner: {prediction}")