In [2]:
# import only required libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# function to apply log transformation technique on dataset
def apply_log_transformation(df_original):
    """Return a copy of ``df_original`` with its numeric columns log-transformed.

    Every strictly positive value ``v`` in a numeric column becomes
    ``np.log(v)``; zeros, negatives and NaN become ``0``. Non-numeric columns
    (strings, dates, booleans, ...) are passed through unchanged instead of
    raising ``TypeError`` on the ``value > 0`` comparison.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index; transformed columns
        hold floating-point values.
    """
    df = df_original.copy()  # never mutate the caller's frame
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        # Swap non-positive / NaN entries for 1 before taking the log:
        # log(1) == 0 reproduces the "else 0" fallback, and the log is never
        # evaluated on zero or negative input (no runtime warnings).
        df[column] = np.log(values.where(values > 0, 1))
    return df

# function to scale an entire dataset, returns only numeric columns scaled
def standard_scale_dataset(df):
    """Standard-scale the numeric columns of ``df`` and return them as a new frame.

    The columns to scale are whatever ``get_numerical_columns(df)`` reports
    (that helper is defined outside this block). A fresh ``StandardScaler``
    is fit on the data passed in on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; columns not selected by the helper are dropped from
        the result.

    Returns
    -------
    pandas.DataFrame
        Scaled values under the selected column names. The frame is rebuilt
        from a bare array, so it carries a default index rather than
        ``df``'s index.
    """
    numeric_frame = df[get_numerical_columns(df)]

    # NOTE(review): the scaler is fit on this frame itself, not restored from
    # a previously fitted one - confirm that is intended when scoring new data.
    scaled_values = StandardScaler().fit_transform(numeric_frame.to_numpy())

    return pd.DataFrame(scaled_values, columns=numeric_frame.columns.to_list())

# function to extract polynomial features from a dataset
def extract_polynomial_features(df, degree=2, test_size=0.3):
    """Expand ``df`` into polynomial features of the given degree.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Feature matrix to expand (there is no target involved).
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    test_size : float, default 0.3
        Accepted for interface compatibility; the body does not use it.

    Returns
    -------
    pandas.DataFrame
        The expanded features (no bias column, interaction and power terms
        both included), wrapped in a frame without explicit column labels
        or index.
    """
    expander = PolynomialFeatures(
        degree=degree,
        include_bias=False,
        interaction_only=False,
    )
    expanded = expander.fit_transform(df)
    return pd.DataFrame(expanded)

# function to load a model from a pickle file
def load_model(path):
    """Load and return the object pickled at ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file.

    Returns
    -------
    object
        Whatever object was serialized into the file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``open``).
    """
    # SECURITY: unpickling can execute code embedded in the file; only load
    # model files that come from a trusted source.
    # The context manager closes the handle even if unpickling fails, instead
    # of leaving the file open until garbage collection.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

In [3]:
def make_prediction(df, model_path):
    """Run the preprocessing pipeline on ``df`` and predict with a stored model.

    Steps, in order: log transformation, standard scaling (which keeps only
    the numeric columns), degree-2 polynomial feature expansion, then
    ``predict`` on the model loaded from ``model_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        New data to score.
    model_path : str
        Path of the pickled model handed to ``load_model``.

    Returns
    -------
    The value returned by the loaded model's ``predict`` method.
    """
    logged = apply_log_transformation(df)
    scaled = standard_scale_dataset(logged)
    poly_features = extract_polynomial_features(scaled, degree=2)

    estimator = load_model(model_path)
    return estimator.predict(poly_features)

# Load the new data to score into a dataframe. The path is relative, so
# 'test.csv' must exist in the kernel's working directory.
df_test = pd.read_csv('test.csv')

# Pickled model to load for scoring (relative path, same caveat as above).
model_path = '3_GradientBoostingClassifier.pkl'

# Run the full pipeline; as the last expression of the cell, the returned
# predictions are shown as the cell output.
make_prediction(df_test, model_path)

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

In [18]:
# Scratch cell kept for reference only (original author's note: "ignore this").
# It holds three unassigned lists of column names; none of the code in the
# cells shown here reads them, and only the last list is echoed as cell output.
# NOTE(review): the first list looks like the hero/team identifier columns - confirm.
['team1_hero1',
 'team1_hero2',
 'team1_hero3',
 'team1_hero4',
 'team1_hero5',
 'team2_hero1',
 'team2_hero2',
 'team2_hero3',
 'team2_hero4',
 'team2_hero5',
 'team1_id',
 'team2_id'
]
# Win-count columns the author marked as discarded.
[
 'team1_hero1_wins', # discarded
 'team1_hero2_wins', # discarded
 'team1_hero4_wins', # discarded
]
# NOTE(review): presumably the columns that were kept (ratings plus the
# remaining win counts) - confirm against the training notebook.
[
 'team1_rating',
 'team2_rating',
 'team1_hero3_wins',
 'team1_hero5_wins',
 'team2_hero1_wins',
 'team2_hero2_wins',
 'team2_hero3_wins',
 'team2_hero4_wins',
 'team2_hero5_wins'
]

['team1_rating',
 'team2_rating',
 'team1_hero3_wins',
 'team1_hero5_wins',
 'team2_hero1_wins',
 'team2_hero2_wins',
 'team2_hero3_wins',
 'team2_hero4_wins',
 'team2_hero5_wins']