## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
import os
import pandas as pd
import json
import warnings
from joblib import load
warnings.filterwarnings('ignore')

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE = os.path.join(PREDICTIONS_DIR, 'predictions.csv')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')

if not os.path.exists(PREDICTIONS_DIR):
    os.makedirs(PREDICTIONS_DIR)

### Reading the schema

In [3]:
file_name = [f for f in os.listdir(INPUT_SCHEMA_DIR) if f.endswith('.json')][0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
features = schema['features']

numeric_features = []
categorical_features = []
for f in features:
    if f['dataType'] == 'CATEGORICAL':
        categorical_features.append(f['name'])
    else:
        numeric_features.append(f['name'])

id_feature = schema['id']['name']
target_feature = schema['target']['name']

### Reading test data.

In [4]:
file_name = [f for f in os.listdir(TEST_DIR) if f.endswith('.csv')][0]
file_path = os.path.join(TEST_DIR, file_name)
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,number,color
0,RJD27C,1.8403,Blue
1,PFQ2ZK,7.2176,Green
2,Y5K92G,5.4254,
3,NPVTQJ,4.7513,Blue
4,3YMG1J,3.2551,Red


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [5]:
columns_with_missing_values = df.columns[df.isna().any()]
imputation_values = load(IMPUTATION_FILE)
for column in columns_with_missing_values:
    df[column].fillna(imputation_values[column], inplace=True)


In [6]:
df.head()

Unnamed: 0,id,number,color
0,RJD27C,1.8403,Blue
1,PFQ2ZK,7.2176,Green
2,Y5K92G,5.4254,Blue
3,NPVTQJ,4.7513,Blue
4,3YMG1J,3.2551,Red


### Encoding
We encode the data using the same encoder that we saved during training.

In [7]:
# Saving the id column in a different variable.
ids = df[id_feature]

# Dropping the id from the dataframe
df.drop(columns=[id_feature], inplace=True)

# Encoding the rest of the features if exist
if os.path.exists(OHE_ENCODER_FILE):
    encoder = load(OHE_ENCODER_FILE)
    df = encoder.transform(df)


### Making predictions
Using the model saved during training. Notice that the model outputs a 2D array with many rows and 5 columns. </br>
Each row in the array represents an answer to a sample in the test data. Each number of the 5 numbers in the row is a probability to one of the 5 classes in the original dataset.

In [12]:
model = load(PREDICTOR_FILE_PATH)
predictions = model.predict(df)

predictions

array([ 94.32295506, 167.90744702, 167.11385973, 153.42711492,
        35.44900241,  90.46321586,  64.48865327, 124.06255592,
       127.27997061, 172.5448191 , 146.38143181,  78.19976257,
        73.35200675, 116.22052854,  98.77353281,  39.65893251,
       137.09150831,  55.6988797 ,  92.45095109, 202.29819454,
       135.14969661, 125.72267447,  93.72477007, 124.06255592,
       118.97777516,  40.33504674, 159.65731466, 160.98010271,
        62.08546663, -25.48452358, 214.07435831, 195.33575673,
       215.54231803, 113.36147469,  64.58408081, 132.75994751,
        94.78868555,  78.65862688, 219.40205722,  90.70279989])

### Creating predictions DataFrame.

In [14]:
predictions_df = pd.DataFrame(columns=[id_feature, 'prediction'])
predictions_df[id_feature] = ids
predictions_df['prediction'] = predictions
predictions_df.to_csv(PREDICTIONS_FILE)
predictions_df.head()


Unnamed: 0,id,prediction
0,RJD27C,94.322955
1,PFQ2ZK,167.907447
2,Y5K92G,167.11386
3,NPVTQJ,153.427115
4,3YMG1J,35.449002
