In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime


In [None]:
# Load the merged raw dataset. The path is relative, so it is resolved
# against the directory the kernel was started in.
file_csv = '../raw_data/merge_dfs.csv'
df_raw_data = pd.read_csv(file_csv)

In [None]:
import pipeline_cleaning


In [None]:
# Run the project's cleaning step on the raw frame and display the result.
# NOTE(review): whether clean_data() returns a copy or modifies df_raw_data
# in place is not visible from this notebook — confirm in pipeline_cleaning.
clean_data = pipeline_cleaning.clean_data(df_raw_data)
clean_data

In [None]:
# Show every column name of the cleaned frame as a plain list.
list(clean_data.columns)

In [None]:
# Count the missing values in each column of the cleaned frame.
clean_data.isna().sum()

In [None]:
# Display the first element of what pipeline_cleaning.transforming_data returns
# for the cleaned frame; the result is not stored.
# NOTE(review): it is not visible here whether transforming_data modifies
# clean_data in place — confirm, since later cells keep using clean_data.
pipeline_cleaning.transforming_data(clean_data)[0]

In [None]:
# Prepare the modelling frame, split it by calendar year and declare the
# feature groups used by the preprocessing pipeline.
#
# The frame is rebuilt with non-mutating calls and re-bound to `clean_data`
# rather than edited in place. Dropping columns in place on a filtered slice
# (the previous approach for the three splits) triggers pandas'
# SettingWithCopyWarning.
#
# NOTE: the drop below is strict, so re-running this cell without first
# re-creating `clean_data` raises KeyError (the columns are already gone).
clean_data = (
    clean_data
    .assign(date=pd.to_datetime(clean_data['date']))
    .drop(columns=['jockey_id', 'tainer_id', 'margin', 'finish_position', 'event_number'])
    .dropna()  # rows with any missing value are removed instead of using an imputer
)

# Year of each row, computed once and reused for the three masks.
race_year = clean_data['date'].dt.year

# 2022 -> validation, 2023 -> test, every other year -> training.
# NOTE(review): "every other year" would also include years after 2023 if the
# data contains any — confirm that is intended for a chronological split.
df_train = clean_data[(race_year != 2022) & (race_year != 2023)].drop(columns=['date'])
df_val = clean_data[race_year == 2022].drop(columns=['date'])
df_test = clean_data[race_year == 2023].drop(columns=['date'])

# Columns that get one-hot encoded.
categorical_col = ['barrier', 'track_condition', 'race_type', 'track_type',
                    'race_class_normalised', 'race_class']
# Columns that get scaled.
# NOTE(review): 'official rating' contains a space while every other name uses
# underscores — verify it matches the actual column name.
num_col = ['distance', 'total_prize_money', 'jockey_allowance',
            'handicap_weight', 'dslr', 'official rating', 'wfa',
            'weight_adjustment', 'betfair_starting_price',
            'pre_race_master_rating_int', 'starting_price', 'current_age']



In [None]:
# Training-frame columns that belong to neither feature group.
[name for name in df_train.columns
 if name not in num_col and name not in categorical_col]

In [None]:
# Display the 2023 split.
df_test

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Categorical branch: one-hot encoding. Categories not seen during fit are
# ignored at transform time, and two-category features keep a single column.
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore')),
])

# Numerical branch: standardisation only.
numerical_preprocessor = Pipeline(steps=[('scaler', StandardScaler())])

# Apply each branch to its column group. Columns in neither group are passed
# through unchanged, and sparse_threshold=0 forces a dense output array.
pipeline = ColumnTransformer(
    transformers=[
        ('categorical', categorical_preprocessor, categorical_col),
        ('numerical', numerical_preprocessor, num_col),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)
pipeline

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to all three splits.
pipeline.fit(df_train)
df_train_transformed, df_val_transformed, df_test_transformed = (
    pipeline.transform(split) for split in (df_train, df_val, df_test)
)

In [None]:
# Preview the transformed training matrix. The array computed when the
# pipeline was fitted is reused instead of running transform a second time.
df_train_transformed

In [None]:
# pipeline.get_feature_names_out()  # left disabled: the output column names are rebuilt by hand in the next cell


In [None]:
# Names of the one-hot columns, taken from the fitted encoder.
categorical_feature_names = (
    pipeline.named_transformers_['categorical']['onehot']
    .get_feature_names_out(categorical_col)
)

# Scaled numerical columns keep their original names.
numerical_feature_names = num_col
# Passed-through columns: everything in the training frame outside both groups.
remainder_col_names = [
    name for name in df_train.columns
    if name not in num_col and name not in categorical_col
]

# Concatenate in the transformer's output order: categorical, numerical, remainder.
all_feature_names = [
    *categorical_feature_names,
    *numerical_feature_names,
    *remainder_col_names,
]
all_feature_names

In [None]:
# Wrap each transformed array in a DataFrame labelled with the rebuilt feature names.
(
    df_train_transformed_with_columns,
    df_val_transformed_with_columns,
    df_test_transformed_with_columns,
) = (
    pd.DataFrame(matrix, columns=all_feature_names)
    for matrix in (df_train_transformed, df_val_transformed, df_test_transformed)
)

In [None]:
# Display the labelled, transformed training frame.
df_train_transformed_with_columns