In [None]:
import pandas as pd

In [None]:
# Constants
# Look-back length used throughout the notebook: it is the offset passed to
# `.shift()` when building the `*_shifted` columns and the window size passed
# to `.rolling()` when computing the mean/std features.
PREVIOUS_RESULTS_USED = 5

In [None]:
# Where each raw table lives on disk, keyed by the name later cells use to
# look the table up in `raw_dfs`.
parquet_paths = {
    'races': '.data_parquet/races.parquet',
    'results': '.data_parquet/results.parquet',
    'driver_standings': '.data_parquet/driver_standings.parquet',
    'constructor_standings': '.data_parquet/constructor_standings.parquet',
    'qualifying': '.data_parquet/qualifying.parquet',
}
# Read every table once, preserving the key order above.
raw_dfs = {table: pd.read_parquet(path) for table, path in parquet_paths.items()}
display(raw_dfs['races'])

In [None]:
# Order the races by (year, round) and give each one a 1-based sequence
# number in that order.
races_by_number = (
    raw_dfs['races'][['year', 'round']]
    .sort_values(by=['year', 'round'])
    .assign(number=lambda frame: range(1, len(frame) + 1))
)
races_number = races_by_number.loc[:, ['number']]
# Attach the sequence number to the full races table; `raceId` on the left is
# matched against the index of `races_by_number`.
races_df_with_number = raw_dfs['races'].join(
    races_by_number['number'],
    on='raceId',
)
display(races_df_with_number)

In [None]:
# Candidate model inputs
# - Driver's most recent starting (grid) positions
# - Driver's most recent finishing positions
# - Driver's championship standings history
# - Team's "average" starting position over the last races
# - Team's "average" finishing position over the last races
# - Team's championship standings history
# Gather everything that is needed into the results DataFrame.
results_df = raw_dfs['results']
joined_driver_standings_df = (
    raw_dfs['driver_standings'][['position']].rename(
        columns={'position': 'driver_standings_position'}
    )
)
joined_constructor_standings_df = (
    raw_dfs['constructor_standings'][['position']].rename(
        columns={'position': 'constructor_standings_position'}
    )
)
joined_race_df = (
    races_df_with_number[['number', 'year']]
)
# Index-on-index join.
# NOTE(review): assumes results and driver_standings share the same index keys - confirm.
results_df = results_df.join(joined_driver_standings_df)
# Constructor standings are matched through the (raceId, constructorId) pair.
results_df = results_df.join(joined_constructor_standings_df, on=['raceId', 'constructorId'])
# Adds the sequential race `number` and the season `year`.
results_df = results_df.join(joined_race_df)
# Re-index by (driverId, number) and sort, so each driver's rows are
# contiguous and ordered by race number.
results_df = results_df.reset_index()
results_df = results_df.set_index(['driverId', 'number'])
# Keep driverId available as a regular column as well as an index level.
results_df['driverId'] = results_df.index.get_level_values(0)
results_df = results_df.sort_index()
# Look-back flags: compare each row with the same driver's row
# PREVIOUS_RESULTS_USED entries earlier. Rows without that much history get
# NaN in the shifted column and therefore False in the `*_equals` flag.
results_df['driverId_shifted'] = results_df.groupby(level=0)['driverId'].shift(PREVIOUS_RESULTS_USED)
results_df['driverId_equals'] = results_df['driverId'] == results_df['driverId_shifted']
# Shift within each driver (like driverId_shifted above) so the comparison
# never reads the year from a neighbouring driver's rows.
results_df['year_shifted'] = results_df.groupby(level=0)['year'].shift(PREVIOUS_RESULTS_USED)
results_df['year_equals'] = results_df['year'] == results_df['year_shifted']
display(results_df)

In [None]:
# Rows for which both look-back flags are set.
results_df[results_df[['driverId_equals', 'year_equals']].all(axis=1)]

In [None]:
# Mean grid and finishing position of each constructor's entries in each
# race, indexed by (raceId, constructorId).
tmp_df = (
    raw_dfs['results']
    .reset_index()
    .groupby(['raceId', 'constructorId'])[['grid', 'position']]
    .mean()
    .rename(columns={
        'grid': 'equivalent_grid_constructor',
        'position': 'equivalent_position_constructor'
    })
)
tmp_df

In [None]:
# Attach the per-race constructor means to every result row through its
# (raceId, constructorId) pair.
results_df = results_df.join(
    other=tmp_df,
    on=['raceId', 'constructorId'],
    how='left',
)
results_df

In [None]:
def _previous_races_stats(driver_history):
    """Rolling mean/std over one driver's *earlier* rows.

    `driver_history` holds the rows of a single driver in race order. The
    frame is shifted down by one row before the window is applied, so the
    statistics on a given row are computed from the up-to
    PREVIOUS_RESULTS_USED rows before it and never from that row itself.
    At least two observations are required, otherwise the result is NaN.
    """
    return (
        driver_history
        .shift(1)
        .rolling(PREVIOUS_RESULTS_USED, min_periods=2)
        .agg(['mean', 'std'])
    )


# Rolling features are computed per driver so a window can never span two
# different drivers, and they exclude the current row so the value being
# predicted (`position`) does not leak into its own input features.
tmp2_df = (
    results_df[[
        'equivalent_grid_constructor',
        'equivalent_position_constructor',
        'position',
        'grid'
    ]]
    # group_keys=False keeps the original (driverId, number) index.
    .groupby(level='driverId', group_keys=False)
    .apply(_previous_races_stats)
    .rename(columns={
        'position': 'position_driver',
        'grid': 'grid_driver'
    })
)
# Flatten the (column, statistic) column pairs into "<column>_<statistic>".
tmp2_df.columns = ['_'.join(col).strip() for col in tmp2_df.columns.values]
tmp2_df

In [None]:
# First 20 rolling-feature rows for the driver whose driverId is 1.
tmp2_df.xs(key=1, level='driverId').iloc[:20]

In [None]:
# Add the rolling mean/std columns to the results; rows are matched on the
# shared index.
results_df = results_df.join(other=tmp2_df, how='left')
results_df

In [None]:
# First PREVIOUS_RESULTS_USED + 1 rows of the enriched results.
results_df.iloc[:PREVIOUS_RESULTS_USED + 1]

In [None]:
# List every column currently present on results_df.
results_df.columns

In [None]:
# Columns removed before modelling.
columns_to_discard = [
    'raceId', 'constructorId', 'points', 'laps',
    'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
    'fastestLapSpeed', 'statusId', 'year', 'driverId',
    'driverId_shifted', 'year_shifted', 'equivalent_grid_constructor',
    'equivalent_position_constructor', 'fastestLapMillis'
]
tt_df = results_df.drop(columns=columns_to_discard)
# `position` is the target; every other remaining column is an input.
y_columns = ['position']
x_columns = tt_df.columns.difference(['position'])
# Inputs first, target last.
columns_in_order = [*x_columns, *y_columns]
tt_df = tt_df.loc[:, columns_in_order]
tt_df.shape

In [None]:
# Modelling rows for which both look-back flags are set.
tt_df[tt_df[['driverId_equals', 'year_equals']].all(axis=1)]

In [None]:
# Print the column dtypes and non-null counts of the modelling frame.
tt_df.info()

In [None]:
# Shape that remains once every row containing a missing value is removed.
tt_df.dropna(axis=0, how='any').shape

In [None]:
# Shape of the rows with both look-back flags set and no missing values.
(
    tt_df[tt_df[['driverId_equals', 'year_equals']].all(axis=1)]
    .dropna(axis=0, how='any')
    .shape
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Keep only rows where both look-back flags are set and no value is missing,
# then drop the flag columns so they are not used as model inputs.
# Note: this rebinds `tt_df`, so the cell cannot be re-run without first
# re-running the cell that builds `tt_df`.
tt_df = tt_df[tt_df['driverId_equals'] & tt_df['year_equals']].dropna()
tt_df = tt_df.drop(columns=['driverId_equals', 'year_equals'])
tt_size = tt_df.shape[0]
train_size = int(tt_size * 0.8)
# 80/20 split taken in frame order (no shuffling).
# NOTE(review): the frame is ordered by its (driverId, number) index, so the
# test rows are the tail of that ordering - confirm this split is intended.
# `.copy()` makes both parts independent frames, so adding columns to them
# later does not trigger pandas' SettingWithCopyWarning.
train_df = tt_df.head(train_size).copy()
test_df = tt_df.tail(tt_size - train_size).copy()

x_columns = tt_df.columns.difference(['position'])
y_columns = ['position']
# Fixed random_state so the fitted tree is the same on every run.
reg = DecisionTreeRegressor(random_state=0)
reg.fit(train_df[x_columns], train_df[y_columns])

In [None]:
# Add the model's predictions next to the true `position`. `assign` builds a
# new frame instead of writing into `test_df` in place, which avoids pandas'
# SettingWithCopyWarning when `test_df` is a slice of another frame.
test_df = test_df.assign(predicted_position=reg.predict(test_df[x_columns]))
test_df