# 1. Imports

In [6]:
from src.configuration import Configuration
from src.fetch_data import DataFetcher
from aggregate_data import DataAggregator
from src.dim import Dims
from src.facts import Facts
from src.dicts import Dicts
import pandas as pd
import time

# 2. Variables

In [7]:
config = Configuration('src/config.cfg')
fetcher = DataFetcher(config)
DATAAGGREGATOR = DataAggregator(config, fetcher)
DIMS = Dims(config, fetcher)
DICTS = Dicts(config, fetcher)
FACTS = Facts(config, fetcher)

pd.set_option('display.max_columns', None)

# 3. Data

## 3.1 Driver related data

In [8]:
driver_last_5_races_result = DATAAGGREGATOR.get_last_races_result(5,"Race", "position")
time.sleep(5)
driver_last_5_quali_result = DATAAGGREGATOR.get_last_races_result(5,"Qualifying", "position")
time.sleep(5)
driver_last_5_avg_race_position = DATAAGGREGATOR.get_last_races_result(5, "Race", "avg")
time.sleep(5)
driver_last_5_std_race_position = DATAAGGREGATOR.get_last_races_result(5, "Race", "std")
time.sleep(5)
driver_last_5_avg_quali_position = DATAAGGREGATOR.get_last_races_result(5, "Qualifying", "avg")
time.sleep(5)
driver_last_5_std_quali_position = DATAAGGREGATOR.get_last_races_result(5, "Qualifying", "std")
time.sleep(5)
driver_points_gathered = DATAAGGREGATOR.get_racer_team_points("driver")
time.sleep(10)
driver_gap_to_teammate = DATAAGGREGATOR.calculate_gap_to_teammate()
time.sleep(5)
driver_gap_to_leader = DATAAGGREGATOR.calculate_gap_to_leader()
dict_world_champions = DICTS.dict_world_champions()
time.sleep(10)
pit_stops_efficiency = DATAAGGREGATOR.calculate_pit_stop_efficiency()
time.sleep(10)
driver_race_wins_total = DATAAGGREGATOR.calculate_total_wins("Race")
time.sleep(10)
driver_quali_wins_total = DATAAGGREGATOR.calculate_total_wins("Qualifying")
time.sleep(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_filtered["points_gap_to_teammate"] = merged_filtered["points_gained"] - merged_filtered["points_gained_other"]


## 3.2 Team related data

In [9]:
team_points_gathered = DATAAGGREGATOR.get_racer_team_points("team")
time.sleep(5)
team_gap_to_leader = DATAAGGREGATOR.calculate_gap_to_best_team()
time.sleep(10)

## 3.3 Session Related Data

In [10]:
dim_session = DIMS.dim_sessions()
dim_session_race = dim_session[dim_session["session_name"]=="Race"]
needed_cols = ["key", "location", "year", "is_current_season","country_name", "date_start", "session_key"]
dim_session_race = dim_session_race[needed_cols]
race_sequence = DATAAGGREGATOR.calculate_race_sequence_number()

In [11]:
needed_cols = ["driver_number","key","number_of_laps","dnf","dns","dsq","duration"]
fact_session_result = FACTS.fact_session_results()
race_duration = fact_session_result.merge(
    dim_session_race,
    on = "session_key",
    how="inner"
)

race_duration = race_duration[needed_cols]
race_duration["duration"] = race_duration["duration"].fillna(9999)

  race_duration["duration"] = race_duration["duration"].fillna(9999)


## 3.4 Weather Data

In [12]:
# dim_session_quali = dim_session[dim_session["session_name"]=="Qualifying"]
# needed_cols = ["key", "location", "year", "is_current_season","country_name", "date_start", "session_key"]
# dim_session_quali = dim_session_quali[needed_cols]
# dim_weather_race = DIMS.dim_weather(race_type="Race")
# time.sleep(20)
# dim_weather_quali = DIMS.dim_weather(race_type="Qualifying")

### 3.4.1 Race Weather

IMPORTANT NOTE!!!
Treat this data as a weather forecast. Since this data is gathered after the race - we couldn't use to predict a winner of a particular GP. So for sake of this project we will allow this small data leakage.

In [13]:
# race_weather = dim_weather_race.merge(
#     dim_session_race,
#     on="session_key",
#     how="inner"
# )
# cols_race = [col for col in race_weather.columns if "race" in col or col == "key"]
# race_weather = race_weather[cols_race]

### 3.5 Qualification Weather

In [14]:
# quali_weather = dim_weather_quali.merge(
#     dim_session_quali,
#     on="session_key",
#     how="inner"
# )
# cols_quali = [col for col in quali_weather.columns if "quali" in col or col == "key"]
# quali_weather = quali_weather[cols_quali]
# quali_weather

# 3.5 Joined Data

In [15]:
df = (
    dim_session
    .merge(driver_last_5_races_result, on="key", how="inner")
    .merge(driver_last_5_quali_result, on=["key", "driver_number"], how="inner", suffixes=["_race", "_quali"])
    .merge(driver_last_5_avg_race_position, on=["driver_number", "key"])
    .merge(driver_last_5_std_race_position, on=["driver_number", "key"])
    .merge(driver_last_5_avg_quali_position, on=["driver_number", "key"])
    .merge(driver_last_5_std_quali_position, on=["driver_number", "key"])
    .merge(driver_points_gathered[["driver_number", "key", "points_gained"]], on=["driver_number", "key"])
    .merge(driver_gap_to_teammate, on=["driver_number", "key"])
    .merge(driver_gap_to_leader, on=["driver_number", "key"])
    .merge(driver_race_wins_total, on = ["driver_number", "key"])
    .merge(driver_quali_wins_total, on = ['driver_number', 'key'])
    .merge(team_points_gathered, on=["driver_number", "key"])
    .merge(team_gap_to_leader, on=["driver_number", "key"])
    .merge(dict_world_champions,on="driver_number",how="left")
    .fillna({"titles_count": 0})
    .assign(world_champion=lambda x: x["titles_count"].astype(int))
    .merge(race_duration, on=["driver_number","key"])
    .merge(pit_stops_efficiency, on=["driver_number","key"])
    # .merge(race_weather, on="key") # Long loading time - off for now 
    # .merge(quali_weather, on="key") # Long loading time - off for now 
    .merge(race_sequence, on="key")
)


In [16]:
df

Unnamed: 0,session_key,location,date_start,date_end,session_name,country_code,country_name,year,is_current_season,key,driver_number,position_race,last_race_pos_1,last_race_pos_2,last_race_pos_3,last_race_pos_4,last_race_pos_5,position_quali,last_qualifying_pos_1,last_qualifying_pos_2,last_qualifying_pos_3,last_qualifying_pos_4,last_qualifying_pos_5,avg_last_5_race,std_last_5_race,avg_last_5_qualifying,std_last_5_qualifying,points_gained,points_gap_to_teammate,gap_to_leader,wins_before_session_Race,wins_before_session_Qualifying,team_name,team_points_gained,gap_to_best_team,titles_count,world_champion,number_of_laps,dnf,dns,dsq,duration,last_5_races_median_pit_stop_time,race_number
0,9098,Barcelona,2023-06-03,2023-06-03,Qualifying,ESP,Spain,2023,0,Barcelona2023,1,1.0,2.0,1.0,2.0,1.0,1.0,1.0,15.0,1.0,2.0,9.0,1.0,1.4,0.489898,5.6,5.571355,145.0,36.0,0.0,4,3,Red Bull Racing,254.0,0.0,4.0,4,66.0,False,False,False,5277.940,23.3,7
1,9098,Barcelona,2023-06-03,2023-06-03,Qualifying,ESP,Spain,2023,0,Barcelona2023,2,20.0,16.0,16.0,16.0,20.0,18.0,20.0,21.0,18.0,15.0,20.0,16.0,17.2,1.600000,18.0,2.280351,0.0,-1.0,145.0,0,0,Williams,1.0,253.0,0.0,0,65.0,False,False,False,9999.000,23.3,7
2,9098,Barcelona,2023-06-03,2023-06-03,Qualifying,ESP,Spain,2023,0,Barcelona2023,4,17.0,17.0,6.0,9.0,17.0,9.0,3.0,19.0,13.0,7.0,16.0,10.0,11.6,4.543127,13.0,4.242641,12.0,7.0,133.0,0,0,McLaren,17.0,237.0,0.0,0,65.0,False,False,False,9999.000,23.3,7
3,9098,Barcelona,2023-06-03,2023-06-03,Qualifying,ESP,Spain,2023,0,Barcelona2023,10,10.0,9.0,13.0,14.0,8.0,7.0,4.0,10.0,9.0,19.0,5.0,7.0,10.2,2.785678,10.0,4.816638,15.0,-10.0,130.0,0,0,Alpine,40.0,214.0,0.0,0,66.0,False,False,False,5351.470,23.3,7
4,9098,Barcelona,2023-06-03,2023-06-03,Qualifying,ESP,Spain,2023,0,Barcelona2023,11,4.0,1.0,5.0,1.0,2.0,16.0,11.0,1.0,21.0,3.0,1.0,20.0,5.0,5.692100,9.2,9.260670,109.0,-36.0,36.0,2,2,Red Bull Racing,254.0,0.0,0.0,0,66.0,False,False,False,5313.752,23.3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2705,9869,São Paulo,2025-11-09,2025-11-09,Race,BRA,Brazil,2025,1,São Paulo2025,44,21.0,6.0,8.0,8.0,4.0,8.0,13.0,5.0,12.0,6.0,5.0,3.0,6.8,1.600000,6.2,3.059412,127.0,-70.0,240.0,2,1,Ferrari,324.0,388.0,7.0,7,37.0,True,False,False,9999.000,23.3,21
2706,9869,São Paulo,2025-11-09,2025-11-09,Race,BRA,Brazil,2025,1,São Paulo2025,55,13.0,11.0,3.0,10.0,21.0,17.0,15.0,13.0,2.0,21.0,9.0,7.0,12.4,6.183850,10.4,6.374951,29.0,-41.0,338.0,3,3,Williams,99.0,613.0,0.0,0,71.0,False,False,False,5577.016,22.5,21
2707,9869,São Paulo,2025-11-09,2025-11-09,Race,BRA,Brazil,2025,1,São Paulo2025,63,4.0,5.0,2.0,1.0,6.0,7.0,6.0,6.0,5.0,1.0,4.0,4.0,4.2,2.315167,4.0,1.673320,253.0,143.0,114.0,4,5,Mercedes,363.0,349.0,0.0,0,71.0,False,False,False,5536.863,23.5,21
2708,9869,São Paulo,2025-11-09,2025-11-09,Race,BRA,Brazil,2025,1,São Paulo2025,81,5.0,3.0,21.0,4.0,5.0,5.0,4.0,3.0,9.0,3.0,6.0,8.0,7.6,6.740920,5.8,2.481935,345.0,-22.0,22.0,9,5,McLaren,712.0,0.0,0.0,0,71.0,False,False,False,5537.345,23.4,21


In [17]:
df.columns

Index(['session_key', 'location', 'date_start', 'date_end', 'session_name',
       'country_code', 'country_name', 'year', 'is_current_season', 'key',
       'driver_number', 'position_race', 'last_race_pos_1', 'last_race_pos_2',
       'last_race_pos_3', 'last_race_pos_4', 'last_race_pos_5',
       'position_quali', 'last_qualifying_pos_1', 'last_qualifying_pos_2',
       'last_qualifying_pos_3', 'last_qualifying_pos_4',
       'last_qualifying_pos_5', 'avg_last_5_race', 'std_last_5_race',
       'avg_last_5_qualifying', 'std_last_5_qualifying', 'points_gained',
       'points_gap_to_teammate', 'gap_to_leader', 'wins_before_session_Race',
       'wins_before_session_Qualifying', 'team_name', 'team_points_gained',
       'gap_to_best_team', 'titles_count', 'world_champion', 'number_of_laps',
       'dnf', 'dns', 'dsq', 'duration', 'last_5_races_median_pit_stop_time',
       'race_number'],
      dtype='object')

In [18]:
len(df.columns)

44