In [2]:
import pandas as pd

# Load all datasets
circuits = pd.read_csv('historic_data/circuits.csv')
constructor_results = pd.read_csv('historic_data/constructor_results.csv')
constructor_standings = pd.read_csv('historic_data/constructor_standings.csv')
constructors = pd.read_csv('historic_data/constructors.csv')
driver_standings = pd.read_csv('historic_data/driver_standings.csv')
drivers = pd.read_csv('historic_data/drivers.csv')
pit_stops = pd.read_csv('historic_data/pit_stops.csv')
qualifying = pd.read_csv('historic_data/qualifying.csv')
races = pd.read_csv('historic_data/races.csv')
results = pd.read_csv('historic_data/results.csv')
seasons = pd.read_csv('historic_data/seasons.csv')
sprint_results = pd.read_csv('historic_data/sprint_results.csv')
status = pd.read_csv('historic_data/status.csv')

In [9]:
# Merge datasets to create a unified dataset
merged_data = pd.merge(results, drivers, on='driverId', suffixes=('_result', '_driver'))
merged_data = pd.merge(merged_data, constructors, on='constructorId', suffixes=('', '_constructor'))
merged_data = pd.merge(merged_data, races, on='raceId', suffixes=('', '_race'))
merged_data = pd.merge(merged_data, circuits, on='circuitId', suffixes=('', '_circuit'))
merged_data = pd.merge(merged_data, constructor_standings, on=['constructorId', 'raceId'], how='left', suffixes=('', '_constructor_standing'))
merged_data = pd.merge(merged_data, driver_standings, on=['driverId', 'raceId'], how='left', suffixes=('', '_driver_standing'))
merged_data = pd.merge(merged_data, qualifying, on=['driverId', 'raceId'], how='left', suffixes=('', '_qualifying'))
merged_data = pd.merge(merged_data, sprint_results, on=['driverId', 'raceId'], how='left', suffixes=('', '_sprint'))


In [12]:
# Fill missing values with a placeholder (e.g., -1 or 'Unknown')
merged_data = merged_data.fillna({
    'positionOrder': -1,
    'points': 0,
    'laps_sprint': 0,
    'time_sprint': 'Unknown',
    # Add other columns as needed
})

# Display columns with missing values
missing_columns = merged_data.columns[merged_data.isnull().any()]
print("Columns with missing values:", missing_columns)

# Fill missing values with appropriate placeholders
merged_data = merged_data.fillna({
    'constructorStandingsId': 0,
    'points_constructor_standing': 0,
    'position_constructor_standing': 'Unknown',
    'positionText_constructor_standing': 'Unknown',
    'wins': 0,
    'driverStandingsId': 0,
    'points_driver_standing': 0,
    'position_driver_standing': 'Unknown',
    'positionText_driver_standing': 'Unknown',
    'wins_driver_standing': 0,
    'qualifyId': 0,
    'constructorId_qualifying': 0,
    'number': 0,
    'position_qualifying': 'Unknown',
    'q1': 'Unknown',
    'q2': 'Unknown',
    'q3': 'Unknown',
    'resultId_sprint': 0,
    'constructorId_sprint': 0,
    'number_sprint': 0,
    'grid_sprint': 0,
    'position_sprint': 'Unknown',
    'positionText_sprint': 'Unknown',
    'positionOrder_sprint': 0,
    'points_sprint': 0,
    'milliseconds_sprint': 0,
    'fastestLap_sprint': 0,
    'fastestLapTime_sprint': 'Unknown',
    'statusId_sprint': 0
})

# Check if there are any remaining missing values
remaining_missing = merged_data.isnull().sum()
print("Remaining missing values:", remaining_missing[remaining_missing > 0])


Columns with missing values: Index(['constructorStandingsId', 'points_constructor_standing',
       'position_constructor_standing', 'positionText_constructor_standing',
       'wins', 'driverStandingsId', 'points_driver_standing',
       'position_driver_standing', 'positionText_driver_standing',
       'wins_driver_standing', 'qualifyId', 'constructorId_qualifying',
       'number', 'position_qualifying', 'q1', 'q2', 'q3', 'resultId_sprint',
       'constructorId_sprint', 'number_sprint', 'grid_sprint',
       'position_sprint', 'positionText_sprint', 'positionOrder_sprint',
       'points_sprint', 'milliseconds_sprint', 'fastestLap_sprint',
       'fastestLapTime_sprint', 'statusId_sprint'],
      dtype='object')
Remaining missing values: Series([], dtype: int64)


In [18]:
# Add average win rate
win_counts = merged_data[merged_data['positionOrder'] == 1].groupby('driverId').size()
race_counts = merged_data.groupby('driverId').size()
merged_data['avg_win_rate'] = merged_data['driverId'].map(win_counts / race_counts).fillna(0)

# Add average podium rate
podium_counts = merged_data[merged_data['positionOrder'] <= 3].groupby('driverId').size()
merged_data['avg_podium_rate'] = merged_data['driverId'].map(podium_counts / race_counts).fillna(0)

# Add average constructor win rate
constructor_win_counts = merged_data[merged_data['positionOrder'] == 1].groupby('constructorId').size()
constructor_race_counts = merged_data.groupby('constructorId').size()
merged_data['avg_constructor_win_rate'] = merged_data['constructorId'].map(constructor_win_counts / constructor_race_counts).fillna(0)

# Add average constructor podium rate
constructor_podium_counts = merged_data[merged_data['positionOrder'] <= 3].groupby('constructorId').size()
merged_data['avg_constructor_podium_rate'] = merged_data['constructorId'].map(constructor_podium_counts / constructor_race_counts).fillna(0)

# Add driver experience (number of races participated in)
driver_experience = merged_data.groupby('driverId').size()
merged_data['driver_experience'] = merged_data['driverId'].map(driver_experience)

# Add constructor experience (number of races participated in)
constructor_experience = merged_data.groupby('constructorId').size()
merged_data['constructor_experience'] = merged_data['constructorId'].map(constructor_experience)

# Calculate recent form (average points in the last 5 races)
merged_data['recent_form'] = merged_data.groupby('driverId')['points'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

# Example circuit features, replace with actual data
circuit_features = {
    'circuitId': [1, 2, 3],  # Example circuit IDs
    'circuit_type': ['Permanent', 'Street', 'Permanent'],
    'circuit_length': [5.3, 3.5, 4.2]  # Example circuit lengths in kilometers
}

circuit_features_df = pd.DataFrame(circuit_features)

# Merge circuit features into merged_data
merged_data = pd.merge(merged_data, circuit_features_df, on='circuitId', how='left')

# Calculate average number of pit stops
avg_pit_stops = pit_stops.groupby(['raceId', 'driverId']).size().groupby('driverId').mean()
merged_data['avg_pit_stops'] = merged_data['driverId'].map(avg_pit_stops).fillna(0)

# Calculate average duration of pit stops
avg_pit_stop_duration = pit_stops.groupby(['raceId', 'driverId'])['milliseconds'].mean().groupby('driverId').mean()
merged_data['avg_pit_stop_duration'] = merged_data['driverId'].map(avg_pit_stop_duration).fillna(0)

# Calculate average qualifying position
avg_qualifying_position = qualifying.groupby('driverId')['position'].mean()
merged_data['avg_qualifying_position'] = merged_data['driverId'].map(avg_qualifying_position).fillna(0)

# Display the first few rows of the final dataset
print(merged_data.head())

# Save the final dataset to a CSV file
# merged_data.to_csv('/mnt/data/final_dataset.csv', index=False)

   resultId  raceId  driverId  constructorId number_result  grid position  \
0         1      18         1              1            22     1        1   
1         2      18         2              2             3     5        2   
2         3      18         3              3             7     7        3   
3         4      18         4              4             5    11        4   
4         5      18         5              1            23     3        5   

  positionText  positionOrder  points  ...  driver_experience  \
0            1              1    10.0  ...                322   
1            2              2     8.0  ...                184   
2            3              3     6.0  ...                206   
3            4              4     5.0  ...                370   
4            5              5     4.0  ...                112   

  constructor_experience recent_form circuit_type_x circuit_length_x  \
0                   1855        10.0      Permanent              5.3   
1 