In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

In [None]:

# Directory holding the preprocessed CSV exports (one file per table).
folder_path = "/content/drive/MyDrive/preprocessed_dataset/preprocessed_dataset"  # Update this to your actual folder path

# os.listdir returns entries in arbitrary order; sorting keeps the load log and
# the key order of `dataframes` reproducible from one run to the next.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Load every CSV into a dict keyed by the file name without its extension,
# e.g. "results_preprocessed.csv" -> dataframes["results_preprocessed"].
dataframes = {}
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    # splitext removes only the trailing extension; str.replace(".csv", "")
    # would also strip a ".csv" that appears earlier in the file name.
    df_name = os.path.splitext(file)[0]
    dataframes[df_name] = pd.read_csv(file_path)
    print(f"Loaded: {file}")

print("\nAvailable DataFrames:")
print(dataframes.keys())


Loaded: constructors_preprocessed.csv
Loaded: constructor_standings_preprocessed.csv
Loaded: seasons_preprocessed.csv
Loaded: status_preprocessed.csv
Loaded: qualifying_preprocessed.csv
Loaded: sprint_results_preprocessed.csv
Loaded: races_preprocessed.csv
Loaded: constructor_results_preprocessed.csv
Loaded: driver_standings_preprocessed.csv
Loaded: circuits_preprocessed.csv
Loaded: pit_stops_preprocessed.csv
Loaded: drivers_preprocessed.csv
Loaded: lap_times_preprocessed.csv
Loaded: results_preprocessed.csv

Available DataFrames:
dict_keys(['constructors_preprocessed', 'constructor_standings_preprocessed', 'seasons_preprocessed', 'status_preprocessed', 'qualifying_preprocessed', 'sprint_results_preprocessed', 'races_preprocessed', 'constructor_results_preprocessed', 'driver_standings_preprocessed', 'circuits_preprocessed', 'pit_stops_preprocessed', 'drivers_preprocessed', 'lap_times_preprocessed', 'results_preprocessed'])


In [None]:

# Driver-level feature: a 70/30 blend of each driver's mean finishing position
# and mean qualifying position, each min-max scaled across all drivers.
# Both inputs are position numbers, so smaller average positions give a
# smaller DriverConsistency value.
results = dataframes.get('results_preprocessed')
qualifying = dataframes.get('qualifying_preprocessed')

# Mean finishing order per driver.
driver_finishing_pos = results.groupby("driverId")["positionOrder"].mean().reset_index()
driver_finishing_pos.columns = ["driverId", "AvgFinishingPosition"]

# Mean qualifying position per driver.
driver_qualifying_pos = qualifying.groupby("driverId")["position"].mean().reset_index()
driver_qualifying_pos.columns = ["driverId", "AvgQualifyingPosition"]

# Left merge keeps every driver that has a result row; drivers with no rows in
# the qualifying table get NaN for AvgQualifyingPosition.
driver_consistency = driver_finishing_pos.merge(driver_qualifying_pos, on="driverId", how="left")

# Min-max scale both averages (NaN inputs stay NaN).
for avg_col, norm_col in (
    ("AvgFinishingPosition", "NormFinishing"),
    ("AvgQualifyingPosition", "NormQualifying"),
):
    values = driver_consistency[avg_col]
    driver_consistency[norm_col] = (values - values.min()) / (values.max() - values.min())

# A missing qualifying component would turn the whole score into NaN and
# propagate into the exported feature file. For those drivers the finishing
# component stands in for it, i.e. the score is based on finishing alone.
# NormQualifying itself is left as NaN so the gap stays visible.
qualifying_component = driver_consistency["NormQualifying"].fillna(driver_consistency["NormFinishing"])
driver_consistency["DriverConsistency"] = 0.7 * driver_consistency["NormFinishing"] + 0.3 * qualifying_component
print(driver_consistency.head())

   driverId  AvgFinishingPosition  AvgQualifyingPosition  NormFinishing  \
0         1              5.019663               4.073034       0.081982   
1         2             10.722826              11.100000       0.236819   
2         3              8.252427               6.834951       0.169749   
3         4              8.492574               8.139535       0.176269   
4         5             13.285714              13.883929       0.306399   

   NormQualifying  DriverConsistency  
0        0.120511           0.093541  
1        0.396078           0.284597  
2        0.228822           0.187471  
3        0.279982           0.207383  
4        0.505252           0.366055  


In [None]:
# Team-level feature: an 80/20 blend of a constructor's (scaled) average
# standings points and its reliability ratio.
constructor_standings = dataframes.get('constructor_standings_preprocessed')
status = dataframes.get('status_preprocessed')
# Fetch the results table here so the cell does not depend on a `results`
# variable left behind (and possibly modified) by another cell.
race_results = dataframes.get('results_preprocessed')

# Mean of the `points` column over all standings rows of each constructor.
constructor_avg_points = constructor_standings.groupby("constructorId")["points"].mean().reset_index()
constructor_avg_points.columns = ["constructorId", "AvgConstructorPoints"]

# Attach the textual status to every result row and keep the rows whose status
# matches the failure pattern.
# NOTE(review): the pattern is case-sensitive and only matches statuses that
# contain one of these four words - confirm it covers the status labels that
# should count as failures in this dataset.
results_with_status = race_results.merge(status, on="statusId", how="left")
constructor_failures = results_with_status[results_with_status["status"].str.contains("DNF|Crash|Engine|Retired", na=False)]

# Failure rate calculation: failure rows vs. all result rows per constructor.
constructor_reliability = constructor_failures.groupby("constructorId").size().reset_index(name="Failures")
total_races = race_results.groupby("constructorId").size().reset_index(name="TotalRaces")

# Merge failures with total races and calculate reliability score.
# Constructors without any matching failure row get Failures = 0.
team_reliability = total_races.merge(constructor_reliability, on="constructorId", how="left").fillna(0)
team_reliability["ReliabilityScore"] = 1 - (team_reliability["Failures"] / team_reliability["TotalRaces"])

team_strength = constructor_avg_points.merge(team_reliability[["constructorId", "ReliabilityScore"]], on="constructorId", how="left")

# Put the points on the same 0-1 scale as ReliabilityScore before weighting.
# Blending raw points with a 0-1 ratio makes the 0.2 weight meaningless,
# because the reliability term can contribute at most 0.2 to the sum.
avg_points = team_strength["AvgConstructorPoints"]
team_strength["NormConstructorPoints"] = (avg_points - avg_points.min()) / (avg_points.max() - avg_points.min())

team_strength["TeamStrength"] = 0.8 * team_strength["NormConstructorPoints"] + 0.2 * team_strength["ReliabilityScore"]

print(team_strength.head())

   constructorId  AvgConstructorPoints  ReliabilityScore  TeamStrength
0              1             72.732508          0.931877     58.372382
1              2             41.885714          0.971429     33.702857
2              3             37.981914          0.943317     30.574194
3              4             42.285714          0.928844     34.014340
4              5             19.376866          0.955224     15.692537


In [None]:

# Circuit-level feature: a weighted blend of the mean grid-to-finish position
# change and the circuit altitude, each min-max scaled across circuits.
results = dataframes.get('results_preprocessed')
qualifying = dataframes.get('qualifying_preprocessed')
circuits = dataframes.get('circuits_preprocessed')
# Defined unconditionally so later cells can use `races` even when the results
# table already carries a circuitId column.
races = dataframes.get('races_preprocessed')

if 'circuitId' not in results.columns:
    # Results rows only know their race; the race table maps race -> circuit.
    if races is not None and 'circuitId' in races.columns:
        results = pd.merge(results, races[['raceId', 'circuitId']], on='raceId', how='left')
    else:
        raise KeyError("Unable to automatically add 'circuitId' column. 'races_preprocessed' dataframe or 'circuitId' column within it is missing.")

# Compute Average Overtakes per Circuit.
# "Overtakes" is the net number of positions gained (grid minus finishing
# order), not a count of passes. assign() returns a new frame, so the table
# cached in `dataframes` is never modified in place.
results = results.assign(Overtakes=results["grid"] - results["positionOrder"])
position_change_by_circuit = results.groupby("circuitId")["Overtakes"].mean()

overtakes_per_circuit = position_change_by_circuit.reset_index()
overtakes_per_circuit.columns = ["circuitId", "AvgOvertakes"]

altitude_factor = circuits[["circuitId", "alt"]].fillna(0)

# Same per-circuit mean as above, taken from the column instead of
# groupby.apply, which recomputed the difference per group and emitted a
# pandas warning.
# NOTE(review): AvgGridPositionChange is identical to AvgOvertakes, so the
# final score effectively gives this one signal a weight of 0.7.
grid_position_changes = position_change_by_circuit.reset_index(name="AvgGridPositionChange")

track_complexity = overtakes_per_circuit.merge(altitude_factor, on="circuitId", how="left").merge(grid_position_changes, on="circuitId", how="left")

# Circuits absent from the circuits table come out of the left merge with a
# NaN altitude; give them the same 0 default used for missing altitudes so the
# score does not turn into NaN.
track_complexity["alt"] = track_complexity["alt"].fillna(0)

# Normalize values
for source_col, norm_col in (
    ("AvgOvertakes", "NormOvertakes"),
    ("alt", "NormAltitude"),
    ("AvgGridPositionChange", "NormGridChange"),
):
    values = track_complexity[source_col]
    track_complexity[norm_col] = (values - values.min()) / (values.max() - values.min())

# Compute final Track Complexity Score
track_complexity["TrackComplexity"] = 0.6 * track_complexity["NormOvertakes"] + 0.3 * track_complexity["NormAltitude"] + 0.1 * track_complexity["NormGridChange"]
print(track_complexity.head())

   circuitId  AvgOvertakes  alt  AvgGridPositionChange  NormOvertakes  \
0          1     -0.287695   10              -0.287695       0.967362   
1          2     -0.053398   18              -0.053398       0.993942   
2          3     -0.025000    7              -0.025000       0.997164   
3          4     -0.632411  109              -0.632411       0.928256   
4          5     -0.197917  130              -0.197917       0.977547   

   NormAltitude  NormGridChange  TrackComplexity  
0      0.007610        0.967362         0.679437  
1      0.011191        0.993942         0.699117  
2      0.006267        0.997164         0.699895  
3      0.051925        0.928256         0.665357  
4      0.061325        0.977547         0.702681  


  grid_position_changes = results.groupby("circuitId").apply(lambda x: (x["grid"] - x["positionOrder"]).mean()).reset_index(name="AvgGridPositionChange")


In [None]:

# Assemble the final feature table: one row per result entry (driver, race)
# carrying the driver score, the score of the constructor entered in that race
# and the score of the circuit the race was held on.
race_results = dataframes.get('results_preprocessed')
races = dataframes.get('races_preprocessed')
if race_results is None or races is None:
    raise KeyError("'results_preprocessed' and 'races_preprocessed' are required to build the final feature table.")

# Entry-level keys come straight from the source tables, so this cell does not
# rely on columns that an earlier (or deleted) cell may have added.
entry_keys = ["raceId", "constructorId", "circuitId"]
race_entries = race_results[["driverId", "raceId", "constructorId"]].merge(
    races[["raceId", "circuitId"]], on="raceId", how="left"
)

# Reduce driver_consistency to one row per driver, dropping any entry-level
# keys left in it by a previous run, so re-running the cell gives the same
# result. driver_consistency itself is not modified.
driver_level = (
    driver_consistency
    .drop(columns=[col for col in entry_keys if col in driver_consistency.columns])
    .drop_duplicates(subset="driverId")
)

driver_entries = driver_level.merge(race_entries, on="driverId", how="left")

final_features = (
    driver_entries[["driverId", "DriverConsistency", "constructorId", "circuitId"]]
    .merge(team_strength[["constructorId", "TeamStrength"]], on="constructorId", how="left")
    .merge(track_complexity[["circuitId", "TrackComplexity"]], on="circuitId", how="left")
)

final_features.to_csv("engineered_features.csv", index=False)

Driver Consistency Columns: Index(['driverId', 'AvgFinishingPosition', 'AvgQualifyingPosition',
       'NormFinishing', 'NormQualifying', 'DriverConsistency',
       'constructorId'],
      dtype='object')
Team Strength Columns: Index(['constructorId', 'AvgConstructorPoints', 'ReliabilityScore',
       'TeamStrength'],
      dtype='object')
Track Complexity Columns: Index(['circuitId', 'AvgOvertakes', 'alt', 'AvgGridPositionChange',
       'NormOvertakes', 'NormAltitude', 'NormGridChange', 'TrackComplexity'],
      dtype='object')
'circuitId' is missing in driver_consistency. Fixing it...
Feature engineering completed. Data saved to 'engineered_features.csv'.


In [None]:

def _round_and_cap(value):
    """Round to two decimals and cap the result at 0.99."""
    return min(round(value, 2), 0.99)


# Reload the exported feature table and bring the three scores to their final
# presentation form.
engineered_features = pd.read_csv("engineered_features.csv")

# TeamStrength: missing values count as 0, then the column is min-max scaled.
# epsilon keeps the denominator non-zero when all values are equal.
team_strength_filled = engineered_features["TeamStrength"].fillna(0)
epsilon = 1e-6
min_val = team_strength_filled.min()
max_val = team_strength_filled.max()
engineered_features["TeamStrength"] = (team_strength_filled - min_val) / (max_val - min_val + epsilon)

# All three scores are rounded to two decimals and capped at 0.99.
for feature in ("TeamStrength", "DriverConsistency", "TrackComplexity"):
    engineered_features[feature] = engineered_features[feature].apply(_round_and_cap)

engineered_features.to_csv("Normalized_engineered_features.csv", index=False)

Normalization completed. Data saved to 'Normalized_engineered_features.csv'.
