In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set visualization styles
# Apply seaborn's "whitegrid" style to the matplotlib/seaborn figures drawn below.
sns.set(style="whitegrid")

# F1 Race Prediction: Exploratory Data Analysis (EDA)

This notebook explores historical Formula 1 data to uncover patterns, correlations, and trends relevant to race outcome prediction. The analyses cover finishing-position distributions, qualifying versus race results, DNF rates, performance trends over seasons, home advantage, and the impact of weather and safety cars.

In [None]:
# Load Data from SQLite Database
import sqlite3
from contextlib import closing

db_path = '../data/processed/f1_data.sqlite'

# closing() guarantees the connection is released even when one of the queries
# raises (e.g. a table is missing). A bare `with sqlite3.connect(...)` would only
# manage the transaction and leave the connection open.
with closing(sqlite3.connect(db_path)) as conn:
    races = pd.read_sql_query('SELECT * FROM races', conn)
    race_results = pd.read_sql_query('SELECT * FROM race_results', conn)
    qualifying_results = pd.read_sql_query('SELECT * FROM qualifying_results', conn)

# Sanity check: (rows, columns) of each loaded table.
print(f"Races: {races.shape}")
print(f"Race Results: {race_results.shape}")
print(f"Qualifying Results: {qualifying_results.shape}")

## Distribution of Finishing Positions by Driver/Team

Analyze how drivers and teams finish across races and seasons.

In [None]:
# Plot Finishing Position Distributions


def _overlay_position_histograms(results, group_col, group_ids, title):
    """Draw one figure with an overlaid finishing-position histogram per group.

    Args:
        results: DataFrame holding ``group_col`` and ``final_position``.
        group_col: Column whose values identify the groups (driver or team).
        group_ids: Iterable of values of ``group_col`` to plot, in draw order.
        title: Figure title.
    """
    plt.figure(figsize=(12, 6))
    for group_id in group_ids:
        positions = results.loc[results[group_col] == group_id, 'final_position']
        # Each group gets its own 20-bin histogram plus a KDE curve.
        sns.histplot(positions, label=group_id, kde=True, bins=20, alpha=0.5)
    plt.title(title)
    plt.xlabel('Finishing Position')
    plt.ylabel('Count')
    plt.legend()
    plt.show()


# The ten drivers with the most rows in race_results.
top_drivers = race_results['driver_id'].value_counts().head(10).index
_overlay_position_histograms(
    race_results, 'driver_id', top_drivers,
    'Finishing Position Distribution: Top 10 Drivers',
)

# Team distribution: the five constructors with the most rows in race_results.
top_teams = race_results['constructor_id'].value_counts().head(5).index
_overlay_position_histograms(
    race_results, 'constructor_id', top_teams,
    'Finishing Position Distribution: Top 5 Teams',
)

## Correlation Between Qualifying and Race Positions

Explore the relationship between qualifying position and final race result.

In [None]:
# Correlation Analysis: Qualifying vs Race Position
# Inner-join race results with qualifying results for the same race, driver and
# constructor; entries missing from either table are dropped.
join_keys = ["race_id", "driver_id", "constructor_id"]
merged = race_results.merge(qualifying_results, on=join_keys, suffixes=("_race", "_qual"))

# 'position' is plotted as the qualifying position, 'final_position' as the race result.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=merged, x='position', y='final_position')
plt.title('Qualifying Position vs Final Race Position')
plt.xlabel('Qualifying Position')
plt.ylabel('Final Race Position')
plt.show()

# Pairwise correlation (pandas default method) between the two position columns.
corr_matrix = merged[['position', 'final_position']].corr()
corr = corr_matrix.loc['position', 'final_position']
print(f"Correlation between qualifying and race position: {corr:.2f}")

## Impact of Starting Position on Final Results

Analyze how grid position affects race outcomes.

In [None]:
# Impact of Grid Position on Final Results
# One box per grid slot: the spread of finishing positions reached from that slot.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=race_results, x='grid_position', y='final_position', ax=ax)
ax.set_title('Grid Position vs Final Race Position')
ax.set_xlabel('Grid Position')
ax.set_ylabel('Final Race Position')
plt.show()

## DNF Rates by Team and Engine Manufacturer

Analyze the frequency of DNFs (Did Not Finish) by team and engine supplier.

In [None]:
# DNF Rates by Team
# A result counts as a DNF when its status text matches any of these keywords
# (case-insensitive regex). na=False treats a missing status as "not a DNF";
# without it str.contains returns NaN for those rows, which leaves an
# object-dtype column whose group means are unreliable.
race_results['dnf'] = race_results['status'].str.contains(
    'DNF|Retired|Accident|Mechanical', case=False, na=False
)

# Mean of a boolean column = share of entries flagged as DNF, worst team first.
dnf_by_team = race_results.groupby('constructor_id')['dnf'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 5))
dnf_by_team.plot(kind='bar', color='red')
plt.title('DNF Rate by Team')
plt.ylabel('DNF Rate')
plt.xlabel('Team')
plt.show()

# DNF Rates by Engine Manufacturer (if available)
# Skipped silently when race_results has no 'engine_manufacturer' column.
if 'engine_manufacturer' in race_results.columns:
    dnf_by_engine = race_results.groupby('engine_manufacturer')['dnf'].mean().sort_values(ascending=False)
    plt.figure(figsize=(10, 5))
    dnf_by_engine.plot(kind='bar', color='orange')
    plt.title('DNF Rate by Engine Manufacturer')
    plt.ylabel('DNF Rate')
    plt.xlabel('Engine Manufacturer')
    plt.show()

## Performance Trends Over Seasons

Visualize how driver and team performance evolves across multiple seasons.

In [None]:
# Team Performance Trends Over Seasons
# Total points per constructor per season: one row per (season, constructor_id).
team_points = (
    race_results
    .groupby(['season', 'constructor_id'], as_index=False)['points']
    .sum()
)

plt.figure(figsize=(14, 7))
# sort=False keeps the constructors in order of first appearance in team_points.
for team, team_data in team_points.groupby('constructor_id', sort=False):
    plt.plot(team_data['season'], team_data['points'], label=team)
plt.title('Team Performance Evolution Over Seasons')
plt.xlabel('Season')
plt.ylabel('Total Points')
plt.legend()
plt.show()

## Home Advantage Analysis

Analyze whether drivers perform better at their home Grand Prix.

In [None]:
# Home Advantage Analysis
# Requires a 'country' column on races and a 'driver_nationality' column on
# race_results; the whole analysis is skipped when either is missing.
has_home_columns = 'country' in races.columns and 'driver_nationality' in race_results.columns
if has_home_columns:
    race_countries = races[['race_id', 'country']]
    merged_home = race_results.merge(race_countries, on='race_id')

    # NOTE(review): exact string equality. This only flags home races if both
    # columns use the same vocabulary (e.g. "Italy"/"Italy", not "Italy"/"Italian")
    # - confirm against the data before trusting the comparison.
    merged_home['is_home'] = merged_home['country'].eq(merged_home['driver_nationality'])

    home_perf = merged_home.groupby('is_home')['final_position'].mean()
    print('Average finishing position (home vs non-home):')
    print(home_perf)

    ax = sns.boxplot(data=merged_home, x='is_home', y='final_position')
    ax.set_title('Home Advantage: Finishing Position')
    ax.set_xlabel('Home Race')
    ax.set_ylabel('Finishing Position')
    plt.show()

## Weather Impact on Race Outcomes

Analyze how different weather conditions affect race results and DNFs.

In [None]:
# Weather Impact Analysis
# Runs only when races has a 'weather_conditions' column. Relies on the 'dnf'
# column that the DNF-rate cell adds to race_results.
if 'weather_conditions' in races.columns:
    race_weather = races[['race_id', 'weather_conditions']]
    merged_weather = race_results.merge(race_weather, on='race_id')
    by_weather = merged_weather.groupby('weather_conditions')

    # DNF share per condition (highest first) and mean finishing position
    # per condition (lowest first).
    weather_dnf = by_weather['dnf'].mean().sort_values(ascending=False)
    weather_finish = by_weather['final_position'].mean().sort_values()

    bar_charts = [
        (weather_dnf, 'blue', 'DNF Rate by Weather Condition', 'DNF Rate'),
        (weather_finish, 'green', 'Average Finishing Position by Weather Condition',
         'Average Finishing Position'),
    ]
    for series, bar_color, chart_title, y_label in bar_charts:
        plt.figure(figsize=(10, 5))
        series.plot(kind='bar', color=bar_color)
        plt.title(chart_title)
        plt.ylabel(y_label)
        plt.xlabel('Weather Condition')
        plt.show()

## Safety Car Impact on Race Results

Analyze how safety car deployments affect race outcomes and position changes.

In [None]:
# Safety Car Impact Analysis
# Runs only when races has a 'safety_cars' column.
if 'safety_cars' in races.columns:
    race_safety = races[['race_id', 'safety_cars']]
    merged_safety = race_results.merge(race_safety, on='race_id')

    # Mean finishing position for each distinct 'safety_cars' value.
    safety_finish = merged_safety.groupby('safety_cars')['final_position'].mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    safety_finish.plot.bar(color='purple', ax=ax)
    ax.set_title('Average Finishing Position by Number of Safety Cars')
    ax.set_ylabel('Average Finishing Position')
    ax.set_xlabel('Number of Safety Cars')
    plt.show()

## Advanced Visualizations

Heatmaps, interactive time series, scatter plots, box plots, and Sankey diagrams to explore F1 data relationships.

In [None]:
# Heatmap: Driver Performance by Circuit
# Rows are drivers, columns are circuits, cells hold the mean finishing position;
# driver/circuit pairs with no results stay empty in the heatmap.
pivot = pd.pivot_table(
    race_results,
    index='driver_id',
    columns='circuit_id',
    values='final_position',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(pivot, cmap='YlGnBu', annot=False, ax=ax)
ax.set_title('Driver Performance by Circuit (Average Finishing Position)')
ax.set_xlabel('Circuit')
ax.set_ylabel('Driver')
plt.show()

In [None]:
# Interactive Time Series: Team Performance Evolution
# Uses the team_points frame built in the "Team Performance Trends" cell:
# one line per constructor, total points per season.
import plotly.express as px

fig = px.line(
    team_points,
    x='season',
    y='points',
    color='constructor_id',
    title='Team Performance Evolution Over Seasons',
)
fig.show()

In [None]:
# Scatter Plot: Qualifying vs Race Pace
# Uses the merged qualifying/race frame from the correlation cell; one colour per driver.
fig = px.scatter(
    merged,
    x='position',
    y='final_position',
    color='driver_id',
    title='Qualifying vs Race Position',
)
fig.show()

In [None]:
# Box Plots: Consistency Metrics
# Standard deviation of finishing position per driver, smallest first.
per_driver_std = race_results.groupby('driver_id')['final_position'].std()
consistency = per_driver_std.sort_values()

# A single box summarising how those per-driver standard deviations are distributed.
fig, ax = plt.subplots(figsize=(12, 6))
consistency.plot.box(ax=ax)
ax.set_title('Driver Consistency (Std Dev of Finishing Position)')
ax.set_ylabel('Standard Deviation')
plt.show()

In [None]:
# Sankey Diagram: Position Changes
import plotly.graph_objects as go

# Example: Use merged qualifying and race positions for a single race.
# Take the first non-missing race_id: indexing .iloc[0] on an empty join would
# raise IndexError, and a NaN id would never equal itself and select no rows.
available_race_ids = merged['race_id'].dropna()
if available_race_ids.empty:
    print("No merged qualifying/race rows available; skipping Sankey diagram.")
else:
    sample_race = merged[merged['race_id'] == available_race_ids.iloc[0]]
    n_entries = len(sample_race)

    # Nodes 0..n-1 are qualifying slots ("Q.."), nodes n..2n-1 are race results
    # ("R.."); entry i contributes one unit-width link from node i to node n+i.
    labels = [f"Q{q}" for q in sample_race['position']] + [f"R{r}" for r in sample_race['final_position']]
    source = list(range(n_entries))
    target = list(range(n_entries, 2 * n_entries))
    value = [1] * n_entries

    fig = go.Figure(data=[go.Sankey(
        node=dict(label=labels),
        link=dict(source=source, target=target, value=value)
    )])
    fig.update_layout(title_text="Position Changes: Qualifying to Race", font_size=10)
    fig.show()

## Statistical Testing

Statistical tests for differences between teammates, track effects, momentum (hot streaks), and betting market efficiency.

In [None]:
# Teammate Comparison: Paired t-test
from scipy.stats import ttest_rel

# Example: Compare two teammates' finishing positions
team = 'red_bull'
team_results = race_results[race_results['constructor_id'] == team]
teammates = team_results['driver_id'].unique()

if len(teammates) == 2:
    # ttest_rel pairs observations by position and needs equal-length samples,
    # so align the two drivers on race_id and keep only races where both have
    # a finishing position. Comparing the raw per-driver columns would fail on
    # unequal lengths or silently pair results from different races.
    paired = (
        team_results
        .pivot_table(index='race_id', columns='driver_id', values='final_position', aggfunc='mean')
        .reindex(columns=teammates)  # keep both columns even if one is all-missing
        .dropna()
    )
    d1 = paired[teammates[0]]
    d2 = paired[teammates[1]]
    if len(paired) >= 2:
        t_stat, p_val = ttest_rel(d1, d2)
        print(f"Teammate comparison ({teammates[0]} vs {teammates[1]}): t={t_stat:.2f}, p={p_val:.3f}")
    else:
        print(f"Paired t-test skipped: fewer than 2 races where both {team} drivers have a finishing position.")
else:
    # The comparison is only defined for exactly two drivers; say so instead of
    # producing no output at all.
    print(f"Paired t-test skipped: expected exactly 2 drivers for {team}, found {len(teammates)}.")

In [None]:
# Track Effect: ANOVA
from scipy.stats import f_oneway

# One sample of finishing positions per circuit. Missing positions are dropped
# first (a single NaN would make the F statistic and p-value NaN), and only
# circuits with more than 10 remaining results are kept.
track_groups = []
for _, group in race_results.groupby('circuit_id'):
    positions = group['final_position'].dropna().values
    if len(positions) > 10:
        track_groups.append(positions)

# ANOVA needs at least two groups to compare.
if len(track_groups) > 1:
    f_stat, p_val = f_oneway(*track_groups)
    print(f"Track effect ANOVA: F={f_stat:.2f}, p={p_val:.3f}")

In [None]:
# Momentum Effect: Hot Streaks
# Previous finishing position of the same driver. shift(1) follows row order,
# so order the rows by season first; the stable sort keeps the stored order
# within a season. The result is assigned back by index, so race_results
# itself is not reordered.
# NOTE(review): assumes rows within a season are stored in race order - confirm,
# or sort on a round/date column if one is available.
season_ordered = race_results.sort_values('season', kind='mergesort')
race_results['prev_position'] = season_ordered.groupby('driver_id')['final_position'].shift(1)

# True when the driver finished in a better (numerically lower) position than last time.
race_results['momentum'] = race_results['final_position'] < race_results['prev_position']

# A comparison against NaN is always False, so a driver's first race (no
# previous result) or a missing finishing position would count as "did not
# improve" and pull the rate down. Average only over rows where both exist.
has_both_positions = race_results['final_position'].notna() & race_results['prev_position'].notna()
momentum_rate = race_results[has_both_positions].groupby('driver_id')['momentum'].mean()

plt.figure(figsize=(12, 6))
momentum_rate.plot(kind='bar', color='teal')
plt.title('Driver Momentum Rate (Improving Position)')
plt.ylabel('Momentum Rate')
plt.xlabel('Driver')
plt.show()

In [None]:
# Betting Market Efficiency (Example)
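# If betting odds data is available, compare predicted probabilities to actual outcomes.
# NOTE: the Brier score compares a predicted probability with a binary outcome
# (e.g. 1 if the driver won, else 0), so the first argument below must be such a
# 0/1 column rather than raw points.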
# odds_df = pd.read_csv('../data/features/betting_odds.csv')
# merged_odds = pd.merge(race_results, odds_df, on=['race_id', 'driver_id'])
# from sklearn.metrics import brier_score_loss
# brier = brier_score_loss(merged_odds['actual_points'], merged_odds['predicted_prob'])
# print(f'Brier score for betting market efficiency: {brier:.3f}')