In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset with necessary convertions
df = pd.read_csv("winners_f1_1950_2025_v2.csv")
df['date'] = pd.to_datetime(df['date'])
df['time'] = pd.to_timedelta(df['time'], errors='coerce')
df['time_segundos'] = df['time'].dt.total_seconds()

# Set top 10
top_10_pilotos = df['winner_name'].value_counts().head(10).index.tolist()
df_top_10 = df[df['winner_name'].isin(top_10_pilotos)]

print("Dataframe info:")
df.info()

# Treating nulls with medium value
df['time'] = df['time'].fillna(pd.Timedelta(seconds=0))

# Create "seconds" column
df['time_segundos'] = df['time'].dt.total_seconds()

# Checking nulls
print(df.isnull().sum())

# Top 10
top_10_pilotos = df['winner_name'].value_counts().head(10).index.tolist()
df_top_10 = df[df['winner_name'].isin(top_10_pilotos)].copy()

Dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1142 entries, 0 to 1141
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   date           1142 non-null   datetime64[ns] 
 1   continent      1142 non-null   object         
 2   grand_prix     1142 non-null   object         
 3   circuit        1142 non-null   object         
 4   winner_name    1142 non-null   object         
 5   team           1142 non-null   object         
 6   time           1140 non-null   timedelta64[ns]
 7   laps           1142 non-null   float64        
 8   year           1142 non-null   int64          
 9   time_segundos  1140 non-null   float64        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5), timedelta64[ns](1)
memory usage: 89.3+ KB
date             0
continent        0
grand_prix       0
circuit          0
winner_name      0
team             0
time             0
laps             0

In [None]:
# In this project, I performed comparative analyses to determine the best F1 Driver.
# To do this, I established some key criteria, ran the analyses in this section, and began planning a scoring system for the next step.

In [12]:
# --- Analysis 1: Versatility in Teams and Circuits ---
# This analysis evaluates a driver's ability to win with different teams and at various circuits.

# Create an empty list to store the results for each driver.
results_list = []

# Iterate through each of the top 10 winning drivers.
for driver in top_10_pilotos:
    # Create a new DataFrame containing only the wins for the current driver.
    driver_df = df_top_10[df_top_10['winner_name'] == driver]

    # Count the number of teams with which the driver achieved at least 5 wins.
    # 'value_counts()' counts the number of wins per team.
    team_counts = driver_df['team'].value_counts()
    # 'len(team_counts[team_counts >= 5])' filters for teams with 5 or more wins
    # and then counts how many such teams exist.
    unique_teams_5_wins = len(team_counts[team_counts >= 5])

    # Count the number of circuits where the driver achieved at least 3 wins.
    # 'value_counts()' counts the number of wins per circuit.
    circuit_counts = driver_df['circuit'].value_counts()
    # 'len(circuit_counts[circuit_counts >= 3])' filters for circuits with 3 or more wins
    # and counts how many such circuits exist.
    unique_circuits_3_wins = len(circuit_counts[circuit_counts >= 3])

    # Append the calculated results for the current driver to our list.
    results_list.append({
        'winner_name': driver,
        'unique_teams_5_wins': unique_teams_5_wins,
        'unique_circuits_3_wins': unique_circuits_3_wins
    })

# Create a DataFrame from the results list.
# This converts our list of dictionaries into a structured DataFrame.
versatility_df = pd.DataFrame(results_list)

# Calculate the versatility score.
# The score is a simple sum of the number of unique teams and circuits.
versatility_df['versatility_score'] = versatility_df['unique_teams_5_wins'] + versatility_df['unique_circuits_3_wins']

# Sort the drivers based on their versatility score in descending order.
versatility_df = versatility_df.sort_values(by='versatility_score', ascending=False)

# Assign points based on the ranking.
# This creates a sequence of points from 10 down to 1.
versatility_df['points'] = range(10, 0, -1)

# Display the result of the versatility analysis.
print("Team and Circuit Versatility Analysis (Top 10):")
print(versatility_df)

Team and Circuit Versatility Analysis (Top 10):
          winner_name  unique_teams_5_wins  unique_circuits_3_wins  \
1  Michael Schumacher                    3                      16   
0      Lewis Hamilton                    2                      16   
2      Max Verstappen                    4                      12   
3    Sebastian Vettel                    3                      11   
4         Alain Prost                    5                       8   
5        Ayrton Senna                    2                       6   
9           Jim Clark                    2                       4   
6     Fernando Alonso                    2                       3   
8      Jackie Stewart                    2                       3   
7       Nigel Mansell                    2                       2   

   versatility_score  points  
1                 19      10  
0                 18       9  
2                 16       8  
3                 14       7  
4                 13      

In [13]:
# --- Analysis 2: Big Stage Wins Analysis ---
# This section focuses on victories at historically significant circuits.

# Define the list of important circuits.
important_circuits = [
    'Circuit de Spa Francorchamps',
    'Circuit de Monaco',
    'Autodromo Nazionale di Monza',
    'Autodromo José Carlos Pace',
    'Silverstone Circuit'
]

# Filter the DataFrame to include only wins at the important circuits and by the top 10 drivers.
# The `&` operator combines two conditions.
# '.copy()' is used to avoid a SettingWithCopyWarning in pandas.
df_important_wins = df[
    df['circuit'].isin(important_circuits) &
    df['winner_name'].isin(top_10_pilotos)
].copy()

# Group by driver and count the number of wins at these circuits.
# 'groupby('winner_name')' groups the data by driver.
# '.count()' counts the number of rows for each driver within the group.
# '.reset_index(name='important_wins')' turns the result into a new DataFrame with a column named 'important_wins'.
important_wins_count = df_important_wins.groupby('winner_name')['grand_prix'].count().reset_index()
important_wins_count.rename(columns={'grand_prix': 'important_wins'}, inplace=True) # A 'grand_prix' in the previous line is not needed, but this line ensures the column is properly named. It could also have been `size().reset_index(name='important_wins')`.

# Create a DataFrame to ensure all 10 top drivers are included in the analysis,
# even if they have no wins on the important circuits.
all_top_10_drivers_df = pd.DataFrame({'winner_name': top_10_pilotos})
# Merge the count of important wins with the full list of top 10 drivers.
# 'how='left'' ensures all drivers from the 'all_top_10_drivers_df' are kept.
# '.fillna(0)' replaces any 'NaN' (missing) values with 0, so drivers without important wins are not excluded.
final_ranking = pd.merge(all_top_10_drivers_df, important_wins_count, on='winner_name', how='left').fillna(0)

# Sort the ranking based on the number of important wins, from most to least.
final_ranking = final_ranking.sort_values(by='important_wins', ascending=False)

# Assign points based on the ranking.
# This creates a sequence of points from 10 down to 1.
final_ranking['points'] = range(10, 0, -1)

# Print the final result for this analysis.
print("Analysis of Great Moments (Victories in Important Circuits):")
print(final_ranking)

Analysis of Great Moments (Victories in Important Circuits):
          winner_name  important_wins  points
0      Lewis Hamilton              25      10
1  Michael Schumacher              23       9
5        Ayrton Senna              17       8
2      Max Verstappen              13       7
3    Sebastian Vettel              13       6
4         Alain Prost              13       5
9           Jim Clark               8       4
8      Jackie Stewart               7       3
7       Nigel Mansell               6       2
6     Fernando Alonso               6       1


In [14]:
# --- Analysis 3: Circuit Dominance Analysis ---
# This analysis identifies which drivers are the 'top winner' on the most circuits.

# Count the number of victories for each driver on each circuit.
# 'groupby()' groups the data by both circuit and winner.
# '.size()' counts the number of rows (wins) for each group.
wins_per_circuit = df.groupby(['circuit', 'winner_name']).size().reset_index(name='wins')

# Find the maximum number of wins for each circuit.
# This identifies the record holder(s) for each track.
max_wins_per_circuit = wins_per_circuit.groupby('circuit')['wins'].max().reset_index(name='max_wins')

# Join the tables to identify the drivers who hold the record for the most wins per circuit.
top_winners_df = pd.merge(wins_per_circuit, max_wins_per_circuit, on='circuit')

# Filter the DataFrame to get only the drivers who equal the maximum number of wins for each circuit (including ties).
top_winners_df = top_winners_df[top_winners_df['wins'] == top_winners_df['max_wins']]

# Count how many circuits each driver is a 'top winner' on.
circuits_top_winner_count = top_winners_df.groupby('winner_name').size().reset_index(name='top_winner_circuits')

# Get the list of the top 10 overall winners to ensure they are all included in this ranking.
top_10_overall_pilots = df['winner_name'].value_counts().head(10).index.tolist()

# Create a DataFrame with the top 10 pilots.
all_top_10_pilots_df = pd.DataFrame({'winner_name': top_10_overall_pilots})

# Merge the results with the list of the top 10, filling with 0 for drivers who are not 'top winners' on any circuit.
# 'how='left'' ensures that all drivers from the 'all_top_10_pilots_df' are kept.
final_ranking = pd.merge(all_top_10_pilots_df, circuits_top_winner_count, on='winner_name', how='left').fillna(0)

# Rank the results based on the number of circuits a driver dominates.
final_ranking = final_ranking.sort_values(by='top_winner_circuits', ascending=False)

# --- START OF THE CORRECTION ---
# Assign points based on the ranking, correctly handling ties.
# 'rank(method='dense')' gives the same rank to tied values (e.g., two drivers tied for 1st both get rank 1).
final_ranking['rank'] = final_ranking['top_winner_circuits'].rank(method='dense', ascending=False).astype(int)
# This mapping assigns points based on the dense rank.
# For example, a dense rank of 1 maps to 10 points (11-1), 2 maps to 9 points (11-2), etc.
points_map = {rank: 11 - rank for rank in final_ranking['rank'].unique()}
final_ranking['points'] = final_ranking['rank'].map(points_map)
# --- END OF THE CORRECTION ---

# Print the final result for this analysis.
print("Driver Analysis - Biggest Track Winner:")
print(final_ranking)

Driver Analysis - Biggest Track Winner:
          winner_name  top_winner_circuits  rank  points
1  Michael Schumacher                   13     1      10
0      Lewis Hamilton                   12     2       9
2      Max Verstappen                    6     3       8
4         Alain Prost                    6     3       8
3    Sebastian Vettel                    5     4       7
5        Ayrton Senna                    5     4       7
9           Jim Clark                    3     5       6
8      Jackie Stewart                    3     5       6
7       Nigel Mansell                    1     6       5
6     Fernando Alonso                    1     6       5


In [None]:
# --- Analysis 4: Longevity Analysis (8+ Wins per Year Scoring) ---
# This analysis measures a driver's career longevity by identifying periods of sustained dominance.

# Count the number of wins for each driver per year.
# 'groupby()' groups the data by 'winner_name' and 'year'.
# '.size()' counts the number of wins for each unique combination.
wins_per_year = df.groupby(['winner_name', 'year']).size().reset_index(name='wins')

# Filter for the years where a pilot had at least 8 wins.
# This identifies the years of peak performance.
longevity_years = wins_per_year[wins_per_year['wins'] >= 8]

# Calculate the longevity period for each driver.
# We group by driver and find the first and last year with 8+ wins.
longevity_period = (
    longevity_years.groupby('winner_name')['year']
    # 'agg()' is used to apply multiple aggregation functions at once.
    .agg(min_year='min', max_year='max')
    .reset_index()
)
# Calculate the total number of years in the longevity period.
longevity_period['longevity_period_years'] = longevity_period['max_year'] - longevity_period['min_year'] + 1

# Get the list of the top 10 overall pilots to ensure everyone is included in the ranking.
top_10_overall_pilots = df['winner_name'].value_counts().head(10).index.tolist()
all_top_10_pilots_df = pd.DataFrame({'winner_name': top_10_overall_pilots})

# Merge the longevity results with the full list of top 10 pilots, filling with 0 where there are no longevity years.
# 'how='left'' ensures that all top 10 pilots are in the final DataFrame.
final_ranking = pd.merge(all_top_10_pilots_df, longevity_period[['winner_name', 'longevity_period_years']], on='winner_name', how='left').fillna(0)

# Sort the results by the longevity period, from longest to shortest.
final_ranking = final_ranking.sort_values(by='longevity_period_years', ascending=False)

# Assign points, handling ties correctly.
# 'rank(method='dense')' assigns a rank to each pilot.
final_ranking['rank'] = final_ranking['longevity_period_years'].rank(method='dense', ascending=False).astype(int)
# Get a unique list of ranks to create the points map.
unique_ranks = final_ranking['rank'].unique()
# Create a dictionary to map each rank to a point value (e.g., Rank 1 -> 10 points).
points_map = {rank: 11 - rank for rank in unique_ranks}
# Apply the points map to the DataFrame.
final_ranking['points'] = final_ranking['rank'].map(points_map)

# Print the final result for this analysis.
print("Longevity Analysis (8+ Wins per Year Score):")
print(final_ranking[['winner_name', 'longevity_period_years', 'points']])

Análise de Longevidade (Pontuação de 8+ Vitórias por Ano):
          winner_name  longevity_period_years  points
1  Michael Schumacher                    11.0      10
0      Lewis Hamilton                     8.0       9
2      Max Verstappen                     4.0       8
3    Sebastian Vettel                     3.0       7
5        Ayrton Senna                     1.0       6
7       Nigel Mansell                     1.0       6
4         Alain Prost                     0.0       5
6     Fernando Alonso                     0.0       5
8      Jackie Stewart                     0.0       5
9           Jim Clark                     0.0       5


In [None]:
# --- Analysis 5: Efficiency in Wins Analysis ---
# This analysis measures how "efficiently" a driver wins a race, based on their average lap time during winning races.

# Calculate the average time per lap for each victory.
# The 'time_segundos' column (total race time in seconds) is divided by the number of laps.
df_top_10['avg_time_per_lap'] = df_top_10['time_segundos'] / df_top_10['laps']

# Calculate the overall average time per lap for each driver.
# 'groupby('winner_name')' groups the data by driver.
# '.mean()' calculates the average of the 'avg_time_per_lap' column for each driver.
# '.reset_index()' converts the grouped result back into a DataFrame.
efficiency_ranking = df_top_10.groupby('winner_name')['avg_time_per_lap'].mean().reset_index()
# Rename the column for clarity.
efficiency_ranking.rename(columns={'avg_time_per_lap': 'avg_time_per_lap_s'}, inplace=True)

# Sort the results by average time per lap, from lowest (fastest) to highest (slowest).
# A lower time indicates greater efficiency.
efficiency_ranking = efficiency_ranking.sort_values(by='avg_time_per_lap_s', ascending=True)

# Assign points, handling ties correctly.
# We use 'rank(method='dense', ascending=True)' because a lower time (ascending=True) is better.
efficiency_ranking['rank'] = efficiency_ranking['avg_time_per_lap_s'].rank(method='dense', ascending=True).astype(int)
# Get a unique list of ranks to create the points map.
unique_ranks = efficiency_ranking['rank'].unique()
# Create a dictionary to map each rank to a point value (e.g., Rank 1 -> 10 points).
points_map = {rank: 11 - rank for rank in unique_ranks}
# Apply the points map to the DataFrame.
efficiency_ranking['points'] = efficiency_ranking['rank'].map(points_map)

# Print the final result for this analysis.
print("Win Efficiency Analysis:")
print(efficiency_ranking)

Análise de Eficiência em Vitórias:
          winner_name  avg_time_per_lap_s  rank  points
8       Nigel Mansell           86.566912     1      10
0         Alain Prost           89.597847     2       9
7  Michael Schumacher           89.990717     3       8
6      Max Verstappen           96.987125     4       7
1        Ayrton Senna           98.023289     5       6
2     Fernando Alonso           99.057464     6       5
5      Lewis Hamilton           99.605794     7       4
9    Sebastian Vettel           99.742728     8       3
3      Jackie Stewart          140.611066     9       2
4           Jim Clark          142.966054    10       1


In [None]:
# --- Analysis 6: Longest Winning Streak Analysis ---
# This analysis identifies the longest consecutive winning streak for each driver.

# Sort the entire DataFrame by date to identify sequences of wins.
df_sorted = df.sort_values(by='date').reset_index(drop=True)

# Identify the start of a new streak (when the winner changes).
# 'df_sorted['winner_name'] != df_sorted['winner_name'].shift(1)' compares each driver with the previous one.
# It returns 'True' when a new driver wins.
# '.cumsum()' creates a unique group ID for each consecutive streak.
df_sorted['is_new_streak'] = (df_sorted['winner_name'] != df_sorted['winner_name'].shift(1)).cumsum()

# Count the length of each winning streak.
# We group by the winner and the streak ID and count the number of wins in each streak.
winning_streaks = df_sorted.groupby(['winner_name', 'is_new_streak']).size().reset_index(name='streak_length')

# Find the longest streak for each driver.
# We group by the driver and find the maximum streak length.
longest_streaks = winning_streaks.groupby('winner_name')['streak_length'].max().reset_index()

# Get the list of the top 10 overall drivers from the complete dataset.
top_10_pilotos = df['winner_name'].value_counts().head(10).index.tolist()

# Filter the streak analysis to include only the top 10 drivers.
final_ranking = longest_streaks[longest_streaks['winner_name'].isin(top_10_pilotos)].copy()

# Sort the results from the longest streak to the shortest.
final_ranking = final_ranking.sort_values(by='streak_length', ascending=False)

# Assign points, handling ties.
# 'rank(method='dense')' gives the same rank to drivers with the same streak length.
final_ranking['rank'] = final_ranking['streak_length'].rank(method='dense', ascending=False).astype(int)
# Get a unique list of ranks to create the points map.
unique_ranks = final_ranking['rank'].unique()
# Create a dictionary to map each rank to a point value (e.g., Rank 1 -> 10 points).
points_map = {rank: 11 - rank for rank in unique_ranks}
# Apply the points map to the DataFrame.
final_ranking['points'] = final_ranking['rank'].map(points_map)

# Print the final result for this analysis.
print("Longest Winning Streak Analysis:")
print(final_ranking[['winner_name', 'streak_length', 'points']])

Análise de Maior Sequência de Vitórias:
            winner_name  streak_length  points
74       Max Verstappen             10      10
106    Sebastian Vettel              9       9
75   Michael Schumacher              7       8
81        Nigel Mansell              5       7
48            Jim Clark              5       7
66       Lewis Hamilton              5       7
4          Ayrton Senna              4       6
0           Alain Prost              4       6
24      Fernando Alonso              4       6
36       Jackie Stewart              3       5
