In [None]:
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


This model will predict a racers poll position or laptime for a driver

## Building a Dataset to predict lap times

In [None]:
laptimes = pd.read_csv('/content/lap_times.csv')
laptimes.head(15)

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,1:38.109,98109
1,841,20,2,1,1:33.006,93006
2,841,20,3,1,1:32.713,92713
3,841,20,4,1,1:32.803,92803
4,841,20,5,1,1:32.342,92342
5,841,20,6,1,1:32.605,92605
6,841,20,7,1,1:32.502,92502
7,841,20,8,1,1:32.537,92537
8,841,20,9,1,1:33.240,93240
9,841,20,10,1,1:32.572,92572


In [None]:
drivers = pd.read_csv('/content/drivers.csv')
drivers = drivers.drop(columns=['driverRef', 'number', 'code', 'dob', 'nationality', 'url'])
drivers.head()

Unnamed: 0,driverId,forename,surname
0,1,Lewis,Hamilton
1,2,Nick,Heidfeld
2,3,Nico,Rosberg
3,4,Fernando,Alonso
4,5,Heikki,Kovalainen


In [None]:
circuits = pd.read_csv('/content/circuits.csv')
circuits.head()

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


In [None]:
results = pd.read_csv('/content/results.csv')
results = results.drop(columns=['number', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'constructorId', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId'])
results.head(2)

Unnamed: 0,resultId,raceId,driverId,time,milliseconds
0,1,18,1,1:34:50.616,5690616
1,2,18,2,+5.478,5696094


In [None]:
races = pd.read_csv('/content/races.csv')
races = races.drop(columns=['name', 'date', 'time', 'url', 'fp1_date', 'fp1_time','fp2_date', 'fp2_time','fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'])
races.head(2)

Unnamed: 0,raceId,year,round,circuitId
0,1,2009,1,1
1,2,2009,2,2


In [None]:
laptimed = pd.merge(races, laptimes, on='raceId')
laptimed.head(2)

Unnamed: 0,raceId,year,round,circuitId,driverId,lap,position,time,milliseconds
0,1,2009,1,1,1,1,13,1:49.088,109088
1,1,2009,1,1,1,2,12,1:33.740,93740


In [None]:
laptimedrivers = laptimed.merge(drivers, on='driverId')
laptimed.head(10)

Unnamed: 0,raceId,year,round,circuitId,driverId,lap,position,time,milliseconds
0,1,2009,1,1,1,1,13,1:49.088,109088
1,1,2009,1,1,1,2,12,1:33.740,93740
2,1,2009,1,1,1,3,11,1:31.600,91600
3,1,2009,1,1,1,4,10,1:31.067,91067
4,1,2009,1,1,1,5,10,1:32.129,92129
5,1,2009,1,1,1,6,9,1:30.469,90469
6,1,2009,1,1,1,7,9,1:29.488,89488
7,1,2009,1,1,1,8,9,1:30.302,90302
8,1,2009,1,1,1,9,9,1:30.889,90889
9,1,2009,1,1,1,10,8,1:32.418,92418


In [None]:
laptimedrivers.dropna(inplace=True)

In [None]:
laptimeresults = laptimedrivers.merge(results, on='raceId', suffixes=('', '_y'))
laptimeresults = laptimeresults.drop(columns=['driverId_y', 'time_y', 'milliseconds_y', 'resultId'])
laptimeresults.drop_duplicates(inplace=True)
laptimeresults.head(5)

Unnamed: 0,raceId,year,round,circuitId,driverId,lap,position,time,milliseconds,forename,surname
0,1,2009,1,1,1,1,13,1:49.088,109088,Lewis,Hamilton
20,1,2009,1,1,1,2,12,1:33.740,93740,Lewis,Hamilton
40,1,2009,1,1,1,3,11,1:31.600,91600,Lewis,Hamilton
60,1,2009,1,1,1,4,10,1:31.067,91067,Lewis,Hamilton
80,1,2009,1,1,1,5,10,1:32.129,92129,Lewis,Hamilton


In [None]:
laptimeresults.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575029 entries, 0 to 12201284
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   raceId        575029 non-null  int64 
 1   year          575029 non-null  int64 
 2   round         575029 non-null  int64 
 3   circuitId     575029 non-null  int64 
 4   driverId      575029 non-null  int64 
 5   lap           575029 non-null  int64 
 6   position      575029 non-null  int64 
 7   time          575029 non-null  object
 8   milliseconds  575029 non-null  int64 
 9   forename      575029 non-null  object
 10  surname       575029 non-null  object
dtypes: int64(8), object(3)
memory usage: 52.6+ MB


In [None]:
laptimeresults.dropna(inplace=True)

In [None]:
laptimeresults.to_csv('laptimeresults.csv')

## Building a Model

In [None]:
# Run this if you're skipping data merging
laptimeresults = pd.read_csv('/content/laptimeresults.csv')
laptimeresults.head()

Unnamed: 0.1,Unnamed: 0,raceId,year,round,circuitId,driverId,lap,position,time,milliseconds,forename,surname
0,0,1,2009,1,1,1,1,13,1:49.088,109088,Lewis,Hamilton
1,20,1,2009,1,1,1,2,12,1:33.740,93740,Lewis,Hamilton
2,40,1,2009,1,1,1,3,11,1:31.600,91600,Lewis,Hamilton
3,60,1,2009,1,1,1,4,10,1:31.067,91067,Lewis,Hamilton
4,80,1,2009,1,1,1,5,10,1:32.129,92129,Lewis,Hamilton


In [None]:
features = ['raceId', 'driverId', 'circuitId']
target = 'milliseconds'

X = laptimeresults[features]
y = laptimeresults[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

In [None]:
model.predict(X_test)

array([92579.97285353, 98800.48860107, 96960.01340765, ...,
       93028.00273057, 91508.60488502, 98533.53098975])

In [None]:
model.score(X_test, y_test)

0.0018540963199581428

## Testing the Prediction Output

In [None]:
# Create a new data point for prediction
new_data = pd.DataFrame({'raceId': [1000], 'driverId': [1], 'circuitId': [1]})  # Replace with desired values

# Predict lap time
predicted_laptime = model.predict(new_data)

#To see the output, run the code.
print(predicted_laptime)

[101067.32355979]


In [None]:
def predict_laptime(driverId, circuitId, min_raceId=1, max_raceId = 1083):  # Updated to accept driverId, circuitId and raceId as input, the raceId defaults to the last race.
    """Predicts lap time for a given driver and circuit using the trained model."""
    raceId = random.randint(min_raceId, max_raceId)

    # Create input DataFrame
    input_data = pd.DataFrame({'raceId': [raceId], 'driverId': [driverId], 'circuitId': [circuitId]})

    # Predict lap time
    predicted_laptime_ms = model.predict(input_data)[0]

    # Convert milliseconds to minutes, seconds, milliseconds
    minutes, seconds, milliseconds = convert_ms_to_time(predicted_laptime_ms)

    # Get driver and circuit names
    driver_name = drivers[drivers['driverId'] == driverId]['forename'].iloc[0] + " " + drivers[drivers['driverId'] == driverId]['surname'].iloc[0]
    circuit_name = circuits[circuits['circuitId'] == circuitId]['name'].iloc[0]

    # Return the prediction as a formatted string
    return f"{driver_name}'s fastest lap on {circuit_name} will be {minutes:02d}:{seconds:02d}.{milliseconds:03d}."


def convert_ms_to_time(milliseconds):
    """Converts milliseconds to minutes, seconds, and milliseconds."""
    minutes = int(milliseconds // 60000)
    seconds = int((milliseconds % 60000) // 1000)
    milliseconds = int(milliseconds % 1000)
    return minutes, seconds, milliseconds

# Example usage:
# To see the output, run the code.
#driverId = 830  # Replace with actual driver ID
#circuitId = 24  # Replace with actual circuit ID
#predicted_time = predict_laptime(driverId, circuitId)
#print(f"Predicted lap time: {predicted_time}")

In [None]:
predict_laptime(driverId=18, circuitId=12)

"Jenson Button's fastest lap on Valencia Street Circuit will be 01:40.696."