# Predicting Podium Results and Points Results in the F1 Data.

In [None]:
# !pip install --quiet --upgrade pip

In [None]:
# !pip install pandas scikit-learn matplotlib seaborn xgboost lightgbm --quiet

In [None]:
# pip install --upgrade --force-reinstall scipy

In [None]:
# pip install --upgrade pandas --quiet

In [None]:
# pip install --upgrade numpy matplotlib seaborn

In [None]:
# !pip install category_encoders --quiet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

In [None]:
# Load the F1 race dataset from a CSV file located relative to the current
# working directory; `df` is the untouched source frame used by later cells.
df= pd.read_csv('F1 Races 2020-2024.csv')

In [None]:
# Preview the first rows (DataFrame.head defaults to 5 rows).
df.head()

In [None]:
# Preview a larger sample: the first 25 rows.
df.head(25)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Apply the "whitegrid" seaborn theme to every plot drawn from here on.
# `set_theme` is the function that `sns.set` forwards to.
sns.set_theme(style="whitegrid")

In [None]:
# ===============================================
# Step 3: Distribution of Target Variables
# ===============================================
# Two side-by-side panels: counts of the "Top 3 Finish" flag (left) and a
# histogram of the "points" column (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot Top 3 Finish distribution.
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated for
# removal; mapping the x variable to `hue` and hiding the redundant legend is
# the documented replacement (requires seaborn >= 0.13).
sns.countplot(
    x="Top 3 Finish",
    hue="Top 3 Finish",
    data=df,
    ax=axes[0],
    palette="Set2",
    legend=False,
)
axes[0].set_title("Distribution of Podium Finishes (Top 3)")
axes[0].set_xlabel("Top 3 Finish (0 = No, 1 = Yes)")
axes[0].set_ylabel("Count")

# Plot Points distribution (30 bins with a kernel density overlay).
sns.histplot(df["points"], bins=30, ax=axes[1], kde=True, color="skyblue")
axes[1].set_title("Distribution of Race Points")
axes[1].set_xlabel("Points")
axes[1].set_ylabel("Frequency")

plt.tight_layout()
plt.show()

# ===============================================
# Step 4: Correlation Heatmap
# ===============================================
# Pairwise correlations between the numeric columns only; non-numeric
# columns are excluded by `numeric_only=True`.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(14, 10))
# `center=0` anchors the midpoint of the diverging colormap at zero.
sns.heatmap(data=corr, center=0, cmap="coolwarm")
plt.title(label="Feature Correlation Heatmap", fontsize=14)
plt.show()

# ===============================================
# Step 5: Grid Position vs Podium Probability
# ===============================================
plt.figure(figsize=(8, 6))
# Bar height is barplot's default estimator (the mean) of "Top 3 Finish" for
# each "grid" value, i.e. the observed podium rate when the flag is 0/1.
# - `errorbar=None` replaces the deprecated `ci=None` (seaborn >= 0.12).
# - `palette` without `hue` is deprecated in seaborn 0.13; assigning the x
#   variable to `hue` and hiding the legend is the documented replacement
#   (requires seaborn >= 0.13).
sns.barplot(
    x="grid",
    y="Top 3 Finish",
    hue="grid",
    data=df,
    errorbar=None,
    palette="Blues_d",
    legend=False,
)
plt.title("Podium Probability by Starting Grid Position")
plt.xlabel("Starting Grid Position")
plt.ylabel("Probability of Podium Finish")
plt.show()

# ===============================================
# Step 6: Weighted Podium Probability vs Actual Top 3 Finish
# ===============================================
plt.figure(figsize=(8, 6))
# Compare the spread of "Weighted_Top_3_Probability" between rows with and
# without a podium finish.
# `palette` without `hue` is deprecated in seaborn 0.13; assigning the x
# variable to `hue` and hiding the legend is the documented replacement
# (requires seaborn >= 0.13).
sns.boxplot(
    x="Top 3 Finish",
    y="Weighted_Top_3_Probability",
    hue="Top 3 Finish",
    data=df,
    palette="Set3",
    legend=False,
)
plt.title("Weighted Podium Probability vs Actual Top 3 Finish")
plt.xlabel("Top 3 Finish (0 = No, 1 = Yes)")
plt.ylabel("Weighted Podium Probability")
plt.show()


In [None]:
# Pure-matplotlib version of the correlation heatmap.
plt.figure(figsize=(14, 10))

# Compute correlation between numeric columns only
corr = df.corr(numeric_only=True)

# Create heatmap.
# Correlations lie in [-1, 1]; fixing the colour limits to that range keeps
# the neutral midpoint of the diverging "coolwarm" map at exactly 0, matching
# the zero-centred seaborn heatmap. Without explicit limits imshow scales the
# colours to the data's min/max, so the midpoint would drift away from 0.
im = plt.imshow(
    corr,
    cmap="coolwarm",
    interpolation="nearest",
    aspect="auto",
    vmin=-1,
    vmax=1,
)
plt.colorbar(im)

# Add ticks: one per column, labelled with the column name
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)

# Title
plt.title("Feature Correlation Heatmap (Matplotlib)", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Work on an independent deep copy so the original `df` stays untouched.
dfr = df.copy(deep=True)

In [None]:
# Print the column names, dtypes and non-null counts of the working copy.
dfr.info()

In [None]:
# Preview the first 10 rows of the working copy.
dfr.head(10)

In [None]:
# Columns removed from the working copy: 'points' and 'Top 3 Finish'
# (plotted as the target variables earlier in this notebook) together with
# the 'date' and 'raceId' columns.
drop_cols = ['points', 'Top 3 Finish', 'date', 'raceId']
dfr = dfr.drop(columns=drop_cols)

In [None]:
# Preview the first 10 rows of `dfr` again.
dfr.head(10)

In [None]:
# Count missing values per column. Note this inspects the original frame
# `df`, not the working copy `dfr`.
null_counts = df.isnull().sum()

print("Null values per column:", null_counts, sep="\n")

# Keep only the columns that actually contain at least one missing value.
print(
    "\nColumns with missing values:",
    null_counts.loc[null_counts > 0],
    sep="\n",
)

In [None]:
# List the columns of `dfr`, one per line (iterating a DataFrame yields its
# column labels).
for col in dfr:
    print(col)


In [None]:
# Regression target for the encoder. `y_reg` was not defined in any earlier
# cell, so running the notebook top to bottom raised NameError here. The
# 'points' column was dropped from `dfr`, so it is taken from the original
# frame `df`, whose rows are still aligned with `dfr`.
y_reg = df['points']

# Initialize target encoder for high-cardinality features
target_enc = ce.TargetEncoder(cols=['driverId', 'constructorId'])

# Fit the encoder and transform the feature frame in one step.
# NOTE(review): this fits on the whole dataset, not on a training split. If a
# train/test split follows, fit on the training rows only and call
# `transform` on the test rows to avoid target leakage - confirm against the
# later cells.
dfr_encoded = target_enc.fit_transform(dfr, y_reg)