In [1]:
# Import Libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Read the CSV into a DataFrame and preview its first rows
DATA_PATH = 'car_driving_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [3]:
# Dataset dimensions, displayed as a (rows, columns) tuple
df.shape

In [4]:
# Summary statistics for the dataset, using pandas' describe() with its default settings
df.describe()

In [5]:
# Percentage of missing entries in each column
missing_counts = df.isna().sum()
missing_counts * 100 / len(df)

In [6]:
# Visualize outliers: one horizontal boxplot per variable, stacked vertically.
# Each column is passed as `x=` explicitly. A bare positional Series is read as
# `x` by older seaborn releases but as `data` by newer ones, so the orientation
# of the boxes would otherwise depend on the installed seaborn version.
fig, axs = plt.subplots(3, figsize=(5, 5))
for ax, column in zip(axs, ['Speed', 'Distance', 'Risk']):
    sns.boxplot(x=df[column], ax=ax)
fig.tight_layout()
# Finish the figure here, like the other plotting cells, so later pyplot calls
# start on a fresh figure instead of drawing onto the last boxplot axes.
plt.show()

In [7]:
# Overlay Risk against each predictor on shared axes, one series per predictor
for column in ('Speed', 'Distance'):
    plt.scatter(df[column], df['Risk'], label=column)
plt.title('Speed and Distance vs Risk')
plt.xlabel('Speed / Distance')
plt.ylabel('Risk')
plt.legend()
plt.show()

In [8]:
# Risk against each predictor, drawn as side-by-side scatter panels
sns.pairplot(
    data=df,
    x_vars=['Speed', 'Distance'],
    y_vars='Risk',
    kind='scatter',
    height=4,
)
plt.show()

In [9]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are dropped first: DataFrame.corr() raises on them in
# pandas >= 2.0 instead of silently skipping them as older releases did.
# For an all-numeric frame the result is unchanged.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='YlGnBu')
plt.show()

In [10]:
# Features (Speed, Distance) and target (Risk) for the linear model.
# The model is fitted on every row of df; no rows are held out here.
X, y = df[['Speed', 'Distance']], df['Risk']

reg = LinearRegression()
reg.fit(X, y)

In [11]:
# Fitted coefficient per feature (in the column order of X) and the intercept term
coefficients, intercept = reg.coef_, reg.intercept_
print(f'Coefficients: {coefficients}, Intercept: {intercept}')

In [12]:
# Predict Risk for X, the same feature rows the model was fitted on, and keep
# the result alongside the inputs.
predictions = reg.predict(X)
# NOTE: this adds a column to `df` in place, so cells that read the whole frame
# will also see 'Predicted_Risk' if they are re-run after this cell.
df['Predicted_Risk'] = predictions
df.head()

In [13]:
# In-sample error: compare the stored predictions with the observed Risk values
actual, predicted = df['Risk'], df['Predicted_Risk']
mse = mean_squared_error(actual, predicted)
mae = mean_absolute_error(actual, predicted)
print(f'MSE: {mse}, MAE: {mae}')

In [14]:
# Visualize the predictions against Speed, on top of the observed data.
# Rows are ordered by Speed before the prediction curve is drawn: plt.plot joins
# points in the order given, so unsorted rows would produce back-and-forth
# segments instead of a readable curve.
# The predictions also depend on Distance, so the curve need not be a straight line.
ordered = df.sort_values('Speed')
plt.plot(ordered['Speed'], ordered['Predicted_Risk'], color='red', label='Regression Line')
plt.scatter(df['Speed'], df['Risk'], label='Actual Data')
plt.xlabel('Speed')
plt.ylabel('Risk')
plt.title('Speed vs Predicted Risk')
plt.legend()
plt.show()

In [15]:
# Coefficient of determination (R²) of the stored predictions against the observed Risk.
# r2_score is imported with the other sklearn.metrics helpers in the imports cell,
# so all of the notebook's dependencies are declared in one place.
r2 = r2_score(df['Risk'], df['Predicted_Risk'])
print(f'R² Score: {r2}')