## 5. Regression Modelling

#### 5.1 Linear Regression: Data Preparation


In [25]:
import os

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Location of the cleaned accidents CSV. The original absolute path is kept as the
# default so existing runs behave exactly as before; set the ACCIDENTS_CSV environment
# variable to point at the file when running on another machine.
ACCIDENTS_CSV = os.environ.get(
    "ACCIDENTS_CSV",
    "/Users/anshureddy/Desktop/dwproject/accidents_cleaned.csv",
)

accidents_df = pd.read_csv(ACCIDENTS_CSV)

# Show the available column names as the cell output.
accidents_df.columns.tolist()


['ID',
 'Start_Time',
 'End_Time',
 'State',
 'City',
 'Start_Lat',
 'Start_Lng',
 'Temperature(F)',
 'Weather_Condition',
 'Hour']

In [26]:
# Parse both timestamp columns; values that cannot be parsed become NaT.
for time_col in ('Start_Time', 'End_Time'):
    accidents_df[time_col] = pd.to_datetime(accidents_df[time_col], errors='coerce')

# Accident duration in minutes: elapsed seconds between start and end, divided by 60.
accidents_df['Duration'] = (
    accidents_df['End_Time']
    .sub(accidents_df['Start_Time'])
    .dt.total_seconds()
    / 60
)

# Column names used by the regression: four features plus the target.
feature_cols = ['Hour', 'Temperature(F)', 'Start_Lat', 'Start_Lng']
target_col = 'Duration'
relevant_cols = feature_cols + [target_col]

# Keep only the rows where every modelling column is present.
regression_df = accidents_df[relevant_cols].dropna()

# X holds the features, y the target.
X = regression_df[feature_cols]
y = regression_df[target_col]

#### 5.1 Linear Regression: Model Fitting and Output

In [27]:
# Ordinary least squares on the full prepared dataset (fit() hands back the estimator).
linear_model = LinearRegression().fit(X, y)
y_pred_linear = linear_model.predict(X)

# In-sample metrics: predictions are scored against the same rows used for fitting.
r2_linear = r2_score(y, y_pred_linear)
mse_linear = mean_squared_error(y, y_pred_linear)

# Report the metrics, then one coefficient per feature column.
print("5.1 Linear Regression: Predicting Duration (minutes)")
print(f"  Mean Squared Error (MSE): {mse_linear:.2f}")
print(f"  R-squared (R²): {r2_linear:.2f}")
print("  Coefficients:")
for idx, feature in enumerate(X.columns):
    coef = linear_model.coef_[idx]
    print(f"    {feature}: {coef:.4f}")

5.1 Linear Regression: Predicting Duration (minutes)
  Mean Squared Error (MSE): 176460760.65
  R-squared (R²): 0.00
  Coefficients:
    Hour: 6.5952
    Temperature(F): -5.2291
    Start_Lat: -55.0522
    Start_Lng: 0.3222


#### 5.2 Ridge vs Linear Regression: Model Fitting and Output


In [28]:
from sklearn.linear_model import Ridge

# Ridge regression (L2 penalty, alpha=1.0) fitted on the same X and y as the linear model.
# NOTE(review): in the recorded run the printed MSE, R² and coefficients are identical
# to the 5.1 linear-regression output, so this penalty has no visible effect at the
# printed precision — consider scaling the features or tuning alpha.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X, y)
y_pred_ridge = ridge_model.predict(X)

# In-sample metrics for the ridge fit.
r2_ridge = r2_score(y, y_pred_ridge)
mse_ridge = mean_squared_error(y, y_pred_ridge)

# Report the metrics, then one coefficient per feature column.
print("5.2 Ridge Regression: Predicting Duration (minutes)")
print(f"  Mean Squared Error (MSE): {mse_ridge:.2f}")
print(f"  R-squared (R²): {r2_ridge:.2f}")
print("  Coefficients:")
for idx, feature in enumerate(X.columns):
    coef = ridge_model.coef_[idx]
    print(f"    {feature}: {coef:.4f}")

5.2 Ridge Regression: Predicting Duration (minutes)
  Mean Squared Error (MSE): 176460760.65
  R-squared (R²): 0.00
  Coefficients:
    Hour: 6.5952
    Temperature(F): -5.2291
    Start_Lat: -55.0522
    Start_Lng: 0.3222


In [29]:
# Side-by-side recap of the metrics computed for the linear and ridge fits above.
comparison_lines = [
    "Model Comparison: Predicting Duration (minutes)",
    f"Linear Regression -> MSE: {mse_linear:.2f}, R²: {r2_linear:.2f}",
    f"Ridge Regression  -> MSE: {mse_ridge:.2f}, R²: {r2_ridge:.2f}",
]
print(*comparison_lines, sep="\n")


Model Comparison: Predicting Duration (minutes)
Linear Regression -> MSE: 176460760.65, R²: 0.00
Ridge Regression  -> MSE: 176460760.65, R²: 0.00


#### 5.3 Model Evaluation on Test Data

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Refit both estimators on the training split only. The existing linear_model and
# ridge_model objects are reused, so their earlier full-data fits are replaced.
for estimator in (linear_model, ridge_model):
    estimator.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_linear_test = linear_model.predict(X_test)
y_pred_ridge_test = ridge_model.predict(X_test)

# Held-out metrics for the linear model.
r2_linear_test = r2_score(y_test, y_pred_linear_test)
mse_linear_test = mean_squared_error(y_test, y_pred_linear_test)

# Held-out metrics for the ridge model.
r2_ridge_test = r2_score(y_test, y_pred_ridge_test)
mse_ridge_test = mean_squared_error(y_test, y_pred_ridge_test)

# Report test-set performance for both models.
print("5.3 Model Evaluation on Test Data")
print(f"Linear Regression -> MSE: {mse_linear_test:.2f}, R²: {r2_linear_test:.2f}")
print(f"Ridge Regression  -> MSE: {mse_ridge_test:.2f}, R²: {r2_ridge_test:.2f}")


5.3 Model Evaluation on Test Data
Linear Regression -> MSE: 170495333.51, R²: 0.00
Ridge Regression  -> MSE: 170495333.51, R²: 0.00


## Summary Statistics 

In [31]:
# Descriptive statistics for the key numeric variables.

# Make sure both timestamp columns are datetimes (unparseable values become NaT) so the
# duration can be derived from them.
for time_col in ('Start_Time', 'End_Time'):
    accidents_df[time_col] = pd.to_datetime(accidents_df[time_col], errors='coerce')

# Accident duration in minutes.
accidents_df['Duration'] = (
    accidents_df['End_Time']
    .sub(accidents_df['Start_Time'])
    .dt.total_seconds()
    / 60
)

# Variables included in the summary table.
numeric_cols = ['Duration', 'Temperature(F)', 'Hour', 'Start_Lat', 'Start_Lng']

# Statistics are computed on complete rows only.
summary_df = accidents_df[numeric_cols].dropna()

# Share of missing values per variable, measured on the full frame before dropping rows.
missing_pct = accidents_df[numeric_cols].isnull().sum() / len(accidents_df) * 100

# One row per variable (transposed describe), plus the missing share, rounded to 2 dp.
summary_stats = (
    summary_df.describe().T
    .assign(**{'Missing (%)': missing_pct})
    .round(2)
)

# Print a caption, then display the table as the cell output.
print("Summary Statistics for Key Variables:")
summary_stats

Summary Statistics for Key Variables:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Missing (%)
Duration,6805138.0,428.29,13286.26,1.22,30.0,62.4,122.5,2812939.0,0.0
Temperature(F),6805138.0,61.7,18.91,-89.0,49.0,64.0,76.0,207.0,0.0
Hour,6805138.0,12.27,5.45,0.0,8.0,13.0,17.0,23.0,0.0
Start_Lat,6805138.0,36.22,5.07,24.55,33.4,35.79,40.11,49.0,0.0
Start_Lng,6805138.0,-94.76,17.33,-124.62,-117.21,-87.9,-80.42,-67.11,0.0
