## 5. Regression Modelling

#### 5.1 Linear Regression: Data Preparation


In [16]:
import os

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Location of the cleaned accidents CSV. The default is the path this notebook
# was originally run with; set the ACCIDENTS_CSV environment variable to point
# at the file on another machine instead of editing this cell.
ACCIDENTS_CSV = os.environ.get(
    "ACCIDENTS_CSV",
    "/Users/anshureddy/Desktop/dwproject/accidents_cleaned.csv",
)

accidents_df = pd.read_csv(ACCIDENTS_CSV)

# Last expression of the cell: show the column names as a plain Python list.
accidents_df.columns.tolist()


['ID',
 'Start_Time',
 'End_Time',
 'State',
 'City',
 'Start_Lat',
 'Start_Lng',
 'Temperature(F)',
 'Weather_Condition',
 'Hour']

In [17]:
# Parse both timestamp columns in place; values that cannot be parsed are
# coerced to NaT rather than raising.
for time_col in ('Start_Time', 'End_Time'):
    accidents_df[time_col] = pd.to_datetime(accidents_df[time_col], errors='coerce')

# Accident duration in minutes: (end - start) as seconds, divided by 60.
elapsed = accidents_df['End_Time'].sub(accidents_df['Start_Time'])
accidents_df['Duration'] = elapsed.dt.total_seconds().div(60)

# Predictors and target used by the regression.
feature_cols = ['Hour', 'Temperature(F)', 'Start_Lat', 'Start_Lng']
target_col = 'Duration'
relevant_cols = feature_cols + [target_col]

# Keep only rows where every modelling column is present.
regression_df = accidents_df.loc[:, relevant_cols].dropna()

# Design matrix (X) and response vector (y).
X = regression_df[feature_cols]
y = regression_df[target_col]

#### 5.2 Linear Regression: Model Fitting and Output

In [12]:
# Fit a linear regression on the prepared features; fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# In-sample predictions: the model is scored on the same rows it was fit on.
y_pred = model.predict(X)

# Goodness-of-fit metrics for those in-sample predictions.
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

# Summary header, emitted as a single write with one line per entry.
summary_lines = [
    "Linear Regression: Predicting Duration (minutes)",
    f"  Mean Squared Error (MSE): {mse:.2f}",
    f"  R-squared (R²): {r2:.2f}",
    "  Coefficients:",
]
print("\n".join(summary_lines))

# One line per feature, pairing each column name with its fitted coefficient.
for feature, coef in zip(X.columns, model.coef_):
    print(f"    {feature}: {coef:.4f}")

Linear Regression: Predicting Duration (minutes)
  Mean Squared Error (MSE): 176460760.65
  R-squared (R²): 0.00
  Coefficients:
    Hour: 6.5952
    Temperature(F): -5.2291
    Start_Lat: -55.0522
    Start_Lng: 0.3222
