## 5. Regression Modelling

### Predicting Accident Frequency Based on Weather, Temperature, State, and City

#### 5.1 Linear Regression: Data Preparation


In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Location of the cleaned accidents export. Kept in one named constant so the
# notebook can be pointed at another copy of the data by editing a single line.
ACCIDENTS_CSV_PATH = "/Users/anshureddy/Desktop/dwproject/accidents_cleaned.csv"

# Load the dataset
accidents_df = pd.read_csv(ACCIDENTS_CSV_PATH)

# Convert categorical variables to integer codes using Label Encoding.
# Each column gets its OWN fitted encoder: re-fitting a single shared encoder
# would keep only the mapping of the last column, making the Weather_Condition
# and State codes impossible to translate back to their labels. The mapping
# for a column is available as `label_encoders[column].classes_`, and codes can
# be decoded with `label_encoders[column].inverse_transform(...)`.
CATEGORICAL_FEATURES = ['Weather_Condition', 'State', 'City']
label_encoders = {}
for column in CATEGORICAL_FEATURES:
    label_encoders[column] = LabelEncoder()
    accidents_df[column] = label_encoders[column].fit_transform(accidents_df[column])

# Backward compatibility: this cell previously left `label_encoder` fitted on
# 'City' (the last column encoded), so keep that name bound to the same thing.
label_encoder = label_encoders['City']

# Group accidents by 'Hour' and calculate the frequency (count) of accidents per hour
accidents_by_hour = accidents_df.groupby('Hour').size().reset_index(name='Accident_Frequency')

# Merge the accident frequency with the original dataset on 'Hour'.
# Every row sharing the same 'Hour' therefore receives the same
# 'Accident_Frequency' value.
accidents_hourly = pd.merge(accidents_df, accidents_by_hour[['Hour', 'Accident_Frequency']], on='Hour')

# Select relevant features for prediction.
# NOTE: 'Hour' (the only column the target is derived from) is not among the
# features, and the three categorical features enter the model as integer codes.
X = accidents_hourly[['Weather_Condition', 'Temperature(F)', 'State', 'City']]
y = accidents_hourly['Accident_Frequency']


#### 5.1 Linear Regression: Model Fitting and Output

In [4]:
# Split the data into training and test sets (80/20, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict accident frequency on the test set
y_pred_linear = linear_model.predict(X_test)

# Evaluate the Linear Regression model
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Print the results
print("Linear Regression Results - Predicting Accident Frequency:")
print(f"  Mean Squared Error (MSE): {mse_linear:.2f}")
print(f"  R-squared (R²): {r2_linear:.2f}")

# Print the model coefficients
print("Linear Regression Coefficients - Influence of Each Feature on Accident Frequency:")
linear_coefs = dict(zip(X.columns, linear_model.coef_))
for feature, coef in linear_coefs.items():
    print(f"  {feature}: {coef:.4f}")

# The explanation below is generated from the fitted coefficients rather than
# from hand-typed numbers, so the magnitudes and the increase/decrease wording
# always agree with the model that was actually trained in this cell.
weather_coef = linear_coefs['Weather_Condition']
temperature_coef = linear_coefs['Temperature(F)']
state_coef = linear_coefs['State']
city_coef = linear_coefs['City']

print("\nExplanation of Features:")
# NOTE(review): the example mapping in the next message (0 = Clear, 1 = Rainy, ...)
# is not derived from the fitted encoder; confirm it against the encoder's
# `classes_` attribute before relying on it.
print("  - Weather_Condition is numerically encoded (e.g., 0 = Clear, 1 = Rainy, 2 = Snowy, etc.).")
print(f"  - For each unit increase in Weather_Condition, the predicted accident frequency {'decreases' if weather_coef < 0 else 'increases'} by {abs(weather_coef):.2f} accidents.")
print(f"  - Temperature(F) represents the effect of temperature on accidents, with each 1°F increase leading to {'a decrease' if temperature_coef < 0 else 'an increase'} of {abs(temperature_coef):.2f} accidents.")
print(f"  - State is encoded numerically, with each unit increase in State leading to {'a decrease' if state_coef < 0 else 'an increase'} of {abs(state_coef):.2f} accidents.")
print(f"  - City has a {'negative' if city_coef < 0 else 'positive'} relationship, with each unit increase in City corresponding to {'a decrease' if city_coef < 0 else 'an increase'} of {abs(city_coef):.4f} accidents.")

Linear Regression Results - Predicting Accident Frequency:
  Mean Squared Error (MSE): 17968374622.93
  R-squared (R²): 0.02
Linear Regression Coefficients - Influence of Each Feature on Accident Frequency:
  Weather_Condition: 230.4524
  Temperature(F): 777.9901
  State: 439.4800
  City: -0.0574

Explanation of Features:
  - Weather_Condition is numerically encoded (e.g., 0 = Clear, 1 = Rainy, 2 = Snowy, etc.).
  - For each unit increase in Weather_Condition, the predicted accident frequency increases by 230.45 accidents.
  - Temperature(F) represents the effect of temperature on accidents, with each 1°F increase leading to an increase of 777.99 accidents.
  - State is encoded numerically, with each unit increase in State leading to an increase of 439.48 accidents.
  - City has a negative relationship, with each unit increase in City corresponding to a decrease of 0.0574 accidents.


#### 5.2 Ridge vs Linear Regression: Model Fitting and Output


In [6]:
from sklearn.linear_model import Ridge

# Fit Ridge Regression model (with L2 regularization) on the same training
# split that the Linear Regression model used, so the two are comparable.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Predict accident frequency on the test set
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate the Ridge model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print the results
print("Ridge Regression Results - Predicting Accident Frequency:")
print(f"  Mean Squared Error (MSE): {mse_ridge:.2f}")
print(f"  R-squared (R²): {r2_ridge:.2f}")

print("Ridge Regression Coefficients - Influence of Each Feature on Accident Frequency:")
ridge_coefs = dict(zip(X.columns, ridge_model.coef_))
for feature, coef in ridge_coefs.items():
    print(f"  {feature}: {coef:.4f}")

# The explanation is built from the Ridge model's own coefficients. Hard-coded
# figures copied from the Linear Regression cell would silently misreport this
# model as soon as the regularised coefficients differ from the unregularised ones.
weather_coef = ridge_coefs['Weather_Condition']
temperature_coef = ridge_coefs['Temperature(F)']
state_coef = ridge_coefs['State']
city_coef = ridge_coefs['City']

print("\nExplanation of Features:")
# NOTE(review): the example mapping in the next message (0 = Clear, 1 = Rainy, ...)
# is not derived from the fitted encoder; confirm it against the encoder's
# `classes_` attribute before relying on it.
print("  - Weather_Condition is numerically encoded (e.g., 0 = Clear, 1 = Rainy, 2 = Snowy, etc.).")
print(f"  - For each unit increase in Weather_Condition, the predicted accident frequency {'decreases' if weather_coef < 0 else 'increases'} by {abs(weather_coef):.2f} accidents.")
print(f"  - Temperature(F) represents the effect of temperature on accidents, with each 1°F increase leading to {'a decrease' if temperature_coef < 0 else 'an increase'} of {abs(temperature_coef):.2f} accidents.")
print(f"  - State is encoded numerically, with each unit increase in State leading to {'a decrease' if state_coef < 0 else 'an increase'} of {abs(state_coef):.2f} accidents.")
print(f"  - City has a {'negative' if city_coef < 0 else 'positive'} relationship, with each unit increase in City corresponding to {'a decrease' if city_coef < 0 else 'an increase'} of {abs(city_coef):.4f} accidents.")

Ridge Regression Results - Predicting Accident Frequency:
  Mean Squared Error (MSE): 17968374622.94
  R-squared (R²): 0.02
Ridge Regression Coefficients - Influence of Each Feature on Accident Frequency:
  Weather_Condition: 230.4524
  Temperature(F): 777.9901
  State: 439.4800
  City: -0.0574

Explanation of Features:
  - Weather_Condition is numerically encoded (e.g., 0 = Clear, 1 = Rainy, 2 = Snowy, etc.).
  - For each unit increase in Weather_Condition, the predicted accident frequency increases by 230.45 accidents.
  - Temperature(F) represents the effect of temperature on accidents, with each 1°F increase leading to an increase of 777.99 accidents.
  - State is encoded numerically, with each unit increase in State leading to an increase of 439.48 accidents.
  - City has a negative relationship, with each unit increase in City corresponding to a decrease of 0.0574 accidents.


#### 5.3 Model Evaluation on Test Data

In [8]:
# Score both models' held-out predictions against the same test targets so
# the two can be compared side by side.
mse_linear_test, r2_linear_test = (
    mean_squared_error(y_test, y_pred_linear),
    r2_score(y_test, y_pred_linear),
)
mse_ridge_test, r2_ridge_test = (
    mean_squared_error(y_test, y_pred_ridge),
    r2_score(y_test, y_pred_ridge),
)

# Emit the comparison as one report: a heading followed by one line per model.
comparison_report = [
    "Model Evaluation on Test Data - Comparison of Linear and Ridge Regression:",
    f"Linear Regression -> MSE: {mse_linear_test:.2f}, R²: {r2_linear_test:.2f}",
    f"Ridge Regression  -> MSE: {mse_ridge_test:.2f}, R²: {r2_ridge_test:.2f}",
]
print("\n".join(comparison_report))


Model Evaluation on Test Data - Comparison of Linear and Ridge Regression:
Linear Regression -> MSE: 17968374622.93, R²: 0.02
Ridge Regression  -> MSE: 17968374622.94, R²: 0.02


## Summary Statistics 

In [10]:
import pandas as pd

# Read a fresh copy of the dataset from disk
accidents_df = pd.read_csv("/Users/anshureddy/Desktop/dwproject/accidents_cleaned.csv")

# Column groups used by the summaries below
numeric_columns = ['Temperature(F)', 'Start_Lat', 'Start_Lng', 'Hour']
categorical_columns = ['State', 'City', 'Weather_Condition']

# Descriptive statistics for the numerical columns
numerical_summary = accidents_df[numeric_columns].describe()

# Descriptive statistics for the categorical columns
categorical_summary = accidents_df[categorical_columns].describe()

# Number of missing values in every column of the dataset
missing_values = accidents_df.isna().sum()

# Pairwise correlations between the numerical columns
correlation_matrix = accidents_df[numeric_columns].corr()

# Print each result under its heading, in the order: numerical summary,
# categorical summary, missing values, correlation matrix.
summary_sections = [
    ("Numerical Data Summary:", numerical_summary),
    ("\nCategorical Data Summary:", categorical_summary),
    ("\nMissing Values:", missing_values),
    ("\nCorrelation Matrix:", correlation_matrix),
]
for heading, table in summary_sections:
    print(heading)
    print(table)


Numerical Data Summary:
       Temperature(F)     Start_Lat     Start_Lng          Hour
count    6.805138e+06  6.805138e+06  6.805138e+06  6.805138e+06
mean     6.169845e+01  3.621612e+01 -9.476481e+01  1.227399e+01
std      1.891321e+01  5.073536e+00  1.732566e+01  5.446748e+00
min     -8.900000e+01  2.455480e+01 -1.246238e+02  0.000000e+00
25%      4.900000e+01  3.340301e+01 -1.172136e+02  8.000000e+00
50%      6.400000e+01  3.579237e+01 -8.790092e+01  1.300000e+01
75%      7.600000e+01  4.011233e+01 -8.042281e+01  1.700000e+01
max      2.070000e+02  4.900220e+01 -6.711317e+01  2.300000e+01

Categorical Data Summary:
          State     City Weather_Condition
count   6805138  6805138           6805138
unique       49    13150               142
top          CA  Houston              Fair
freq    1521976   157540           2196786

Missing Values:
ID                   0
Start_Time           0
End_Time             0
State                0
City                 0
Start_Lat            0
Sta