In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np

In [None]:
# Load dataset
# Location of the CSV that the rest of the notebook works on.
file_path = '/content/drive/MyDrive/Datasets/dataset_traffic_accident_prediction1.csv'
# Parse the CSV into the working DataFrame `data`.
data = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Data Preprocessing
# Rows without a target value cannot be used for supervised learning, so they
# are dropped; every remaining gap is filled with the last value seen above it.
regression_target = 'Accident'
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# Chaining into a fresh assignment (instead of inplace=True) keeps this cell
# safe to re-run.
# NOTE: a forward fill cannot fill NaNs that sit in the very first row(s),
# because there is no earlier value to propagate.
data = data.dropna(subset=[regression_target]).ffill()


  data.fillna(method='ffill', inplace=True)


In [None]:
# Encode categorical features
# Every object-dtype column gets its own LabelEncoder; the fitted encoders are
# kept in `label_encoders` (keyed by column name) so the integer codes can be
# mapped back to the original labels later.
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # Replace the raw labels of this column with their integer codes.
    data[col] = le.fit_transform(data[col])

In [None]:
# Standardize numerical features
# All float64/int64 columns except the target are rescaled by StandardScaler.
# NOTE(review): the scaler is fit on the whole frame before the train/test
# split further down, so test rows influence the scaling statistics — consider
# fitting on the training portion only.
# NOTE(review): columns that were label-encoded earlier presumably match this
# dtype filter and get standardized too — confirm that is intended.
numerical_cols = (
    data
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop([regression_target])
)
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(
    data[numerical_cols]
)

In [None]:
# Feature Selection (Using correlation threshold)
# Keep only the columns whose absolute correlation with the target exceeds the
# threshold; the target itself is removed from the selection afterwards.
correlation_threshold = 0.1
correlation_matrix = data.corr()
selected_features = (
    correlation_matrix[regression_target]
    .loc[lambda target_corr: target_corr.abs() > correlation_threshold]
    .index
    .drop(regression_target)
)

# Design matrix and target vector used by every model below.
y = data[regression_target]
X = data[selected_features]

In [None]:
# Train/Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Custom Model from Scratch
class CustomLinearRegression(BaseEstimator, RegressorMixin):
    """Linear regression trained with full-batch gradient descent.

    The model minimises the mean squared error of ``X @ w + b`` by repeatedly
    stepping against the gradient, starting from all-zero weights.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to the gradient on every epoch.
    epochs : int, default=1000
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    weights : ndarray of shape (n_features + 1,)
        Learned coefficients, set by ``fit``. Index 0 is the intercept; the
        remaining entries follow the column order of ``X``.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs

    @staticmethod
    def _with_intercept(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Fit the weights on ``X`` / ``y`` and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (list, ndarray or DataFrame).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples, or if there
            are no samples at all.
        """
        X = self._with_intercept(X)
        # Flatten so a column-vector target cannot broadcast against the 1-D
        # prediction vector, and so plain lists are accepted.
        y = np.asarray(y, dtype=float).ravel()
        if X.shape[0] != y.size:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.size}"
            )
        if y.size == 0:
            # Dividing by y.size below would otherwise produce NaN weights.
            raise ValueError("Cannot fit on an empty dataset")

        self.weights = np.zeros(X.shape[1])
        for _ in range(self.epochs):
            residuals = X @ self.weights - y
            gradient = X.T @ residuals / y.size
            self.weights -= self.learning_rate * gradient
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the weights learned by ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict; columns must match those used in ``fit``.

        Returns
        -------
        ndarray of shape (n_samples,)
        """
        return self._with_intercept(X) @ self.weights

In [None]:
# Model 1: Custom Linear Regression
# Train the hand-written gradient-descent regressor on the raw NumPy values
# (fit returns the estimator itself, so construction and training chain).
custom_model = CustomLinearRegression(learning_rate=0.01, epochs=1000).fit(
    X_train.values,
    y_train.values,
)

# Score the held-out predictions with MAE and R^2.
y_pred_custom = custom_model.predict(X_test.values)
custom_mae, custom_r2 = (
    mean_absolute_error(y_test, y_pred_custom),
    r2_score(y_test, y_pred_custom),
)


In [None]:
# Model 2: Random Forest Regressor
# NOTE(review): this import would conventionally sit with the others in the
# notebook's first cell.
from sklearn.ensemble import RandomForestRegressor

# Base estimator handed to the grid search; the seed makes training repeatable.
rf_model = RandomForestRegressor(
    random_state=42,
)

In [None]:
# Hyperparameter Optimization with Cross-Validation
# Exhaustive search over forest size, tree depth and split threshold, scored
# by (negated) mean absolute error over 5 folds of the training set.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)

# Evaluate the winning configuration on the held-out test set.
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
rf_mae, rf_r2 = (
    mean_absolute_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

In [None]:
# Final Model (Best Random Forest)
# GridSearchCV was run with its default refit behaviour, so best_estimator_ has
# already been trained on the full training set with the winning parameters.
# Fitting it again on the same data with the same random_state would only
# repeat that identical training run, so the model is used as-is.
final_model = best_rf_model
y_pred_final = final_model.predict(X_test)
final_mae = mean_absolute_error(y_test, y_pred_final)
final_r2 = r2_score(y_test, y_pred_final)

In [None]:
# Output Results
# One print call emits the three result sections; sep="\n" puts every entry on
# its own line, and the leading "\n" in a header leaves a blank line before it.
print(
    "Custom Linear Regression Results:",
    f"Mean Absolute Error: {custom_mae:.2f}",
    f"R2 Score: {custom_r2:.2f}",
    "\nRandom Forest Results:",
    f"Mean Absolute Error: {rf_mae:.2f}",
    f"R2 Score: {rf_r2:.2f}",
    "\nFinal Model Results:",
    f"Mean Absolute Error: {final_mae:.2f}",
    f"R2 Score: {final_r2:.2f}",
    sep="\n",
)

Custom Linear Regression Results:
Mean Absolute Error: 0.42
R2 Score: 0.01

Random Forest Results:
Mean Absolute Error: 0.42
R2 Score: 0.00

Final Model Results:
Mean Absolute Error: 0.42
R2 Score: 0.00
