In [1]:
!pip install pandasql



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from pandasql import sqldf
import warnings
import geopandas as gpd
import tensorflow as tf
import plotly.graph_objects as go
from nltk.corpus import stopwords

from sklearn.utils import resample
import torch

In [3]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

In [4]:
# Input CSV with the preprocessed accident records (path looks like a Colab Drive mount).
path = "/content/drive/MyDrive/tubitak/datasets/preprocessed_final.csv"

# Read the whole file into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=path)

In [5]:
# Print the schema (column names, dtypes) and row count of the frame that was just loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6622775 entries, 0 to 6622774
Data columns (total 27 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Start_Lat          float64
 2   Start_Lng          float64
 3   Distance(mi)       float64
 4   Temperature(F)     float64
 5   Humidity(%)        float64
 6   Pressure(in)       float64
 7   Visibility(mi)     float64
 8   Wind_Speed(mph)    float64
 9   Population         float64
 10  Month              int64  
 11  Day                int64  
 12  Weekday            int64  
 13  Hour               int64  
 14  Weather_Condition  object 
 15  Amenity            bool   
 16  Bump               bool   
 17  Crossing           bool   
 18  Give_Way           bool   
 19  Junction           bool   
 20  No_Exit            bool   
 21  Railway            bool   
 22  Roundabout         bool   
 23  Station            bool   
 24  Stop               bool   
 25  Traffic_Calming   

In [6]:
!pip install esda



In [7]:
from esda.moran import Moran_Local
from libpysal.weights import Queen
import geopandas as gpd
from sklearn.neighbors import KernelDensity
from scipy.stats import gaussian_kde

In [8]:
def kdeAlgorithm(kernel_algorithm: list, bandwidth_meters, coords, buffer_meters, resolution, df):
    """
    Score every point in ``coords`` with a Kernel Density Estimate and store the scores in ``df``.

    For each requested kernel a ``KernelDensity`` model is fitted on ``coords`` and then
    evaluated at those same points (not on a grid). The log-density returned by
    ``score_samples`` is exponentiated before being written to ``df``.

    Parameters:
    - kernel_algorithm (list | str): Kernel names forwarded to ``KernelDensity``
      (e.g. ['gaussian', 'epanechnikov']). A single name given as a plain string is
      accepted and treated as a one-element list.
    - bandwidth_meters (float): KDE bandwidth in meters. It is converted to degrees with a
      flat 111,000 m-per-degree factor.
      NOTE(review): that factor is an approximation; a degree of longitude covers less
      ground away from the equator, so the effective bandwidth is not isotropic - confirm
      this is acceptable for the study area.
    - coords (numpy.ndarray): 2-D array with one row per row of ``df``; the model is both
      fitted and evaluated on it. Callers are expected to pass coordinates in degrees.
    - buffer_meters (float): Unused. Kept so existing calls keep working; it used to pad
      an evaluation grid that was built but never read.
    - resolution (int): Unused. Kept for the same reason as ``buffer_meters``.
    - df (pandas.DataFrame): Frame that receives the density columns. It is modified in
      place.

    Returns:
    - df (pandas.DataFrame): The same ``df`` object with these columns added/overwritten:
        * 'KDE' - density from the last kernel in ``kernel_algorithm``.
        * 'KDE_<kernel>' - one column per kernel, only when more than one kernel is
          requested, so that earlier kernels' results are not lost.
      If ``kernel_algorithm`` is empty, ``df`` is returned untouched.
    """

    # A bare string would otherwise be iterated character by character.
    if isinstance(kernel_algorithm, str):
        kernels = [kernel_algorithm]
    else:
        kernels = list(kernel_algorithm)

    # Flat meters -> degrees conversion (see the bandwidth note in the docstring).
    bandwidth_degrees = bandwidth_meters / 111000

    # Per-kernel columns are only worth adding when there is something to tell apart.
    keep_per_kernel = len(kernels) > 1

    for algorithm in kernels:
        # Fit on the observed points, then score those same points.
        kde = KernelDensity(kernel=algorithm, bandwidth=bandwidth_degrees).fit(coords)

        # score_samples returns log-density; exponentiate to get the density itself.
        density = np.exp(kde.score_samples(coords))

        if keep_per_kernel:
            df[f'KDE_{algorithm}'] = density

        # 'KDE' always ends up holding the most recent kernel's result.
        df['KDE'] = density

    return df

In [9]:
# KDE input array: one row per record, columns ordered as (Start_Lng, Start_Lat).
coords = df.loc[:, ['Start_Lng', 'Start_Lat']].to_numpy()

In [10]:
# Add the KDE score column to df using a single Epanechnikov kernel.
df = kdeAlgorithm(
    df=df,
    coords=coords,
    kernel_algorithm=['epanechnikov'],  # Epanechnikov kernel
    bandwidth_meters=500,               # bandwidth: 500 meters
    buffer_meters=1000,                 # buffer: 1000 meters
    resolution=100,
)

In [11]:
# Save the frame (now including the KDE column) to a new CSV; the row index is not written.
df.to_csv(
    path_or_buf="/content/drive/MyDrive/tubitak/datasets/preprocessed_final1.csv",
    index=False,
)

In [12]:
# Print the schema again to check the frame after the KDE column was added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6622775 entries, 0 to 6622774
Data columns (total 28 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Start_Lat          float64
 2   Start_Lng          float64
 3   Distance(mi)       float64
 4   Temperature(F)     float64
 5   Humidity(%)        float64
 6   Pressure(in)       float64
 7   Visibility(mi)     float64
 8   Wind_Speed(mph)    float64
 9   Population         float64
 10  Month              int64  
 11  Day                int64  
 12  Weekday            int64  
 13  Hour               int64  
 14  Weather_Condition  object 
 15  Amenity            bool   
 16  Bump               bool   
 17  Crossing           bool   
 18  Give_Way           bool   
 19  Junction           bool   
 20  No_Exit            bool   
 21  Railway            bool   
 22  Roundabout         bool   
 23  Station            bool   
 24  Stop               bool   
 25  Traffic_Calming   

In [13]:
# Preview the first rows, including the new KDE column.
df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Population,...,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,KDE
0,3,39.865147,-84.058723,0.01,36.9,91.0,29.68,10.0,7.0,11640060.0,...,False,False,False,False,False,False,False,False,False,0.064641
1,2,39.928059,-82.831184,0.01,37.9,100.0,29.65,10.0,7.0,11640060.0,...,False,False,False,False,False,False,False,False,False,0.115248
2,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,3.5,11640060.0,...,False,False,False,False,False,False,False,False,True,0.051838
3,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,4.6,11640060.0,...,False,False,False,False,False,False,False,False,False,2.314023
4,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,3.5,11640060.0,...,False,False,False,False,False,False,False,False,True,0.354664


In [14]:
# Rescale the numeric features with MinMaxScaler. The fitted object stays available as
# `scaler` for later cells.
# NOTE(review): the scaler is fitted on the full frame; no train/test split has happened
# at this point in the notebook - confirm this does not leak test statistics.
scaler = MinMaxScaler()
scaling_columns = ['Temperature(F)', 'Distance(mi)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
                   'Wind_Speed(mph)', 'Precipitation(in)', 'Start_Lng', 'Start_Lat',
                   'Month', 'Weekday', 'Day','Hour','KDE','Population']

# Keep only the candidates that actually exist in df, so a missing column is skipped
# instead of raising.
scaling_columns = [col for col in scaling_columns if col in df.columns]

df[scaling_columns] = scaler.fit_transform(df[scaling_columns])

print("Feature scaling completed.")

del scaling_columns

# A bare expression is only rendered when it is the last statement of the cell, so the
# preview goes here rather than in the middle of the cell.
df.head()

Feature scaling completed.


In [15]:
# Columns that should be handled as categorical variables.
categorical_features = {"Weather_Condition"}

# Switch each of them to pandas' "category" dtype.
for cat in categorical_features:
    df[cat] = df[cat].astype("category")

# Report how many distinct values each categorical column holds.
print("Unique classes for each categorical feature:")
for cat in categorical_features:
    print(f"{cat:15s}", "\t", len(df[cat].unique()))


Unique classes for each categorical feature:
Weather_Condition 	 12


In [16]:
# Encode Weather_Condition with one-hot encoding
df = pd.get_dummies(df, columns=['Weather_Condition'], prefix='Weather', drop_first=True)
print("Weather_Condition encoded with one-hot encoding.")

del categorical_features

Weather_Condition encoded with one-hot encoding.


In [17]:
# Turn every boolean column into 0/1 integers.
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)

# Drop the temporary column index before showing the result.
del bool_columns

df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Population,...,Weather_Cloudy,Weather_Fog,Weather_Hail,Weather_Rain,Weather_Sand,Weather_Smoke,Weather_Snow,Weather_Thunderstorm,Weather_Tornado,Weather_Windy
0,3,0.626257,0.705349,2.3e-05,0.425338,0.909091,0.506057,0.07103,0.00644,0.284194,...,0,0,0,1,0,0,0,0,0,0
1,2,0.62883,0.726694,2.3e-05,0.428716,1.0,0.505545,0.07103,0.00644,0.284194,...,0,0,0,1,0,0,0,0,0,0
2,2,0.593452,0.705803,2.3e-05,0.422297,1.0,0.505886,0.07103,0.00322,0.284194,...,1,0,0,0,0,0,0,0,0,0
3,3,0.621455,0.702796,2.3e-05,0.419257,0.959596,0.505375,0.063885,0.004232,0.284194,...,1,0,0,0,0,0,0,0,0,0
4,2,0.616547,0.703095,2.3e-05,0.422297,0.888889,0.505545,0.042447,0.00322,0.284194,...,1,0,0,0,0,0,0,0,0,0


In [19]:
# Class distribution of the target before any resampling.
severity_counts = df['Severity'].value_counts()
print("Before balancing:", severity_counts, sep="\n")

Before balancing:
Severity
2    5098823
3    1282598
4     178324
1      63030
Name: count, dtype: int64


In [20]:
from imblearn.over_sampling import SMOTE

In [21]:
# Target vector: the Severity column.
y_target = df['Severity']

# Feature matrix: every remaining column.
X_features = df.drop(labels='Severity', axis='columns')

In [None]:
# Resample the data with SMOTE, using a fixed seed so the result is repeatable.
# NOTE(review): this runs on the full feature matrix; no train/test split is visible
# before this cell. If a split happens later, resampled rows may end up in the
# evaluation set - confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_features, y=y_target)