# Introduction to Data Science Project
-----------------------------------------------------------------------

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw accident records; the relative path means the CSV is looked up
# in the current working directory.
df = pd.read_csv("traffic_accidents.csv")

# Preview the first two rows to confirm the file parsed into columns.
df.head(n=2)
# Disabled on purpose: print("\nDataset Info:") followed by print(df.info())

## Step 2:  Exploratory Data Analysis (EDA)

### i) Missing Value Analysis

In [None]:
# Missing value analysis: count the null entries in every column.
print('Missing Value Analysis')
missing_values = df.isna().sum()
# Leave the Series as the cell's last expression so the notebook renders it.
missing_values

In [None]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept).
df = df.drop_duplicates()


### ii) Summary Statistics (mean, median, mode, etc.)


In [None]:
# Descriptive statistics for the int64/float64 columns only.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numerical_stats = numeric_frame.describe()
print("Summary Statistics for Numerical Features:", numerical_stats, sep="\n")


In [None]:
# Report the most frequent value of every object-dtype (text) column.
categorical_cols = df.select_dtypes(include=['object']).columns
print("\nMode for Categorical Features:")
for column_name in categorical_cols:
    # mode() may return several tied values; only the first one is printed.
    most_frequent = df[column_name].mode()[0]
    print(f"{column_name}: {most_frequent}")

### iii) Visualization

In [None]:
# One histogram panel per int64/float64 column on a 12x8 figure.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numeric_frame.hist(figsize=(12, 8))
# tight_layout keeps the panel titles and tick labels from overlapping.
plt.tight_layout()
plt.show()
# To keep a copy on disk, call plt.savefig('numerical_histograms.png') before plt.show().


### iv) Correlation Analysis

In [None]:

# Correlation analysis: pairwise correlations between the int64/float64
# columns, computed with corr() at its default settings.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,       # write the coefficient inside every cell
    fmt='.2f',        # ...formatted with two decimals
    cmap='coolwarm',
)
plt.title('Correlation Matrix of Numerical Features')
# To keep a copy on disk, call plt.savefig('correlation_matrix.png') before plt.show().
plt.show()
plt.close()


### v) Outlier Detection

In [None]:
# Side-by-side box plots of the total and fatal injury counts; points drawn
# beyond the whiskers are the outlier candidates.
injury_columns = ['injuries_total', 'injuries_fatal']
sns.boxplot(data=df[injury_columns])
plt.title("Outlier Detection")
plt.show()


### vi)  Feature Distribution Analysis

In [None]:
# Histogram of the crash_hour column with 24 bins
# (presumably one per hour of the day — confirm the column spans 0-23).
crash_hours = df['crash_hour']
crash_hours.hist(bins=24)
plt.xlabel("Hour")
plt.ylabel("Frequency")
plt.title("Crashes by Hour of Day")
plt.show()

### vii) Data Types and Unique Value Counts

In [None]:
# Data types: the dtype pandas holds for each column.
column_dtypes = df.dtypes
column_dtypes

In [None]:
# Unique value counts: number of distinct values in each column.
unique_counts = df.nunique()
unique_counts

### viii) Trend Analysis

In [None]:
# Parse the crash timestamps in place; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df['crash_date'] = pd.to_datetime(df['crash_date'], errors='coerce')

# Count crashes per calendar day and draw them in chronological order.
crashes_per_day = df['crash_date'].dt.date.value_counts().sort_index()
crashes_per_day.plot()
plt.xlabel("Date")
plt.ylabel("Number of Crashes")
plt.title("Crashes Over Time")
plt.show()

### ix) Grouped Aggregations

In [None]:
# Mean of injuries_total within each weather condition, sorted ascending
# before being drawn as horizontal bars.
mean_injuries_by_weather = df.groupby('weather_condition')['injuries_total'].mean()
mean_injuries_by_weather.sort_values().plot(kind='barh')
plt.title("Avg Injuries by Weather Condition")
plt.show()

### x) Pairwise Feature Relationships

In [None]:
# Scatter-plot matrix of the two injury counts and the number of units.
pair_columns = ['injuries_total', 'injuries_fatal', 'num_units']
sns.pairplot(df[pair_columns])
# Render explicitly, like every other plotting cell: without this the cell
# echoes the PairGrid object's repr and a plain script run shows nothing.
plt.show()

### xi) Other Relevant Analyses 

In [None]:
# Stacked bar charts of accident counts per hour, split into a morning and an
# afternoon/evening panel and stacked by trafficway type.
x_axis = 'crash_hour'
stack_by = 'trafficway_type'

# Keep only the five most frequent trafficway types so the stacks stay readable.
top_categories = df[stack_by].value_counts().index[:5]
df_filtered = df[df[stack_by].isin(top_categories)]

# Hours below 12 go to the AM panel, hours 12 and above to the PM panel.
df_am = df_filtered[df_filtered[x_axis] < 12]
df_pm = df_filtered[df_filtered[x_axis] >= 12]


def _hourly_counts(frame):
    """Return an hour x trafficway-type table of row counts (0 where absent)."""
    return frame.pivot_table(index=x_axis, columns=stack_by, aggfunc='size', fill_value=0)


pivot_am = _hourly_counts(df_am)
pivot_pm = _hourly_counts(df_pm)

fig, axes = plt.subplots(1, 2, figsize=(20, 7))

# (counts, colormap, title, x-axis label, legend position) for each panel.
panels = [
    (pivot_am, 'Blues', 'AM Accidents by Hour and Trafficway Type', 'Hour (0–11)', 'upper left'),
    (pivot_pm, 'Oranges', 'PM Accidents by Hour and Trafficway Type', 'Hour (12–23)', 'upper right'),
]
for ax, (counts, colormap, title, hour_label, legend_loc) in zip(axes, panels):
    counts.plot(kind='bar', stacked=True, cmap=colormap, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(hour_label)
    ax.set_ylabel('Accident Count')
    ax.tick_params(axis='x', rotation=0)  # keep the hour labels horizontal
    ax.legend(title='Trafficway Type', loc=legend_loc)

plt.tight_layout()
plt.show()


In [None]:
# Pie chart of the five most frequent first_crash_type values. The percentages
# are shares among these five categories only, not of all crashes.
plt.figure(figsize=(4, 6))
top_crash_types = df['first_crash_type'].value_counts().head(5)
top_crash_types.plot(kind='pie', autopct='%1.1f%%')
plt.title('Top 5 crash type in Accidents')
# To keep a copy on disk, call plt.savefig('crash_piechart.png') before plt.show().
plt.show()
plt.close()

In [None]:
# Scatter of total injuries against the hour of the crash; alpha=0.6 lets
# overlapping points show through.
hours = df['crash_hour']
injuries = df['injuries_total']
plt.scatter(hours, injuries, color='orange', alpha=0.6)
plt.title('Scatter Plot: Crash Hour vs Total Injuries')
plt.xlabel('Crash Hour')
plt.ylabel('Total Injuries')
plt.grid(True)
plt.show()


In [None]:
# Average injuries by lighting condition, sorted ascending before plotting.
avg_injuries_by_light = (
    df.groupby('lighting_condition')['injuries_total']
    .mean()
    .sort_values()
)

plt.figure(figsize=(10, 6))
avg_injuries_by_light.plot(kind='barh', color='skyblue')
plt.title("Average Injuries by Lighting Condition")
plt.xlabel("Average Injuries")
# Dashed vertical grid lines make the bar lengths easier to compare.
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of injuries_total within each weather condition, drawn as
# violins with the quartiles marked inside.
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` — confirm against the installed version.
plt.figure(figsize=(12, 6))
sns.violinplot(
    data=df,
    x='weather_condition',
    y='injuries_total',
    inner='quartile',
    palette='Set3',
)
plt.title("Distribution of Injuries by Weather Condition (Violin Plot)")
plt.xlabel("Weather Condition")
plt.ylabel("Injuries Total")
# Tilt the category labels so they fit along the axis.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

-----------------------------------------------



##  Step 3: Data Preprocessing

### i) Handle Missing Values

In [None]:
# Reload the raw file; the EDA section converted crash_date in place, and
# re-reading discards that change.
df = pd.read_csv("traffic_accidents.csv")

# Remove exact duplicate rows, then every row that has a missing value.
# dropna() returns a new frame rather than modifying df, so its result has to
# be assigned back — a bare `df.dropna()` removes nothing from df.
df = df.drop_duplicates()
df = df.dropna()

# Preview the cleaned frame.
df.head(2)

### ii) Encode Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Keep only the text after the last '/' in crash_type, with surrounding
# whitespace removed.
crash_type_parts = df['crash_type'].str.split('/')
df['crash_type'] = crash_type_parts.str[-1].str.strip()

# Work on a copy without the date and the per-severity injury counts
# (presumably dropped because they are components of the injuries_total
# target — confirm).
dropped_columns = [
    'crash_date',
    'injuries_fatal',
    'injuries_incapacitating',
    'injuries_non_incapacitating',
    'injuries_reported_not_evident',
    'injuries_no_indication',
]
df1 = df.drop(columns=dropped_columns)

# Replace every object-dtype column with integer codes and keep the fitted
# encoder per column in label_encoders.
label_encoders = {}
cat_cols = df1.select_dtypes(include=['object']).columns
for column_name in cat_cols:
    encoder = LabelEncoder()
    # Values are cast to str first, so any missing value is encoded as its own label.
    df1[column_name] = encoder.fit_transform(df1[column_name].astype(str))
    label_encoders[column_name] = encoder
df1

### iii) Normalize/Scale Numerical Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every int64/float64 column except the prediction target.
# NOTE(review): the columns label-encoded earlier in the notebook hold integer
# codes, so they are presumably selected and scaled here too — confirm that is
# intended.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# consider fitting it on the training rows only.
num_cols = list(df1.select_dtypes(include=['int64', 'float64']).columns)
num_cols.remove('injuries_total')

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df1[num_cols])
df1[num_cols] = scaled_values
df1

### iv)  Split Into Train/Test

In [None]:
from sklearn.model_selection import train_test_split

# The target is injuries_total; every other column of df1 is a feature.
y = df1['injuries_total']
X = df1.drop(columns=['injuries_total'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=49,
)



-----------------------------------------------------------------------------------------------

## Step 4: Machine Learning Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a 100-tree random forest regressor on the training split; the fixed
# seed keeps the result repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows: RMSE is the square root of the mean squared
# error, reported next to the R2 score.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}", f"R2 Score: {r2}", sep="\n")
