In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn



In [None]:
import pandas as pd

# Load the CSV file
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

file_path = '/content/drive/MyDrive/AI PROJECT/train_collisions_india.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Year,Month,Day,Location,Casualties,Cause,Weather,Train Speed,Maintenance Status,Signal Error
0,1998,1,27,844,31,Technical Fault,Clear,88.2914,1,1
1,1961,6,23,387,40,Human Error,Clear,46.087071,0,0
2,2018,2,7,506,22,Weather Conditions,Rainy,30.153011,0,1
3,2007,7,15,294,31,Human Error,Stormy,72.348643,0,1
4,1967,9,14,193,28,Technical Fault,Clear,64.646958,0,0


In [None]:
# Check for missing values in each column
df.isnull().sum()

Unnamed: 0,0
Year,0
Month,0
Day,0
Location,0
Casualties,0
Cause,0
Weather,0
Train Speed,0
Maintenance Status,0
Signal Error,0


In [None]:
# Summary statistics
df.describe()

# Unique values in categorical columns
df['Cause'].unique()
df['Weather'].unique()
df['Maintenance Status'].unique()


array([1, 0])

In [None]:
# Example: Dropping rows with missing values
df.dropna(inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Example: Encoding categorical features
label_encoders = {}
for column in ['Location', 'Cause', 'Weather', 'Maintenance Status', 'Signal Error']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns=['Year', 'Month', 'Day', 'Casualties']))

# Create a new DataFrame with scaled features
df_scaled = pd.DataFrame(scaled_features, columns=df.columns.drop(['Year', 'Month', 'Day', 'Casualties']))
df_scaled['Casualties'] = df['Casualties'].values

In [None]:
from sklearn.model_selection import train_test_split

# Task 1: Predicting collisions at unmanned crossings
X_task1 = df_scaled.drop(columns=['Casualties'])  # Features
y_task1 = (df['Casualties'] > 0).astype(int)      # Binary classification (Collision or not)

# Task 2: Predicting collisions at malfunctioning barriers
X_task2 = df_scaled.drop(columns=['Casualties'])  # Features
y_task2 = (df['Signal Error'] > 0).astype(int)    # Binary classification (Signal error or not)

# Splitting into training and testing sets
X_train_task1, X_test_task1, y_train_task1, y_test_task1 = train_test_split(X_task1, y_task1, test_size=0.2, random_state=42)
X_train_task2, X_test_task2, y_train_task2, y_test_task2 = train_test_split(X_task2, y_task2, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
model_task1 = RandomForestClassifier(random_state=42)
model_task1.fit(X_train_task1, y_train_task1)

# Predict on test data
y_pred_task1 = model_task1.predict(X_test_task1)


In [None]:
from xgboost import XGBClassifier

# Initialize and train the model
model_task2 = XGBClassifier(random_state=42)
model_task2.fit(X_train_task2, y_train_task2)

# Predict on test data
y_pred_task2 = model_task2.predict(X_test_task2)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Task 1 Evaluation
accuracy_task1 = accuracy_score(y_test_task1, y_pred_task1)
print(f'Task 1 - Random Forest Accuracy: {accuracy_task1}')
print(confusion_matrix(y_test_task1, y_pred_task1))
print(classification_report(y_test_task1, y_pred_task1))

# Task 2 Evaluation
accuracy_task2 = accuracy_score(y_test_task2, y_pred_task2)
print(f'Task 2 - XGBoost Accuracy: {accuracy_task2}')
print(confusion_matrix(y_test_task2, y_pred_task2))
print(classification_report(y_test_task2, y_pred_task2))


Task 1 - Random Forest Accuracy: 1.0
[[20000]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     20000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Task 2 - XGBoost Accuracy: 1.0
[[17975     0]
 [    0  2025]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17975
           1       1.00      1.00      1.00      2025

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [None]:
from sklearn.model_selection import GridSearchCV

# Task 1 - Random Forest Tuning
param_grid_task1 = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_task1 = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_task1, cv=5, scoring='accuracy')
grid_task1.fit(X_train_task1, y_train_task1)
best_model_task1 = grid_task1.best_estimator_

# Task 2 - XGBoost Tuning
param_grid_task2 = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_task2 = GridSearchCV(XGBClassifier(random_state=42), param_grid_task2, cv=5, scoring='accuracy')
grid_task2.fit(X_train_task2, y_train_task2)
best_model_task2 = grid_task2.best_estimator_

# Evaluate tuned models
best_y_pred_task1 = best_model_task1.predict(X_test_task1)
best_y_pred_task2 = best_model_task2.predict(X_test_task2)

# New accuracy
print(f'Tuned Task 1 Accuracy: {accuracy_score(y_test_task1, best_y_pred_task1)}')
print(f'Tuned Task 2 Accuracy: {accuracy_score(y_test_task2, best_y_pred_task2)}')


Tuned Task 1 Accuracy: 1.0
Tuned Task 2 Accuracy: 1.0


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample data
data = {
    'Year': [2024, 2024, 2024, 2024, 2024],
    'Month': [1, 2, 3, 4, 5],
    'Day': [10, 15, 20, 25, 30],
    'Location': ['Crossing 1', 'Crossing 2', 'Crossing 3', 'Crossing 4', 'Crossing 5'],
    'Casualties': [0, 1, 0, 2, 3],
    'Cause': ['Vehicle Error', 'Signal Error', 'Vehicle Error', 'Signal Error', 'Vehicle Error'],
    'Weather': ['Clear', 'Rainy', 'Foggy', 'Clear', 'Rainy'],
    'Train Speed': [50, 60, 55, 70, 65],
    'Maintenance Status': ['Good', 'Poor', 'Good', 'Poor', 'Good'],
    'Signal Error': [0, 1, 0, 1, 0]
}

# Convert to DataFrame
test_df = pd.DataFrame(data)

# Select relevant features
features = ['Location', 'Cause', 'Weather', 'Train Speed', 'Maintenance Status', 'Signal Error']
test_features = test_df[features].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Preprocess the features
le_location = LabelEncoder()
le_cause = LabelEncoder()
le_weather = LabelEncoder()
le_maintenance_status = LabelEncoder()

test_features['Location'] = le_location.fit_transform(test_features['Location'])
test_features['Cause'] = le_cause.fit_transform(test_features['Cause'])
test_features['Weather'] = le_weather.fit_transform(test_features['Weather'])
test_features['Maintenance Status'] = le_maintenance_status.fit_transform(test_features['Maintenance Status'])

# Convert DataFrame types to ensure they are numeric
test_features = test_features.astype(float)

# Predict using the trained models (assuming best_model_task1 and best_model_task2 are trained models)
new_pred_task1 = best_model_task1.predict(test_features)
new_pred_task2 = best_model_task2.predict(test_features)

print("Predictions for Task 1:", new_pred_task1)
print("Predictions for Task 2:", new_pred_task2)


Predictions for Task 1: [1 1 1 1 1]
Predictions for Task 2: [0 0 0 0 0]


In [None]:
# Check the columns of test_df
print(test_df.columns)

Index(['Year', 'Month', 'Day', 'Location', 'Casualties', 'Cause', 'Weather',
       'Train Speed', 'Maintenance Status', 'Signal Error'],
      dtype='object')


In [None]:
import numpy as np

# Create sample target variables for demonstration
# Replace the logic here with your actual target generation logic
np.random.seed(0)  # For reproducibility
test_df['Target_Task1'] = np.random.choice([0, 1], size=len(test_df))  # Randomly assigning 0 or 1
test_df['Target_Task2'] = np.random.choice([0, 1], size=len(test_df))  # Randomly assigning 0 or 1


In [None]:
# Define the true target variables for evaluation
y_true_task1 = test_df['Target_Task1']  # Now this column should exist
y_true_task2 = test_df['Target_Task2']  # Now this column should exist


In [None]:
# Predictions made by your trained models
y_pred_task1 = best_model_task1.predict(test_features)
y_pred_task2 = best_model_task2.predict(test_features)

from sklearn.metrics import classification_report, confusion_matrix

# Classification report for Task 1
print("Classification Report for Task 1:")
print(classification_report(y_true_task1, y_pred_task1))

# Confusion matrix for Task 1
print("Confusion Matrix for Task 1:")
print(confusion_matrix(y_true_task1, y_pred_task1))

# Classification report for Task 2
print("Classification Report for Task 2:")
print(classification_report(y_true_task2, y_pred_task2))

# Confusion matrix for Task 2
print("Confusion Matrix for Task 2:")
print(confusion_matrix(y_true_task2, y_pred_task2))


Classification Report for Task 1:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.60      1.00      0.75         3

    accuracy                           0.60         5
   macro avg       0.30      0.50      0.37         5
weighted avg       0.36      0.60      0.45         5

Confusion Matrix for Task 1:
[[0 2]
 [0 3]]
Classification Report for Task 2:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       5.0

    accuracy                           0.00       5.0
   macro avg       0.00      0.00      0.00       5.0
weighted avg       0.00      0.00      0.00       5.0

Confusion Matrix for Task 2:
[[0 0]
 [5 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Check the distribution of target classes in the training dataset
print("Task 1 Target Class Distribution:")
print(train_df['Target_Task1'].value_counts())

print("Task 2 Target Class Distribution:")
print(train_df['Target_Task2'].value_counts())


Task 1 Target Class Distribution:


NameError: name 'train_df' is not defined