In [None]:
# Step 1: Import Required Libraries

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Step 2: Read the Contents of the File

# NOTE(review): absolute, machine-specific path; the notebook only runs where this
# exact file exists. Consider a configurable data directory.
file_path = '/Users/keshavsaraogi/Desktop/freight/And_05min_1303_0606_2021.csv'
# NOTE(review): the column names renamed later in this notebook ('2021-04-15 07:25:00',
# '148.0', ...) look like data values, which suggests the CSV has no header row and the
# first record is being consumed as the header. Confirm against the file.
df = pd.read_csv(file_path)

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
df.info()
print(df.head())
print(df.describe())
print(df.isnull().sum())
print(df.columns)

In [None]:
# Step 3: Rename and clean columns
# Map the header labels found in the file to descriptive column names.
column_names = {
    '2021-04-15 07:25:00': 'timestamp',
    '148.0': 'traffic_volume',
    '1': 'station_id',
    '33.0': 'traffic_speed',
}
df = df.rename(columns=column_names)

# Parse the timestamp strings so the .dt accessor can be used on the column.
df['timestamp'] = pd.to_datetime(df['timestamp'])
print(df.columns)

In [None]:
# Step 4: Feature Engineering - Extract temporal features from 'timestamp'
# Each calendar field is copied from the datetime accessor into a column of the same name.
timestamp_parts = df['timestamp'].dt
for part in ('hour', 'dayofweek', 'month', 'year'):
    df[part] = getattr(timestamp_parts, part)

In [None]:
# Step 5: Rolling features
# Order the readings chronologically within each station so that "previous row"
# means "previous reading of the same station", independent of file order.
df = df.sort_values(['station_id', 'timestamp']).reset_index(drop=True)
volume_by_station = df.groupby('station_id')['traffic_volume']

# Mean of the five readings *before* the current one. The shift keeps the current
# reading out of the window: traffic_condition is derived from the current
# traffic_volume, so a window that contained it would leak the target.
df['traffic_volume_rolling'] = volume_by_station.transform(
    lambda volume: volume.shift(1).rolling(window=5).mean()
)

# Step 6: Create lag features
# Shifting inside each station group prevents a lag from reaching into another
# station's series.
df['traffic_volume_lag_1'] = volume_by_station.shift(1)
df['traffic_volume_lag_2'] = volume_by_station.shift(2)

# Drop any rows with NaN values (this includes the warm-up rows of every station,
# which have no complete history yet).
df = df.dropna().reset_index(drop=True)


In [None]:
# Step 7: Define traffic condition based on traffic volume
# pd.cut builds right-closed intervals, so without include_lowest a volume of exactly 0
# belongs to no bin and becomes NaN. The last edge is open-ended so that volumes above
# 3000 are labelled 'Very High' instead of being left without a label.
df['traffic_condition'] = pd.cut(
    df['traffic_volume'],
    bins=[0, 500, 1000, 2000, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

In [None]:

# Step 8: Define 'time_of_day' based on the hour
def time_of_day(hour):
    """Return the named period of the day that contains ``hour``.

    Periods are half-open ranges ``[start, end)``:
    6-9 is 'Morning Rush', 9-17 is 'Daytime', 17-20 is 'Evening Rush'.
    Any value that falls in none of them is reported as 'Night'.
    """
    periods = (
        (6, 9, 'Morning Rush'),
        (9, 17, 'Daytime'),
        (17, 20, 'Evening Rush'),
    )
    for start, end, label in periods:
        if start <= hour < end:
            return label
    return 'Night'

# Label every row with the period of the day its hour falls into.
df['time_of_day'] = df['hour'].map(time_of_day)

In [None]:
# Step 9: Feature for weekend or weekday
# 1 where dayofweek is 5 or 6, 0 otherwise; computed on the whole column at once.
df['is_weekend'] = (df['dayofweek'] >= 5).astype('int64')

In [None]:
# Step 10: Label Encoding for categorical variables
# One encoder per column: refitting a single LabelEncoder for the second column would
# overwrite the mapping learned for the first, making it impossible to translate
# station codes back to station ids later.
station_encoder = LabelEncoder()
time_of_day_encoder = LabelEncoder()

# Encode 'station_id' and 'time_of_day'
df['station_id_encoded'] = station_encoder.fit_transform(df['station_id'])
df['time_of_day_encoded'] = time_of_day_encoder.fit_transform(df['time_of_day'])

# Backward compatibility: `encoder` used to end up fitted on 'time_of_day'.
encoder = time_of_day_encoder

In [None]:
# Step 11: Prepare features (X) and target (y)
X = df.drop(columns=['timestamp', 'traffic_condition', 'time_of_day'])
y = df['traffic_condition']

In [None]:
# Step 12: Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if `y_train` contains any missing or invalid values
print(f"Missing values in y_train: {y_train.isnull().sum()}")
print(f"Unique values in y_train before encoding: {y_train.unique()}")

# Ensure that the target variable is encoded as numeric
y_encoder = LabelEncoder()
y_train = y_encoder.fit_transform(y_train)
y_test = y_encoder.transform(y_test)

In [None]:
# Step 13: Standardize the continuous features
scaler = StandardScaler()
continuous_features = ['traffic_volume', 'traffic_speed', 'traffic_volume_rolling', 'traffic_volume_lag_1', 'traffic_volume_lag_2']
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

In [None]:
# Step 14: Fit the Logistic Regression model on the training split
# (no earlier fit of `model` is visible in this notebook).
# The solver is allowed up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
# Kept as the cell's final expression, so the fitted estimator is also the cell output.
model.fit(X_train, y_train)

In [None]:
# Step 15: Evaluate the model
# Predict the held-out rows, then report the share of predictions that match the labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Per-class evaluation.
# y_test / y_pred hold the integer codes produced by y_encoder. Passing the codes as
# `labels` and the original class names as `target_names` makes the report readable
# and keeps one row/column per known class even if a class is missing from the test split.
class_ids = list(range(len(y_encoder.classes_)))
class_names = [str(name) for name in y_encoder.classes_]

print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))