In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the feature table from CSV, asking pandas to parse the "Date" column
# as datetimes while reading.
DATA_PATH = "construction_features_timeseries.csv"
df = pd.read_csv(DATA_PATH, parse_dates=["Date"])

# Preview the first five rows as a quick sanity check of the loaded columns.
df.head()


Unnamed: 0,Date,Project_ID,Planned_Cost,Actual_Cost,Planned_Progress,Actual_Progress,Vibration_Level,Crack_Width,Temperature,Humidity,...,Labor_Hours_lag2,Cost_rolling_3,Progress_rolling_3,Cost_Deviation,Progress_Delay,Cost_per_LaborHour,Cost_per_Equipment,High_Cost_Overrun_Flag,High_Progress_Delay_Flag,Risk_Level_Encoded
0,2020-01-19,PJT_1,275523.235955,362620.7,2.247191,1.11069,1.453852,2.814901,18.024173,40.071256,...,69.805903,2083.054318,-2.333298,87097.447593,1.136501,5325.484754,4668.222231,0,0,2
1,2020-01-26,PJT_1,413284.853933,579147.8,3.370787,3.294323,1.533286,2.851868,17.91687,54.675666,...,80.584834,330962.667409,-1.009267,165862.972445,0.076464,8207.746424,7318.91439,0,0,2
2,2020-02-02,PJT_1,551046.47191,989660.4,4.494382,1.397148,1.349615,2.827027,21.440585,48.549559,...,67.091582,643809.651346,1.934054,438613.972204,3.097234,16467.072419,13553.764934,0,1,2
3,2020-02-09,PJT_1,688808.089888,598507.9,5.617978,6.089291,1.545437,2.731789,20.602878,64.76664,...,69.561126,722438.721091,3.593587,-90300.197104,-0.471313,7520.381475,7830.375585,0,0,2
4,2020-02-16,PJT_1,826569.707865,1287894.0,6.741573,1.675068,1.700409,2.826256,20.961353,58.843743,...,59.099356,958687.373153,3.053836,461324.074696,5.066505,17011.420209,16825.369158,0,1,2


In [3]:
# Classification target: the "Risk_Level_Encoded" column of the loaded frame.
y = df.loc[:, "Risk_Level_Encoded"]


In [4]:
# Columns fed to the models. The order is significant: it is the column order
# the estimators are fitted on and the order used to label feature importances.
# NOTE(review): the two *_Flag columns and the deviation/delay columns are used
# as inputs — confirm none of them was used to derive Risk_Level_Encoded,
# otherwise the models are partly reading the answer.
feature_cols = (
    # current-period cost / progress columns
    ["Actual_Cost", "Cost_Deviation", "Progress_Delay", "Cost_per_LaborHour"]
    # equipment, sensor and labor columns
    + ["Equipment_Utilization", "Vibration_Level", "Crack_Width", "Labor_Hours"]
    # binary flag columns
    + ["High_Cost_Overrun_Flag", "High_Progress_Delay_Flag"]
    # lag-1 columns
    + ["Actual_Cost_lag1", "Actual_Progress_lag1"]
)

# Feature matrix: exactly the columns above, in that order.
X = df.loc[:, feature_cols]


In [8]:
# Positional 80/20 hold-out: the first 80% of rows are used for training and
# the remaining rows for testing. Nothing is shuffled or sorted here, so the
# partition follows the row order of the loaded frame.
# NOTE(review): the head() preview shows consecutive PJT_1 rows in date order,
# so the file may be ordered by project and then date — confirm whether this
# cut is meant to hold out later rows/projects or a later time window.
split = int(0.8 * len(df))

X_train = X.iloc[:split]
X_test = X.iloc[split:]

y_train = y.iloc[:split]
y_test = y.iloc[split:]


In [7]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions, so the test rows never contribute to the
# scaling statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier trained on the standardized features.
# max_iter is raised to 1000 iterations for the solver.
# NOTE(review): the stored report for this model shows precision and recall of
# 0.00 for class 0 (2448 test rows). The classes are imbalanced
# (2448 / 4606 / 8351 in the test partition); consider evaluating
# class_weight="balanced" before relying on this baseline.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions for the held-out rows.
y_pred_lr = lr.predict(X_test_scaled)


In [10]:
# Per-class precision / recall / F1 for the logistic-regression baseline.
# The stored report shows precision and recall of 0.00 for class 0. When a
# class is never predicted its precision is undefined, and
# classification_report falls back to 0.0 while emitting an
# UndefinedMetricWarning. zero_division=0 makes that fallback explicit: the
# reported numbers stay the same, without the warning noise. A recall of 0.00
# in the table is still the signal that a class is being missed entirely.
print("Logistic Regression Results")
print(classification_report(y_test, y_pred_lr, zero_division=0))


Logistic Regression Results
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2448
           1       0.30      0.11      0.16      4606
           2       0.54      0.89      0.67      8351

    accuracy                           0.52     15405
   macro avg       0.28      0.33      0.28     15405
weighted avg       0.38      0.52      0.41     15405



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, each limited to depth 10, with a fixed seed so
# the fit is repeatable. It is trained on the unscaled feature frame.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42).fit(
    X_train, y_train
)

# Hard class predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for the random forest on the held-out rows.
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Results")
print(rf_report)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sorted union of the labels seen in the truth and in the predictions. This is
# the same label set confusion_matrix would infer by default, so the matrix is
# unchanged; computing it explicitly lets the axes be labelled with the actual
# class codes instead of positional indices (which would be misleading if a
# class were absent from both y_test and y_pred_rf).
class_labels = np.union1d(y_test, y_pred_rf)

# Rows are the true classes, columns the predicted classes, in class_labels order.
cm = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

# Explicit figure/axes so all labelling targets this plot unambiguously.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # cells are integer counts
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Risk Prediction Confusion Matrix")
plt.show()


In [None]:
# Pair each of the forest's importance scores with its feature name and rank
# them from most to least important.
importances = pd.Series(rf.feature_importances_, index=feature_cols)
importances = importances.sort_values(ascending=False)

# Bar chart of the ranked importances on an explicitly created figure.
fig, ax = plt.subplots(figsize=(10, 4))
importances.plot.bar(ax=ax, title="Feature Importance")
plt.show()
