In [16]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 666.5 kB/s eta 0:01:48
   ---------------------------------------- 0.8/72.0 MB 727.7 kB/s eta 0:01:38
    --------------------------------------- 1.0/72.0 MB 817.7 kB/s eta 0:01:27
    --------------------------------------- 1.0/72.0 MB 817.7 kB/s eta 0:01:27
    --------------------------------------- 1.6/72.0 MB 995.2 kB/s eta 0:01:11
   - ---------------------------------

In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv("bim_ai_civil_engineering_dataset.csv")

# Initial Inspection
print("Initial 5 rows:")
df.head()

print("\nColumn information:")
print(df.info())
df.head()


Initial 5 rows:

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Project_ID             1000 non-null   object 
 1   Project_Type           1000 non-null   object 
 2   Location               1000 non-null   object 
 3   Start_Date             1000 non-null   object 
 4   End_Date               1000 non-null   object 
 5   Planned_Cost           1000 non-null   int64  
 6   Actual_Cost            1000 non-null   float64
 7   Cost_Overrun           1000 non-null   float64
 8   Planned_Duration       1000 non-null   int64  
 9   Actual_Duration        1000 non-null   float64
 10  Schedule_Deviation     1000 non-null   float64
 11  Vibration_Level        1000 non-null   float64
 12  Crack_Width            1000 non-null   float64
 13  Load_Bearing_Capacity  1000 non-null   float64
 14  Temperature         

Unnamed: 0,Project_ID,Project_Type,Location,Start_Date,End_Date,Planned_Cost,Actual_Cost,Cost_Overrun,Planned_Duration,Actual_Duration,...,Energy_Consumption,Material_Usage,Labor_Hours,Equipment_Utilization,Accident_Count,Safety_Risk_Score,Image_Analysis_Score,Anomaly_Detected,Completion_Percentage,Risk_Level
0,PJT_1,Tunnel,Houston,2020-01-01,2021-09-26,12260784,15054500.0,2793720.0,699,813.914852,...,25202.994687,244.84331,6602,76.300184,8,6.192198,52.98833,0,95.006343,High
1,PJT_2,Dam,Houston,2020-01-02,2020-12-06,2369277,3507054.0,1137777.0,269,384.118221,...,49066.172542,263.123025,7121,63.527671,5,2.134473,50.885745,0,25.294824,Low
2,PJT_3,Building,Houston,2020-01-03,2021-12-05,23299783,21692130.0,-1607656.0,899,1081.777915,...,48192.547163,608.985023,9956,47.099444,2,3.113728,93.905836,0,97.47883,Medium
3,PJT_4,Dam,Houston,2020-01-04,2022-04-12,24499306,29469660.0,4970354.0,809,974.565655,...,19811.15175,673.574344,3725,86.846394,5,4.070101,90.454316,1,95.098131,High
4,PJT_5,Dam,Seattle,2020-01-05,2022-02-12,1749971,2329338.0,579367.0,354,347.990127,...,44866.565169,765.476122,4368,61.827163,6,2.759351,78.391069,0,43.624985,Low


## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("bim_ai_civil_engineering_dataset.csv")

# 1. Remove Duplicates 
df_cleaned = df.drop_duplicates().copy()

# Print information on duplicate removal (optional, but good practice)
print(f"Original shape: {df.shape}")
print(f"Shape after removing duplicates: {df_cleaned.shape}\n")

# --- 2. Feature Engineering: Log Transformation for Targets 
# Log-transform Actual_Cost and Actual_Duration
# np.log1p is used for log(x+1) to handle potential zero or near-zero values safely.
df_cleaned['log_actual_cost'] = np.log1p(df_cleaned['Actual_Cost'])
df_cleaned['log_actual_duration'] = np.log1p(df_cleaned['Actual_Duration'])

# --- 3. Feature Selection/Dropping Columns ---
# Drop identifiers, date columns, original targets, and columns causing target leakage
columns_to_drop = [
    'Project_ID', 'Start_Date', 'End_Date', 
    'Actual_Cost', 'Actual_Duration',
    'Cost_Overrun', 'Schedule_Deviation', 
    # Keeping Planned_Cost/Duration for now as they are inputs available at start,
    # but excluding them in final feature set for a model that predicts based on type/location/environmental factors
    'Planned_Cost', 'Planned_Duration'
]

df_features = df_cleaned.drop(columns=columns_to_drop, errors='ignore')

# --- 4. Encoding Categorical Variables ---
# Identify categorical columns for One-Hot Encoding
categorical_cols = ['Project_Type', 'Location', 'Weather_Condition', 'Risk_Level']

# Perform One-Hot Encoding
df_encoded = pd.get_dummies(
    df_features, 
    columns=categorical_cols, 
    drop_first=True # Drop first category to avoid multicollinearity
)

# Display final prepared dataframe information
print(" Final Prepared Data Structure ")
print(f"Final shape: {df_encoded.shape}")
print(f"Columns (Features + Log Targets): {df_encoded.columns.tolist()}\n")
print(df_encoded.head().to_markdown(index=False, numalign="left", stralign="left"))

# Save the prepared data to a new CSV file for the next step (Model Development)
df_encoded.to_csv("prepared_civil_engineering_data.csv", index=False)



In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the prepared data
df = pd.read_csv("prepared_civil_engineering_data.csv")

# 1. Define Features (X) and Targets (Y)
# X includes all columns except the log-transformed targets
X = df.drop(columns=['log_actual_cost', 'log_actual_duration'])
Y_cost = df['log_actual_cost']
Y_duration = df['log_actual_duration']

# 2. Split Data (80% Training, 20% Testing)
X_train, X_test, Y_train_cost, Y_test_cost = train_test_split(
    X, Y_cost, test_size=0.2, random_state=42
)
_, _, Y_train_duration, Y_test_duration = train_test_split(
    X, Y_duration, test_size=0.2, random_state=42
)

# 3. Apply Feature Scaling
# Scaling improves convergence for some algorithms (like Linear Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Features (X) Shape: {X.shape}")
print(f"Training Set Shape: {X_train.shape[0]} samples (80%)")
print(f"Testing Set Shape: {X_test.shape[0]} samples (20%)")

Features (X) Shape: (1000, 29)
Training Set Shape: 800 samples (80%)
Testing Set Shape: 200 samples (20%)


## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Cost Model ---
lr_cost = LinearRegression()
lr_cost.fit(X_train_scaled, Y_train_cost)
Y_pred_lr_cost = lr_cost.predict(X_test_scaled)

# --- Duration Model ---
lr_duration = LinearRegression()
lr_duration.fit(X_train_scaled, Y_train_duration)
Y_pred_lr_duration = lr_duration.predict(X_test_scaled)

****Decision Tree Regressor****

In [13]:
from sklearn.tree import DecisionTreeRegressor

# --- Cost Model ---
dt_cost = DecisionTreeRegressor(random_state=42)
dt_cost.fit(X_train, Y_train_cost)
Y_pred_dt_cost = dt_cost.predict(X_test)

# --- Duration Model ---
dt_duration = DecisionTreeRegressor(random_state=42)
dt_duration.fit(X_train, Y_train_duration)
Y_pred_dt_duration = dt_duration.predict(X_test)

## Random Forest Regressor (Ensemble Model)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for RF
rf_param_grid = {
    'n_estimators': [100, 200], # Number of trees in the forest
    'max_depth': [5, 10, None], # Max depth of the tree
}

# --- Cost Model (Using Cross-Validation for Tuning) ---
rf_cost_grid = GridSearchCV(
    RandomForestRegressor(random_state=42), 
    rf_param_grid, 
    scoring='neg_mean_squared_error', 
    cv=5, 
    verbose=0
)
# rf_cost_grid.fit(X_train, Y_train_cost)
# best_rf_cost = rf_cost_grid.best_estimator_

# For the sake of demonstration, we'll use a standard fit:
rf_cost = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_cost.fit(X_train, Y_train_cost)
Y_pred_rf_cost = rf_cost.predict(X_test)

# Repeat for Duration Model
rf_duration = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_duration.fit(X_train, Y_train_duration)
Y_pred_rf_duration = rf_duration.predict(X_test)

In [17]:
import xgboost as xgb

# Hyperparameter search space for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1], # Step size shrinkage
    'max_depth': [3, 5]
}

# --- Cost Model (Using Cross-Validation for Tuning) ---
xgb_cost_grid = GridSearchCV(
    xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
    xgb_param_grid,
    scoring='neg_mean_squared_error', 
    cv=5, 
    verbose=0
)
xgb_cost_grid.fit(X_train, Y_train_cost)
best_xgb_cost = xgb_cost_grid.best_estimator_

# For the sake of demonstration, we'll use a standard fit:
xgb_cost = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, 
                            random_state=42, objective='reg:squarederror')
xgb_cost.fit(X_train, Y_train_cost)
Y_pred_xgb_cost = xgb_cost.predict(X_test)

# Repeat for Duration Model
xgb_duration = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, 
                               random_state=42, objective='reg:squarederror')
xgb_duration.fit(X_train, Y_train_duration)
Y_pred_xgb_duration = xgb_duration.predict(X_test)

## Saving Models


In [21]:
# all_models = {
#     'lr_cost': lr_cost,
#     'dt_cost': dt_cost,
#     'rf_cost': rf_cost,
#     'xgb_cost': xgb_cost,
#     'lr_duration': lr_duration,
#     'dt_duration': dt_duration,
#     'rf_duration': rf_duration,
#     'xgb_duration': xgb_duration
# }

# joblib.dump(all_models, 'all_models.pkl')


dict_keys(['lr_cost', 'dt_cost', 'rf_cost', 'xgb_cost', 'lr_duration', 'dt_duration', 'rf_duration', 'xgb_duration'])


## Evaluating Model

In [18]:
# Function to evaluate and reverse transform predictions
def evaluate_model(y_true_log, y_pred_log, model_name):
    # Calculate log-scale metrics (good for internal comparison)
    log_r2 = r2_score(y_true_log, y_pred_log)
    log_mse = mean_squared_error(y_true_log, y_pred_log)

    # Reverse transformation: np.expm1(y) is inverse of np.log1p(y)
    y_true_actual = np.expm1(y_true_log)
    y_pred_actual = np.expm1(y_pred_log)
    
    # Calculate Actual-scale metrics (what the engineer cares about)
    actual_r2 = r2_score(y_true_actual, y_pred_actual)
    actual_mse = mean_squared_error(y_true_actual, y_pred_actual)
    
    return {
        'Model': model_name,
        'R2 (Log)': log_r2,
        'MSE (Log)': log_mse,
        'R2 (Actual)': actual_r2,
        'MSE (Actual)': actual_mse
    }

# Collect all predictions for Cost and Duration
cost_predictions = {
    'Linear Regression': Y_pred_lr_cost,
    'Decision Tree': Y_pred_dt_cost,
    'Random Forest': Y_pred_rf_cost,
    'XGBoost': Y_pred_xgb_cost
}

duration_predictions = {
    'Linear Regression': Y_pred_lr_duration,
    'Decision Tree': Y_pred_dt_duration,
    'Random Forest': Y_pred_rf_duration,
    'XGBoost': Y_pred_xgb_duration
}

# Generate Results
cost_results = [evaluate_model(Y_test_cost, pred, name) for name, pred in cost_predictions.items()]
duration_results = [evaluate_model(Y_test_duration, pred, name) for name, pred in duration_predictions.items()]

cost_df = pd.DataFrame(cost_results).sort_values(by='R2 (Actual)', ascending=False)
duration_df = pd.DataFrame(duration_results).sort_values(by='R2 (Actual)', ascending=False)

print("\n--- Project Cost Prediction Results (Actual Scale Sorted by R2) ---")
print(cost_df.to_markdown(index=False, numalign="left", stralign="left"))

print("\n--- Project Duration Prediction Results (Actual Scale Sorted by R2) ---")
print(duration_df.to_markdown(index=False, numalign="left", stralign="left"))


--- Project Cost Prediction Results (Actual Scale Sorted by R2) ---
| Model             | R2 (Log)   | MSE (Log)   | R2 (Actual)   | MSE (Actual)   |
|:------------------|:-----------|:------------|:--------------|:---------------|
| Linear Regression | -0.0288047 | 0.738065    | -0.174607     | 3.87828e+14    |
| Random Forest     | -0.0258341 | 0.735934    | -0.19439      | 3.94359e+14    |
| XGBoost           | -0.167156  | 0.837318    | -0.295426     | 4.27719e+14    |
| Decision Tree     | -1.44741   | 1.75577     | -1.32489      | 7.67623e+14    |

--- Project Duration Prediction Results (Actual Scale Sorted by R2) ---
| Model             | R2 (Log)   | MSE (Log)   | R2 (Actual)   | MSE (Actual)   |
|:------------------|:-----------|:------------|:--------------|:---------------|
| Linear Regression | -0.064333  | 0.212026    | -0.129809     | 80874.7        |
| Random Forest     | -0.0730254 | 0.213758    | -0.134455     | 81207.3        |
| XGBoost           | -0.168068  | 0.2

## Dashboard

In [19]:
import streamlit as st
import pandas as pd
import numpy as np
import joblib # Or pickle for loading saved models/preprocessors

# --- 1. Load Pre-trained Assets ---
@st.cache_resource # Use Streamlit's caching for efficient loading
def load_assets():
    # Load all models from the dictionary
    all_models = joblib.load('all_models.pkl')
    
    # Pick which models you want to use for prediction
    cost_model = all_models['xgb_cost']       # or 'lr_cost', etc.
    duration_model = all_models['xgb_duration']  # or 'lr_duration', etc.
    
    return cost_model, duration_model, preprocessor


cost_model, duration_model, preprocessor = load_assets()

st.title("üèóÔ∏è ML Project Cost & Duration Predictor")
st.markdown("Forecast construction project metrics using our predictive models.")

# --- 2. Input Panel (Example Features) ---
st.sidebar.header("Project Parameters (What-If Scenario)")

# Example features based on your proposal [cite: 4]
project_type = st.sidebar.selectbox("Project Type:", ['Road', 'Bridge', 'Building', 'Water Facility'])
project_area = st.sidebar.slider("Project Area (sq. meters):", 100, 10000, 5000)
project_region = st.sidebar.selectbox("Project Region:", ['North', 'South', 'East', 'West'])
# Assuming 'expenditure' is an input feature for the model

# Collect all inputs into a DataFrame row
input_data = pd.DataFrame({
    'type': [project_type],
    'area': [project_area],
    'region': [project_region],
    # Add all other required features here
})

# --- 3. Preprocessing and Prediction ---
if st.sidebar.button("Generate Forecast"):
    try:
        # 3. Preprocessing: Transform the input data
        processed_input = preprocessor.transform(input_data)
        
        # 4. Prediction: Get log-transformed predictions
        cost_log_pred = cost_model.predict(processed_input)[0]
        duration_log_pred = duration_model.predict(processed_input)[0]
        
        # 5. Inverse Transform: Convert back to actual scale (assuming natural log was used)
        cost_pred = np.exp(cost_log_pred)
        duration_pred = np.exp(duration_log_pred)
        
        # --- 4. Prediction Display ---
        st.header("Automated Forecasts")
        col1, col2 = st.columns(2)
        
        with col1:
            st.metric(label="Predicted Project Cost (CAD)", 
                      value=f"${cost_pred:,.2f}")

        with col2:
            st.metric(label="Predicted Project Duration (Days)", 
                      value=f"{int(duration_pred):,} days")

        # --- 5. Scenario Visuals (Optional - requires historical data) ---
        st.subheader("Scenario Visualization")
        st.write(f"The predicted cost of ${cost_pred:,.2f} for a **{project_type}** project in the **{project_region}** region is shown below against historical data (Not implemented in this snippet).")

    except Exception as e:
        st.error(f"An error occurred during prediction: {e}")

2025-11-29 12:36:57.214 
  command:

    streamlit run C:\Users\User\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


FileNotFoundError: [Errno 2] No such file or directory: 'best_cost_model.pkl'