In [35]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pickle

In [36]:
# Material consumption rates (units/m²) - Nigeria standards
MATERIAL_RATES = {
    'Residential': {'cement': 0.5, 'blocks': 10, 'steel': 0.025, 'sand': 0.05, 'granite': 0.03},
    'Commercial': {'cement': 0.7, 'blocks': 15, 'steel': 0.04, 'sand': 0.07, 'granite': 0.05},
    'Industrial': {'cement': 1.0, 'blocks': 20, 'steel': 0.06, 'sand': 0.10, 'granite': 0.08}
}

In [37]:
SHAPE_COMPLEXITY = {
    'Rectangular': 1.0,
    'Square': 1.00,
    'L-Shaped': 1.15,
    'U-Shaped': 1.2,
    'Circular': 1.25
}

PLANT_RATES = {
    'Residential': 0.08,  # 8% of material + labor cost
    'Commercial': 0.12,   # 12%
    'Industrial': 0.18    # 18%
}

# Labor productivity rates (hours/m²) for different building types
LABOR_PRODUCTIVITY_RATES = {
    'Residential': {
        'foundation': 1.2,
        'structural': 2.0,
        'finishing': 1.5
    },
    'Commercial': {
        'foundation': 1.5,
        'structural': 2.5,
        'finishing': 2.0
    },
    'Industrial': {
        'foundation': 2.0,
        'structural': 3.0,
        'finishing': 2.5
    }
}

# Labor efficiency factors
EFFICIENCY_FACTORS = {
    'crew_size': {
        'Small': 0.9,    # Less than 15 workers
        'Medium': 1.0,   # 15-30 workers
        'Large': 1.1     # More than 30 workers
    },
    'weather': {
        'Good': 1.0,
        'Average': 0.9,
        'Poor': 0.8
    }
}

RISK_FACTORS = {'prelim': 0.05, 'cont': 0.075, 'design': 0.02}
CIRCULATION_SPACE = 0.20
STANDARD_WORK_HOURS = 8
OVERHEAD_PROFIT = 0.15

In [38]:
def calculate_gfa(length, breadth, storeys, shape):
    base_area = length * breadth * storeys
    circulation_deduction = base_area * CIRCULATION_SPACE
    shape_multiplier = SHAPE_COMPLEXITY[shape]
    return (base_area + circulation_deduction) * shape_multiplier

In [39]:
def calculate_labor_hours(gfa, building_type, crew_size, weather_condition):
    # Calculate base labor hours
    base_hours = sum([
        LABOR_PRODUCTIVITY_RATES[building_type]['foundation'],
        LABOR_PRODUCTIVITY_RATES[building_type]['structural'],
        LABOR_PRODUCTIVITY_RATES[building_type]['finishing']
    ]) * gfa
    
    # Apply efficiency factors
    crew_efficiency = EFFICIENCY_FACTORS['crew_size']['Medium' if 15 <= crew_size <= 30 
                                                     else 'Large' if crew_size > 30 
                                                     else 'Small']
    weather_efficiency = EFFICIENCY_FACTORS['weather'][weather_condition]
    
    return base_hours / (crew_efficiency * weather_efficiency)

In [40]:
def calculate_features(df):
    # Calculate labor hours and costs
    df['Labor_Hours'] = df.apply(lambda x: calculate_labor_hours(
        x['GFA'],
        x['Type'],
        x['Workers'],
        x['Weather']
    ), axis=1)
    
    # Calculate labor cost including overhead and profit
    df['Labor_Cost'] = df['Labor_Hours'] * df['Labor_Rate'] * (1 + OVERHEAD_PROFIT)
    
    # Material costs
    df['Material_Cost'] = df.apply(lambda x: sum([
        x['Cement_Price'] * MATERIAL_RATES[x['Type']]['cement'] * x['GFA'],
        x['Block_Price'] * MATERIAL_RATES[x['Type']]['blocks'] * x['GFA'],
        x['Steel_Price'] * MATERIAL_RATES[x['Type']]['steel'] * x['GFA'],
        x['Sand_Price'] * MATERIAL_RATES[x['Type']]['sand'] * x['GFA'],
        x['Granite_Price'] * MATERIAL_RATES[x['Type']]['granite'] * x['GFA']
    ]), axis=1)
    
    # Plant/machinery costs
    df['Plant_Cost'] = (df['Material_Cost'] + df['Labor_Cost']) * df['Type'].map(PLANT_RATES)
    
    # Feature engineering
    df['Shape_Complexity'] = df['Shape'].map(SHAPE_COMPLEXITY)
    df['Vertical_Complexity'] = df['Storeys'] * 0.25
    df['Site_Difficulty'] = df['Soil'].map({'Rocky':1.5, 'Sandy':1.2, 'Clay':1.0}) \
                           * df['Access'].map({'Poor':1.4, 'Average':1.1, 'Good':1.0})
    
    # Productivity metrics
    df['Labor_Productivity'] = df['GFA'] / (df['Labor_Hours'] * df['Workers'])
    
    # Final costs
    base_cost = df['Material_Cost'] + df['Labor_Cost'] + df['Plant_Cost']
    df['Total_Cost'] = base_cost * (1 + sum(RISK_FACTORS.values())) \
                      * df['Location'].map({'Urban':1.1, 'Suburban':1.05, 'Rural':1.0}) \
                      * df['Site_Difficulty'] \
                      * df['Shape_Complexity']
    
    # Duration calculation (in days)
    df['Duration'] = df['Labor_Hours'] / (df['Workers'] * STANDARD_WORK_HOURS)
    df['Duration'] *= df['Site_Difficulty']  # Adjust for site conditions
    
    return df

# Generate synthetic data
np.random.seed(42)
n_samples = 15000

data = {
    'Length': np.random.randint(15, 50, n_samples),
    'Breadth': np.random.randint(10, 40, n_samples),
    'Storeys': np.random.randint(0, 11, n_samples),
    'Shape': np.random.choice(list(SHAPE_COMPLEXITY.keys()), n_samples),
    'Type': np.random.choice(list(MATERIAL_RATES.keys()), n_samples),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples),
    'Soil': np.random.choice(['Rocky', 'Sandy', 'Clay'], n_samples),
    'Access': np.random.choice(['Good', 'Average', 'Poor'], n_samples),
    'Weather': np.random.choice(['Good', 'Average', 'Poor'], n_samples),
    'Cement_Price': np.random.randint(9500, 10500, n_samples),
    'Block_Price': np.random.randint(500, 750, n_samples),
    'Steel_Price': np.random.randint(700000, 1250000, n_samples),
    'Sand_Price': np.random.randint(30000, 45000, n_samples),
    'Granite_Price': np.random.randint(19000, 21000, n_samples),
    'Labor_Rate': np.random.randint(1000, 2500, n_samples),  # Labor rate per hour
    'Workers': np.random.randint(10, 50, n_samples),
    'Permit_Months': np.random.randint(1, 12, n_samples)
}

df = pd.DataFrame(data)
df['GFA'] = df.apply(lambda x: calculate_gfa(x['Length'], x['Breadth'], x['Storeys'], x['Shape']), axis=1)
df = calculate_features(df)

In [41]:
# Prepare ML data
X = df.drop(['Total_Cost', 'Duration', 'Material_Cost', 'GFA', 'Plant_Cost', 
             'Labor_Hours', 'Labor_Productivity'], axis=1)
y = df['Total_Cost']

In [42]:
# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Preprocess and train
preprocessor = ColumnTransformer([
    ('encode', OneHotEncoder(), ['Type', 'Location', 'Soil', 'Access', 'Shape', 'Weather'])
], remainder='passthrough')

model = RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42)
model.fit(preprocessor.fit_transform(X_train), y_train)


In [44]:
# Calculate metrics
from sklearn.metrics import r2_score, mean_absolute_error

# Train metrics
y_train_pred = model.predict(preprocessor.transform(X_train))
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)

In [45]:
# Test metrics
y_pred = model.predict(preprocessor.transform(X_test))
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)

metrics = {
    'train_r2': train_r2,
    'train_mae': train_mae,
    'test_r2': test_r2,
    'test_mae': test_mae
}

print('Training R²:', metrics['train_r2'])
print('Training MAE:', metrics['train_mae'])
print('Validation R²:', metrics['test_r2'])
print('Validation MAE:', metrics['test_mae'])

Training R²: 0.9944284355917168
Training MAE: 36047819.39166999
Validation R²: 0.9548909449238221
Validation MAE: 100033463.54506245


In [46]:
# Save artifacts
pickle.dump(model, open('cost_only_model.pkl', 'wb'))
pickle.dump(preprocessor, open('preprocessor_cost.pkl', 'wb'))
pickle.dump(metrics, open('metrics_cost.pkl', 'wb'))