In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the raw dataset into the working frame.
# NOTE(review): the path ends in '.csv' but is opened with read_excel —
# confirm the file really is an Excel workbook.
df = pd.read_excel('tarrodan_dam.csv')

# Quick profile of the raw frame: dimensions first, then null count per column.
missing_per_column = df.isnull().sum()
print("Shape of data:", df.shape)
print("\nMissing values in each column:", missing_per_column, sep="\n")

def clean_assessment(value):
    """Normalise one assessment value.

    Missing values and the literal string 'undefined' are mapped to
    'Not Available'; every other value is returned unchanged.
    """
    is_placeholder = pd.isna(value) or value == 'undefined'
    return 'Not Available' if is_placeholder else value

# Replace missing / 'undefined' ratings element-wise via clean_assessment.
assessment_raw = df['Assessment']
df['Assessment'] = assessment_raw.apply(clean_assessment)

In [None]:
def clean_date(date_str):
    """Parse a single raw date value on a best-effort basis.

    Parameters
    ----------
    date_str : object
        Raw cell value; may be missing.

    Returns
    -------
    object
        The result of ``pd.to_datetime(date_str)``, or ``np.nan`` when the
        value is missing or cannot be parsed.
    """
    if pd.isna(date_str):
        return np.nan
    try:
        return pd.to_datetime(date_str)
    except Exception:
        # Any parsing failure is treated as "no date". Catching Exception
        # rather than using a bare except keeps the best-effort behaviour
        # while letting KeyboardInterrupt / SystemExit propagate.
        return np.nan

# Parse both date columns element-wise with clean_date.
for date_col in ('Last Inspection Date', 'Assessment Date'):
    df[date_col] = df[date_col].apply(clean_date)

# Show the hazard distribution before it is modified.
hazard_counts = df['Hazard'].value_counts()
print("\nUnique hazard values before cleaning:")
print(hazard_counts)

# 'Undetermined' hazards are folded into the 'Low' class.
df['Hazard'] = df['Hazard'].replace('Undetermined', 'Low')

# Coerce measurement columns to numeric; unparseable entries become NaN.
numeric_cols = ['Height (m)', 'Length (km)', 'Volume (m3)',
                'Surface (km2)', 'Drainage (km2)', 'Probability of Failure']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Persist the cleaned frame without the index column.
df.to_excel('tarrodan_dam_cleaned.xlsx', index=False)

print("\nCleaning Summary:")
for summary_line in (
    "1. Standardized Assessment ratings",
    "2. Cleaned date formats",
    "3. Handled Undetermined hazard cases",
    "4. Converted numeric columns to proper type",
):
    print(summary_line)

In [None]:
def _coerce_and_fill_by_group(frame, columns, group_col):
    """Coerce ``columns`` to numeric in place and fill gaps with a group median.

    For each column, unparseable values become NaN and every NaN is then
    replaced by the median of that column within the row's ``group_col``
    group. Rows whose group has no usable median stay NaN.
    """
    for col in columns:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')
        group_median = frame.groupby(group_col)[col].transform('median')
        frame[col] = frame[col].fillna(group_median)


# Start again from the raw file so this cell does not depend on earlier cleaning.
# NOTE(review): the path ends in '.csv' but is opened with read_excel —
# confirm the file really is an Excel workbook.
df = pd.read_excel('tarrodan_dam.csv')

# Missing and 'undefined' assessments share one explicit label.
df['Assessment'] = df['Assessment'].fillna('Not Available')
df['Assessment'] = df['Assessment'].replace('undefined', 'Not Available')

# Column-wise date parsing; unparseable values become NaT.
df['Last Inspection Date'] = pd.to_datetime(df['Last Inspection Date'], errors='coerce')
df['Assessment Date'] = pd.to_datetime(df['Assessment Date'], errors='coerce')

# 'Undetermined' hazards are folded into the 'Low' class.
df['Hazard'] = df['Hazard'].replace('Undetermined', 'Low')

# Physical measurements: gaps filled with the median of the same Region.
numeric_cols = ['Height (m)', 'Length (km)', 'Volume (m3)', 'Surface (km2)', 'Drainage (km2)']
_coerce_and_fill_by_group(df, numeric_cols, 'Region')

# Loss components: gaps filled with the median of the same Hazard class.
loss_cols = ['Loss given failure - prop (Qm)', 'Loss given failure - liab (Qm)', 'Loss given failure - BI (Qm)']
_coerce_and_fill_by_group(df, loss_cols, 'Hazard')

# Elapsed time since the last inspection, in years of 365.25 days, measured
# from the moment the cell runs; missing values take the column median.
seconds_per_year = 365.25 * 24 * 60 * 60
df['Years Since Inspection'] = (pd.Timestamp.now() - df['Last Inspection Date']).dt.total_seconds() / seconds_per_year
df['Years Since Inspection'] = df['Years Since Inspection'].fillna(df['Years Since Inspection'].median())

df['Total Loss Given Failure'] = df['Loss given failure - prop (Qm)'] + df['Loss given failure - liab (Qm)'] + df['Loss given failure - BI (Qm)']

# The frame was re-read from the raw file above, so this column has not been
# coerced yet; do it here so the product below is always numeric arithmetic.
df['Probability of Failure'] = pd.to_numeric(df['Probability of Failure'], errors='coerce')
df['Risk Score'] = df['Probability of Failure'] * df['Total Loss Given Failure']

# Ordinal encoding of the hazard class; labels outside this mapping become NaN.
df['Hazard_Numeric'] = df['Hazard'].map({'Low': 1, 'Significant': 2, 'High': 3})

df.to_excel('tarrodan_dam_cleaned_v2.xlsx', index=False)

print("Missing values after cleaning:")
print(df.isnull().sum())

print("\nRisk Score Summary by Region:")
print(df.groupby('Region')['Risk Score'].describe())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer

# Reload the dataset written by the previous cleaning step.
df = pd.read_excel('tarrodan_dam_cleaned_v2.xlsx')

# --- Derived features ---
# Age relative to the hard-coded reference year 2025; non-numeric completion
# years become NaN.
completion_year = pd.to_numeric(df['Year Completed'], errors='coerce')
df['Age'] = 2025 - completion_year
# Integer category codes of the assessment label (a missing label gets -1).
df['Maintenance_Score'] = pd.Categorical(df['Assessment']).codes
# Hazard level scaled by inverse distance to the nearest city (+1 avoids 1/0).
city_proximity = 1 / (df['Distance to Nearest City (km)'] + 1)
df['Population_Risk'] = df['Hazard_Numeric'] * city_proximity
# Volume times height, scaled down by one million.
df['Volume_Risk'] = df['Volume (m3)'] * df['Height (m)'] / 1000000

# Share of each loss component in the total loss.
total_loss = df['Total Loss Given Failure']
for weight_col, loss_col in (
    ('Property_Loss_Weight', 'Loss given failure - prop (Qm)'),
    ('Liability_Loss_Weight', 'Loss given failure - liab (Qm)'),
    ('Business_Loss_Weight', 'Loss given failure - BI (Qm)'),
):
    df[weight_col] = df[loss_col] / total_loss

features = [
    'Height (m)', 'Volume (m3)', 'Hazard_Numeric', 'Surface (km2)',
    'Age', 'Maintenance_Score', 'Population_Risk', 'Volume_Risk',
    'Property_Loss_Weight', 'Liability_Loss_Weight', 'Business_Loss_Weight',
]

# Design matrix and target (total loss multiplied by failure probability).
X = df[features]
y = df['Total Loss Given Failure'] * df['Probability of Failure']

# Median-impute remaining gaps. The imputer is fitted on all rows, before the split.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

gbr_settings = {
    'n_estimators': 200,
    'max_depth': 5,
    'min_samples_split': 20,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = GradientBoostingRegressor(**gbr_settings)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Hold-out metrics.
holdout_r2 = r2_score(y_test, y_pred)
holdout_mae = mean_absolute_error(y_test, y_pred)
print("Model Performance:")
print(f"R2 Score: {holdout_r2:.4f}")
print(f"MAE: {holdout_mae:.4f}")

# Importances paired with feature names, largest first.
importance_table = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Re-derive the engineered columns on df, this time median-filling their inputs.
# NOTE(review): this cell does not reassign X, so anything built from the
# existing X does not see these fills — confirm that is intended.
age_years = 2025 - pd.to_numeric(df['Year Completed'], errors='coerce')
df['Age'] = age_years.fillna(age_years.median())
df['Maintenance_Score'] = pd.Categorical(df['Assessment']).codes
df['Volume_Risk'] = df['Volume (m3)'] * df['Height (m)'] / 1000000

# Fill gaps in both Population_Risk inputs before combining them.
for gap_col in ('Hazard_Numeric', 'Distance to Nearest City (km)'):
    df[gap_col] = df[gap_col].fillna(df[gap_col].median())
df['Population_Risk'] = df['Hazard_Numeric'] * (1 / (df['Distance to Nearest City (km)'] + 1))

# Share of each loss component in the total loss.
total_loss = df['Total Loss Given Failure']
for weight_col, loss_col in (
    ('Property_Loss_Weight', 'Loss given failure - prop (Qm)'),
    ('Liability_Loss_Weight', 'Loss given failure - liab (Qm)'),
    ('Business_Loss_Weight', 'Loss given failure - BI (Qm)'),
):
    df[weight_col] = df[loss_col] / total_loss




In [None]:
# Count of missing values per model feature.
missing_per_feature = df[features].isnull().sum()
print(missing_per_feature)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Refit the gradient-boosting regressor on the current train split
# (the unscaled X_train) and score it on the matching test split.
gbr_settings = {
    'n_estimators': 200,
    'max_depth': 5,
    'min_samples_split': 20,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = GradientBoostingRegressor(**gbr_settings)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

holdout_r2 = r2_score(y_test, y_pred)
holdout_mae = mean_absolute_error(y_test, y_pred)
print("Model Performance:")
print(f"R2 Score: {holdout_r2:.4f}")
print(f"Mean Absolute Error: {holdout_mae:.4f}")





In [None]:
# Second model: predict the total loss given failure from the same features.
target_cost = 'Total Loss Given Failure'
y_cost = df[target_cost]

# NOTE(review): this rebinds X_train / X_test / y_train / y_test, so cells
# that run afterwards see the cost split under those names.
X_train, X_test, y_train, y_test = train_test_split(X, y_cost, test_size=0.2, random_state=42)

cost_gbr_settings = {
    'n_estimators': 200,
    'max_depth': 5,
    'min_samples_split': 20,
    'learning_rate': 0.05,
    'random_state': 42,
}
model_cost = GradientBoostingRegressor(**cost_gbr_settings)
model_cost.fit(X_train, y_train)

y_pred_cost = model_cost.predict(X_test)

cost_r2 = r2_score(y_test, y_pred_cost)
cost_mae = mean_absolute_error(y_test, y_pred_cost)
print("Cost Model Performance:")
print(f"R2 Score: {cost_r2:.4f}")
print(f"Mean Absolute Error: {cost_mae:.4f}")


In [None]:
from sklearn.model_selection import cross_val_score

# `model` is fit on `y`, which this notebook defines as
# Total Loss Given Failure * Probability of Failure, so its output is labelled
# as a risk score rather than as a failure probability.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance for Risk Score (Probability x Loss):")
print(feature_importance)


# `model_cost` is fit on Total Loss Given Failure.
feature_importance_cost = pd.DataFrame({
    'feature': X.columns,
    'importance': model_cost.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance for Cost of Failure:")
print(feature_importance_cost)


# 5-fold cross-validated MAE. The scorer returns negated errors, hence the
# sign flip when printing.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-Validation MAE for Risk Score (Probability x Loss): {-cv_scores.mean():.4f}")

cv_scores_cost = cross_val_score(model_cost, X, y_cost, cv=5, scoring='neg_mean_absolute_error')
print(f"Cross-Validation MAE for Cost of Failure: {-cv_scores_cost.mean():.4f}")



In [None]:

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Reload the cleaned dataset so this cell does not depend on earlier frame state.
df = pd.read_excel('tarrodan_dam_cleaned_v2.xlsx')

print("Available columns:")
print(df.columns.tolist())

# Age relative to the hard-coded reference year 2025; gaps take the median age.
age_years = 2025 - pd.to_numeric(df['Year Completed'], errors='coerce')
df['Age'] = age_years.fillna(age_years.median())

# Integer category codes of the assessment label (a missing label gets -1).
df['Maintenance_Score'] = pd.Categorical(df['Assessment']).codes

if 'Hazard_Numeric' not in df.columns:
    df['Hazard_Numeric'] = df['Hazard'].map({'Low': 1, 'Significant': 2, 'High': 3})
# Hazard labels outside the mapping are NaN; fill with the median so that
# Population_Risk and the model input stay finite.
df['Hazard_Numeric'] = df['Hazard_Numeric'].fillna(df['Hazard_Numeric'].median())

# Distance gaps are median-filled for this feature only; the source column
# itself is left untouched.
distance_km = df['Distance to Nearest City (km)']
distance_km = distance_km.fillna(distance_km.median())
df['Population_Risk'] = df['Hazard_Numeric'] * (1 / (distance_km + 1))

df['Volume_Risk'] = df['Volume (m3)'] * df['Height (m)'] / 1000000

# Share of each loss component in the total loss. A zero total is masked to
# NaN first so the ratio is NaN (then 0) instead of +/-inf, which the
# regressor would reject.
safe_total_loss = df['Total Loss Given Failure'].where(df['Total Loss Given Failure'] != 0)
df['Property_Loss_Weight'] = df['Loss given failure - prop (Qm)'] / safe_total_loss
df['Liability_Loss_Weight'] = df['Loss given failure - liab (Qm)'] / safe_total_loss
df['Business_Loss_Weight'] = df['Loss given failure - BI (Qm)'] / safe_total_loss

for col in ['Property_Loss_Weight', 'Liability_Loss_Weight', 'Business_Loss_Weight']:
    df[col] = df[col].fillna(0)

features = ['Height (m)', 'Volume (m3)', 'Hazard_Numeric', 'Surface (km2)',
            'Age', 'Maintenance_Score', 'Population_Risk', 'Volume_Risk',
            'Property_Loss_Weight', 'Liability_Loss_Weight', 'Business_Loss_Weight']

X = df[features]
# GradientBoostingRegressor does not accept NaN inputs; median-fill any gaps
# that survived cleaning. This is a no-op when the features are complete.
X = X.fillna(X.median())
y_prob = df['Probability of Failure']
y_cost = df['Total Loss Given Failure']

# One shared split so both targets are evaluated on the same rows.
X_train, X_test, y_prob_train, y_prob_test, y_cost_train, y_cost_test = train_test_split(
    X, y_prob, y_cost, test_size=0.2, random_state=42)

model_prob = GradientBoostingRegressor(n_estimators=200, max_depth=5, min_samples_split=20,
                                      learning_rate=0.05, random_state=42)
model_prob.fit(X_train, y_prob_train)

model_cost = GradientBoostingRegressor(n_estimators=200, max_depth=5, min_samples_split=20,
                                      learning_rate=0.05, random_state=42)
model_cost.fit(X_train, y_cost_train)

y_prob_pred = model_prob.predict(X_test)
y_cost_pred = model_cost.predict(X_test)

print("Failure Probability Model Performance:")
print(f"R2 Score: {r2_score(y_prob_test, y_prob_pred):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_prob_test, y_prob_pred):.4f}")

print("\nFailure Cost Model Performance:")
print(f"R2 Score: {r2_score(y_cost_test, y_cost_pred):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_cost_test, y_cost_pred):.2f}")

# Predictions for every row, including the rows the models were trained on.
df['Predicted_Failure_Probability'] = model_prob.predict(X)
df['Predicted_Failure_Cost'] = model_cost.predict(X)
df['Expected_Loss'] = df['Predicted_Failure_Probability'] * df['Predicted_Failure_Cost']

# Sum of per-row expected losses, and a flat x10 extrapolation of that sum.
total_expected_loss = df['Expected_Loss'].sum()
expected_loss_10_years = total_expected_loss * 10

print(f"\nTotal Expected Loss: ${total_expected_loss:,.2f}")
print(f"Expected Loss over 10 Years: ${expected_loss_10_years:,.2f}")


In [None]:
import numpy as np

# Projection settings.
annual_deterioration = 0.05  # relative growth in failure probability per year
discount_rate = 0.03         # rate used to discount each year's loss
years = 10

yearly_probabilities = np.zeros((len(df), years))
yearly_losses = np.zeros(years)

# Year 0 starts from the model prediction. A regressor's output is not bounded,
# so clip it to [0, 1]; later years were already capped at 1.0, and a negative
# start would otherwise compound into negative expected losses.
yearly_probabilities[:, 0] = np.clip(df['Predicted_Failure_Probability'].values, 0.0, 1.0)

# Each later year grows the previous year's probability by the deterioration
# rate, capped at 1.0.
for year in range(1, years):
    yearly_probabilities[:, year] = np.minimum(
        yearly_probabilities[:, year-1] * (1 + annual_deterioration),
        1.0
    )

# Expected loss per year: probability times predicted cost, summed over rows.
for year in range(years):
    yearly_losses[year] = (yearly_probabilities[:, year] * df['Predicted_Failure_Cost']).sum()

# Discount each year's loss; the exponent starts at 0, so the first year is
# not discounted.
present_value_losses = yearly_losses / (1 + discount_rate) ** np.arange(years)

total_expected_loss_10_years = present_value_losses.sum()

print(f"Year-by-year expected losses:")
for year in range(years):
    print(f"Year {year+1}: ${yearly_losses[year]:,.2f}")

print(f"\nTotal expected loss over 10 years: ${total_expected_loss_10_years:,.2f}")
print(f"Simple 10-year projection (for comparison): ${yearly_losses[0] * 10:,.2f}")

In [None]:
# Summary of predicted failure probabilities, reported as percentages.
predicted_prob = df['Predicted_Failure_Probability']
avg_failure_prob = predicted_prob.mean() * 100

# "High risk" means strictly above the 90th percentile of predicted probability.
high_risk_threshold = predicted_prob.quantile(0.90)
high_risk_dams = df.loc[predicted_prob > high_risk_threshold]
high_risk_count = len(high_risk_dams)
high_risk_avg_prob = high_risk_dams['Predicted_Failure_Probability'].mean() * 100

region_breakdown = high_risk_dams['Region'].value_counts()

print(f"Average failure probability across all dams: {avg_failure_prob:.2f}%")
print(f"Number of high-risk dams (top 10%): {high_risk_count}")
print(f"Average failure probability for high-risk dams: {high_risk_avg_prob:.2f}%")
print("High-risk dams by region:")
print(region_breakdown)