Dataset features got cleaned.

In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset (Update the file path as needed)
file_path = r"E:\Capstone_project_25\dataset_with_features.csv"
df = pd.read_csv(file_path)

# Fill text-based columns with "Unknown"
text_columns = ["source_link", "event_description", "location_description", 
                "photo_link", "notes", "storm_name", "admin_division_name"]
df[text_columns] = df[text_columns].fillna("Unknown")

# Fill numerical columns based on their types
# Fatality and Injury counts are set to zero if missing
df["fatality_count"] = df["fatality_count"].fillna(0)
df["injury_count"] = df["injury_count"].fillna(0)

# Fill critical numerical features with median values
num_columns_median = ["Aspect", "Slope", "temperature", "precipitation", 
                      "wind_speed", "distance_to_nearest_landslide"]
for col in num_columns_median:
    df[col] = df[col].fillna(df[col].median())

# Save the cleaned dataset
cleaned_file_path = r"E:\Capstone_project_25\dataset_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)

print("Dataset cleaning completed. Cleaned dataset saved as 'dataset_cleaned.csv'.")


: 

Synthetic 'no landlside data ' generation for dataset.

In [3]:
import numpy as np
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv(r"E:\Capstone_project_25\dataset_cleaned (1).csv")

# Number of synthetic "no landslide" samples to generate (50% of dataset size)
num_synthetic = len(df) // 2

# Randomly sample existing data as a base for "no landslide" cases
synthetic_no_landslide = df.sample(n=num_synthetic, random_state=42).copy()

# Modify key environmental conditions to simulate "no landslide" scenarios
if 'precipitation' in synthetic_no_landslide.columns:
    synthetic_no_landslide['precipitation'] *= np.random.uniform(0.3, 0.7, num_synthetic)  # Reduce rainfall

if 'wind_speed' in synthetic_no_landslide.columns:
    synthetic_no_landslide['wind_speed'] *= np.random.uniform(0.5, 0.9, num_synthetic)  # Reduce wind speed

if 'slope' in synthetic_no_landslide.columns:
    synthetic_no_landslide['slope'] *= np.random.uniform(0.6, 0.9, num_synthetic)  # Lower slope

# Set the target variable to 0 (No Landslide)
synthetic_no_landslide['landslide_occurred'] = 0

# Combine original dataset (landslide = 1) with synthetic data (landslide = 0)
df_balanced = pd.concat([df, synthetic_no_landslide], ignore_index=True)

# Save the new dataset
df_balanced.to_csv("dataset_with_synthetic_no_landslide.csv", index=False)

print("Synthetic no landslide cases added and dataset saved as 'dataset_with_synthetic_no_landslide.csv'")


Synthetic no landslide cases added and dataset saved as 'dataset_with_synthetic_no_landslide.csv'


Balancing with SMOTE

In [8]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load dataset
df_balanced = pd.read_csv(r"E:\Capstone_project_25\dataset_with_synthetic_no_landslide (1).csv")  # Ensure correct file path

# Fill missing values with a placeholder string
df_balanced.fillna("Unknown", inplace=True)

# Identify categorical columns
categorical_cols = df_balanced.select_dtypes(include=['object', 'float', 'int']).columns

# Convert all categorical columns to string type before encoding
df_balanced[categorical_cols] = df_balanced[categorical_cols].astype(str)

# Convert categorical variables to numeric using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_balanced[col] = le.fit_transform(df_balanced[col])
    label_encoders[col] = le  # Store encoders for later use

# Define features (X) and target (y)
X = df_balanced.drop(columns=['landslide_occurred'])  # Features
y = df_balanced['landslide_occurred']  # Target

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame
df_smote = pd.DataFrame(X_resampled, columns=X.columns)
df_smote['landslide_occurred'] = y_resampled  # Add target column

# Save the cleaned & SMOTE-balanced dataset
df_smote.to_csv(r"E:\Capstone_project_25\dataset_balanced_with_SMOTE.csv", index=False)

print("✔ SMOTE-balanced dataset saved successfully as 'dataset_balanced_with_SMOTE.csv'.")


  df_balanced.fillna("Unknown", inplace=True)


✔ SMOTE-balanced dataset saved successfully as 'dataset_balanced_with_SMOTE.csv'.
