# **Importing necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# **Loading the NASA dataset**
### The NASA dataset (POWER Data) provides daily climate variables such as air temperature (T2M, T2M_MAX, T2M_MIN), surface pressure (PS), precipitation (PRECTOTCORR), relative humidity (RH2M), and wind speed (WS2M), recorded with respect to year (YEAR) and day of year (DOY). It represents continuous environmental conditions for the selected geographic location.

In [None]:
# Read the NASA POWER daily export. The first 15 lines of the file are skipped
# before parsing (presumably the POWER metadata preamble — verify if the export changes).
NASA_CSV_PATH = "nasa(India).csv"
NASA_PREAMBLE_ROWS = 15
nasa_data = pd.read_csv(NASA_CSV_PATH, skiprows=NASA_PREAMBLE_ROWS)

In [None]:
# Preview the NASA frame exactly as loaded.
nasa_data

In [None]:
# -999 is used as a missing-value sentinel here; turn it into NaN (in place)
# so the null handling below can see it.
MISSING_SENTINEL = -999
nasa_data.replace({MISSING_SENTINEL: np.nan}, inplace=True)


In [None]:
# Remove, in place, every row that contains at least one NaN in any column.
nasa_data.dropna(inplace=True)

# **Summary of the nasa dataset**

In [None]:
# Column dtypes and non-null counts for the cleaned NASA frame.
nasa_data.info()

# **descriptive statistics of the dataset**

In [None]:
# Summary statistics for the numeric NASA columns.
nasa_data.describe()

# **Checking for the null values**

In [None]:
# Per-column null counts; expected to be all zero because rows with NaN were dropped earlier.
nasa_data.isnull().sum()

# **Checking for the duplicated values**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
nasa_data.duplicated().sum()

# **Loading the EM-DAT data**
### The EM-DAT dataset (Emergency Events Database) contains historical disaster records, including event details such as disaster group, type, subtype, location, country, start and end dates, and socio-economic impacts (e.g., Total Deaths, Total Affected, Total Damage, Insured Damage). It provides structured information on disaster occurrences and their consequences.

In [None]:
# Read the EM-DAT disaster records from the Excel workbook.
EMDAT_XLSX_PATH = "EMD_data.xlsx"
emd_data = pd.read_excel(EMDAT_XLSX_PATH)

In [None]:
# Preview the EM-DAT frame exactly as loaded.
emd_data

# **Summary of the EM-DAT dataset**

In [None]:
# Column dtypes and non-null counts for the EM-DAT frame.
emd_data.info()

# **descriptive statistics of the dataset**

In [None]:
# Summary statistics for the numeric EM-DAT columns.
emd_data.describe()

# **Creating a separate date column in the NASA dataset**

In [None]:
# Build a calendar date from YEAR + day-of-year:
# start at 1 January of the year, then add (DOY - 1) days.
year_start = pd.to_datetime(nasa_data['YEAR'].astype(str), format='%Y')
days_into_year = pd.to_timedelta(nasa_data['DOY'] - 1, unit='D')
nasa_data['DATE'] = year_start + days_into_year


In [None]:
# Preview the NASA frame with the new DATE column.
nasa_data

# **Aggregating NASA data by date (one row per day)**

In [None]:
# Aggregate per DATE. DATE is the daily timestamp built from YEAR + DOY, so the result
# has one row per calendar day (not per year); the aggregations only combine rows that
# share the same date.
df1_nasa=nasa_data.groupby("DATE").agg({
    "T2M": "mean",             # mean temperature for the date
    "T2M_MAX": "mean",         # mean of the max temperature for the date
    "T2M_MIN": "mean",         # mean of the min temperature for the date
    "PS": "mean",              # mean surface pressure for the date
    "PRECTOTCORR": "sum",      # total precipitation for the date
    "RH2M": "mean",            # mean relative humidity for the date
    "WS2M": "mean"             # mean wind speed for the date
}).reset_index()


In [None]:
# Preview the per-date NASA frame.
df1_nasa

# **Checking for the null values**

In [None]:
# Per-column null counts after aggregation.
df1_nasa.isnull().sum()

# **Checking for the duplicated values**

In [None]:
# Number of fully duplicated rows after aggregation.
df1_nasa.duplicated().sum()

# **Creating a date column in the EM-DAT dataset**

In [None]:
# Missing start day / month default to 1 (first of the month / January) so that
# every record can be turned into a full calendar date.
for date_part in ('Start Day', 'Start Month'):
    emd_data[date_part] = emd_data[date_part].fillna(1)

# Assemble the event start date from its year / month / day components.
start_parts = pd.DataFrame({
    'year': emd_data['Start Year'],
    'month': emd_data['Start Month'],
    'day': emd_data['Start Day'],
})
emd_data['DATE'] = pd.to_datetime(start_parts)


# **Adding the flood labels to nasa dataset**

In [None]:
# Distinct calendar dates on which EM-DAT records a flood starting.
is_flood_event = emd_data['Disaster Type'] == 'Flood'
flood_dates = emd_data.loc[is_flood_event, 'DATE'].dt.date.unique()

# Flood label: 1 when the NASA day matches a flood start date, otherwise 0.
nasa_days = df1_nasa['DATE'].dt.date
df1_nasa['Flood'] = nasa_days.isin(flood_dates).astype(int)


# **Updated NASA dataset along with flood labels**

In [None]:
# Preview the per-date NASA frame with the Flood label attached.
df1_nasa

In [None]:
# Class balance of the label: number of non-flood (0) vs flood (1) days.
df1_nasa['Flood'].value_counts()

# **Keeping only the necessary columns in the EM-DAT dataset**

In [None]:
# Keep only the columns needed for the merge and the impact summaries.
# .copy() makes this an independent frame: later cells assign to its columns,
# which on a plain column slice of emd_data raises SettingWithCopyWarning and
# may not behave as intended.
emd_india = emd_data[[
    "DisNo.", "Disaster Group", "Disaster Type","Disaster Subtype", "Location",
    "DATE", "Total Deaths", "Total Affected",
]].copy()


# **Checking for the null values**

In [None]:
# Per-column null counts before imputation.
emd_india.isnull().sum()

# **Filling the missing values**

In [None]:
# Impute missing death counts with the column median.
deaths_median = emd_india['Total Deaths'].median()
emd_india['Total Deaths'] = emd_india['Total Deaths'].fillna(deaths_median)

In [None]:
# Impute missing affected-population counts with the column median.
affected_median = emd_india['Total Affected'].median()
emd_india['Total Affected'] = emd_india['Total Affected'].fillna(affected_median)

# **Checking for the null values**

In [None]:
# Per-column null counts after the median fills.
emd_india.isnull().sum()

# **Checking for the duplicated values**

In [None]:
# Number of fully duplicated disaster records.
emd_india.duplicated().sum()

In [None]:
# Preview the trimmed disaster frame.
emd_india

# **Merging the NASA and EM-DAT datasets**
### After merging the EM-DAT disaster dataset with the NASA climate dataset, the resulting data links disaster events with their corresponding daily climate conditions. Each disaster entry from EM-DAT (with details like type, location, total deaths, and people affected) is now matched with NASA’s environmental variables for the same date (temperature, rainfall, humidity, wind speed, etc.). This combined dataset enables deeper analysis of how specific climate factors might relate to the occurrence, severity, and impacts of disasters.

In [None]:
# Left join: keep every climate day and attach disaster details where the dates match.
# A date with several disaster records yields several rows for that day.
merged = df1_nasa.merge(emd_india, on="DATE", how="left")


In [None]:
# Preview the merged climate + disaster frame.
merged

# **Filling the null values**

In [None]:
## Categorical Data
cat_cols = ["DisNo.","Location","Disaster Group","Disaster Type", "Disaster Subtype"]
merged[cat_cols] = merged[cat_cols].fillna("0")


In [None]:
# Remove the impact columns from the merged frame.
# `columns=` already names the axis, so the redundant axis=1 is dropped, and
# errors="ignore" lets this cell be re-run without a KeyError once the columns are gone.
merged.drop(columns=["Total Deaths","Total Affected"], inplace=True, errors="ignore")


# **Final Merged Dataset**

In [None]:
# Preview the merged frame after filling and dropping columns.
merged

# **Checking for the null values of the merged dataset**

In [None]:
# Per-column null counts in the merged frame.
merged.isnull().sum()

# **Checking for the duplicated values**

In [None]:
# Number of fully duplicated rows in the merged frame.
merged.duplicated().sum()

# **Summary of the merged dataset**

In [None]:
# Column dtypes and non-null counts for the merged frame.
merged.info()

# **descriptive statistics of the dataset**

In [None]:
# Summary statistics for the numeric columns of the merged frame.
merged.describe()

# **Heatmap**

In [None]:
# Drop the daily max/min temperature columns (presumably redundant with T2M — confirm).
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because the columns are already gone.
merged.drop(columns=['T2M_MAX','T2M_MIN'], inplace=True, errors='ignore')

# Pairwise correlation of the numeric columns, annotated with the coefficients.
corr_matrix = merged.select_dtypes(include=['number']).corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True)
heatmap_ax.set_title("Correlation between numeric features")
plt.show()

# **Boxplots for checking any outliers**

In [None]:
# One box plot per numeric column.
# The previous `dtype != 'object'` test also let the datetime64 DATE column through,
# for which a box plot is not a meaningful outlier check; select numeric columns explicitly.
numeric_cols = merged.select_dtypes(include='number').columns
for col in numeric_cols:
    # Pass the values as x= so the call does not depend on how the installed
    # seaborn version interprets a positional Series.
    sns.boxplot(x=merged[col])
    plt.title(col)
    plt.show()

# **Bar plot for flood vs non flood**

In [None]:
# Bar chart of the label distribution (rows per Flood value).
flood_counts = merged['Flood'].value_counts()
flood_counts.plot.bar()


# **Subplots**

In [None]:
# Distribution of each climate feature split by the Flood label.
# (feature column, panel title) in row-major panel order.
flood_panels = [
    ('T2M', "Temperature vs Flood"),
    ('PS', "Pressure vs Flood"),
    ('PRECTOTCORR', "Precipitation  vs Flood"),
    ('RH2M', "Humidity  vs Flood"),
    ('WS2M', "Wind Speed vs Flood"),
]

fig,axs=plt.subplots(3,2,figsize=(12,12))
panel_axes = axs.ravel()

for panel_ax, (feature, panel_title) in zip(panel_axes, flood_panels):
    sns.boxplot(data=merged, y=feature, x='Flood', ax=panel_ax)
    panel_ax.set_title(panel_title)

# The 3x2 grid has six slots but only five features: hide the unused axis
# instead of rendering an empty frame.
for spare_ax in panel_axes[len(flood_panels):]:
    spare_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Make sure DATE is a datetime column so the .dt accessors used next are available
# (has no effect if the column is already datetime).
merged['DATE'] = pd.to_datetime(merged['DATE'])


In [None]:
# Calendar features derived from the observation date.
merged['Year'] = merged['DATE'].dt.year
merged['Month'] = merged['DATE'].dt.month

# Season codes: 1=Winter, 2=Summer, 3=Monsoon, 4=Post-Monsoon.
# The previous formula (Month % 12 // 3 + 1) produced Dec-Feb / Mar-May / Jun-Aug / Sep-Nov,
# which puts September under "Post-Monsoon" and December under "Winter". The labels are
# the Indian seasons, so map months explicitly:
#   Winter Jan-Feb, Summer (pre-monsoon) Mar-May, Monsoon Jun-Sep, Post-Monsoon Oct-Dec.
# NOTE(review): assumes the data is for India (per the file name) — confirm.
SEASON_BY_MONTH = {
    1: 1, 2: 1,
    3: 2, 4: 2, 5: 2,
    6: 3, 7: 3, 8: 3, 9: 3,
    10: 4, 11: 4, 12: 4,
}
merged['Season'] = merged['Month'].map(SEASON_BY_MONTH)


# **Count plots of flood occurrences by month and season**

In [None]:
# Row counts per calendar grouping, with bars split by the Flood label.
# seaborn / matplotlib are already imported in the first cell of the notebook.
countplot_specs = (
    ('Month', (10,5), "Flood occurrences by Month"),
    ('Season', (8,5), "Flood occurrences by Season"),
)
for group_col, fig_size, plot_title in countplot_specs:
    plt.figure(figsize=fig_size)
    sns.countplot(data=merged, x=group_col, hue='Flood')
    plt.title(plot_title)
    plt.show()


# **Line plot of yearly flood occurrences**

In [None]:
# Number of flood-labelled rows per year.
yearly = merged.groupby('Year')['Flood'].sum()

# Draw on an explicit Axes rather than through the pyplot state machine.
yearly_fig, yearly_ax = plt.subplots(figsize=(10,5))
sns.lineplot(x=yearly.index, y=yearly.values, marker="o", ax=yearly_ax)
yearly_ax.set_title("Yearly Flood Occurrences")
yearly_ax.set_xlabel("Year")
yearly_ax.set_ylabel("Number of Floods")
plt.show()


# **Standard Scaling**

In [None]:
# Features: everything except the target and the raw timestamp.
X = merged.drop(columns=['Flood','DATE'])

# Keep every numeric column. Selecting only 'int64'/'float64' silently discarded
# Year / Month (and anything derived from them) on pandas versions where the .dt
# accessors return int32, so select by the generic 'number' kind instead.
X = X.select_dtypes(include='number')

# Standardise to zero mean / unit variance.
# NOTE: these statistics come from ALL rows; any evaluation that holds out part of
# X_scaled has seen test-set statistics. Prefer fitting the scaler on training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Binary target: 1 = flood day, 0 = no flood.
y = merged['Flood']


In [None]:
# Preview the (unscaled) numeric feature matrix.
X

In [None]:
# Preview the target vector.
y

# **Splitting the data into training and test sets**

In [None]:
from sklearn.model_selection import train_test_split

# Split the UNSCALED features first. Scaling all rows before splitting lets the
# test rows influence the mean/std used to transform the training data (data leakage).
# Same test_size / random_state / stratify as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


# **Oversampling the minority class with SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
resampled_features, resampled_labels = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_features, resampled_labels


In [None]:
# Preview the resampled training features.
X_train_res

In [None]:
# Preview the resampled training labels.
y_train_res

# **Training the model**

### **Using Logistic Regression**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic regression baseline, trained on the SMOTE-resampled training data.
logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
logreg.fit(X_train_res, y_train_res)

# Evaluate on the untouched test split: per-class report first, then the confusion matrix.
y_pred = logreg.predict(X_test)
logreg_report = classification_report(y_test, y_pred)
logreg_confusion = confusion_matrix(y_test, y_pred)
print(logreg_report)
print(logreg_confusion)


### **Using Random forest classifier**

In [None]:
# Random forest trained on the SMOTE-resampled training data.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(X_train_res, y_train_res)

# Evaluate on the untouched test split: per-class report first, then the confusion matrix.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_report)
print(rf_confusion)


## **Precision-Recall Curve**

In [None]:
from sklearn.metrics import precision_recall_curve

# Probability of the positive (flood) class for each test row.
y_proba_rf = rf.predict_proba(X_test)[:,1]

# Precision / recall at every candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_rf)

rf_pr_fig, rf_pr_ax = plt.subplots()
rf_pr_ax.plot(recall, precision)
rf_pr_ax.set_xlabel("Recall")
rf_pr_ax.set_ylabel("Precision")
rf_pr_ax.set_title("Precision-Recall Curve for Random Forest")
plt.show()


### **Using  XGBClassifier**

In [None]:
# XGBClassifier was used without being imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel.
from xgboost import XGBClassifier

# scale_pos_weight was also never defined. Use the usual choice for imbalanced binary
# problems: (negative training rows) / (positive training rows). The model is fitted
# on the un-resampled training split, so the ratio is computed from y_train.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
scale_pos_weight = n_negative / n_positive if n_positive else 1.0

best_xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',  # area under the PR curve instead of logloss
    scale_pos_weight=scale_pos_weight,
    colsample_bytree=1.0,
    learning_rate=0.05,
    max_depth=5,
    n_estimators=500,
    subsample=0.8,
    random_state=42
)
best_xgb.fit(X_train, y_train)


In [None]:
# Second column of predict_proba: predicted probability of the positive class for each test row.
y_proba_aucpr = best_xgb.predict_proba(X_test)[:, 1]


In [None]:
# Inspect the predicted positive-class probabilities.
y_proba_aucpr 


## **Precision-Recall Curve**

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision / recall of the XGBoost probabilities at every candidate threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_aucpr)

xgb_pr_fig, xgb_pr_ax = plt.subplots()
xgb_pr_ax.plot(recall, precision)
xgb_pr_ax.set_xlabel("Recall")
xgb_pr_ax.set_ylabel("Precision")
xgb_pr_ax.set_title("Precision-Recall Curve (XGB with aucpr)")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

# Precision / recall at every candidate threshold.
# precision and recall have ONE MORE element than thresholds: the final point
# (precision=1, recall=0) has no threshold associated with it.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_aucpr)

# F1 at each point; the small epsilon avoids 0/0 where precision + recall == 0.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)

# Best threshold = threshold that maximizes F1.
# Search only the entries that have a threshold: taking argmax over the full array
# could return the extra last index and make thresholds[best_idx] raise IndexError.
best_idx = int(np.argmax(f1_scores[:-1]))
best_threshold = thresholds[best_idx]

# NOTE: the threshold is tuned on the test split, so the scores below are optimistic;
# tune on a separate validation split for an unbiased estimate.
print("🔹 Best threshold by F1:", best_threshold)
print("Precision:", precision[best_idx])
print("Recall:", recall[best_idx])
print("F1 Score:", f1_scores[best_idx])

# Apply the chosen threshold
y_pred_best = (y_proba_aucpr >= best_threshold).astype(int)

# Final evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))

# Visualize how the three scores move with the threshold
# ([:-1] drops the final point that has no threshold).
plt.figure(figsize=(8,5))
plt.plot(thresholds, precision[:-1], label="Precision")
plt.plot(thresholds, recall[:-1], label="Recall")
plt.plot(thresholds, f1_scores[:-1], label="F1")
plt.axvline(x=best_threshold, color='r', linestyle='--', label=f"Best Thresh = {best_threshold:.4f}")
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.legend()
plt.title("Precision, Recall, and F1 vs Threshold")
plt.show()
