In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [31]:
df = pd.read_csv("water_scarcity.csv")

In [32]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5856 entries, 0 to 5855
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   5856 non-null   object 
 1   StationCode            5856 non-null   object 
 2   State                  5856 non-null   object 
 3   District               5856 non-null   object 
 4   Rainfall (mm)          5856 non-null   float64
 5   Groundwater Level (m)  5856 non-null   float64
 6   Temperature (°C)       5856 non-null   float64
 7   River Water Level (m)  5856 non-null   float64
 8   Scarcity               5856 non-null   int64  
dtypes: float64(4), int64(1), object(4)
memory usage: 411.9+ KB
None


In [33]:
df.head()

Unnamed: 0,Date,StationCode,State,District,Rainfall (mm),Groundwater Level (m),Temperature (°C),River Water Level (m),Scarcity
0,2024-01-01,ASS_BAR,Assam,Barpeta,0.0,20.0,41.3,14.3,0
1,2024-01-02,ASS_BAR,Assam,Barpeta,0.0,20.0,35.4,12.7,0
2,2024-01-03,ASS_BAR,Assam,Barpeta,17.9,20.0,22.9,11.48,0
3,2024-01-04,ASS_BAR,Assam,Barpeta,0.0,20.0,27.5,10.31,0
4,2024-01-05,ASS_BAR,Assam,Barpeta,0.0,20.0,30.6,5.56,0


In [34]:
numeric_cols = df.select_dtypes(include=['int64','float64']).columns  

# Fill NaN values in numeric columns with the median
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())


In [35]:
numeric_columns = df.select_dtypes(include='number')
grouped_df = df.groupby(['Date','State', 'District'])[numeric_columns.columns].mean().reset_index()

In [36]:
grouped_df.to_csv('data.csv', index=False)

In [22]:
# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [23]:
# Normalize numerical features
scaler = StandardScaler()
numeric_cols = df.select_dtypes(include=['float64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [24]:
X = df.drop(columns=["Scarcity"])
y = df["Scarcity"]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Use Random Forest for classification
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [26]:
df.head()

Unnamed: 0,Date,StationCode,State,District,Rainfall (mm),Groundwater Level (m),Temperature (°C),River Water Level (m),Scarcity
0,0,0,0,2,-0.744132,0.96632,1.352036,1.544453,0
1,1,0,0,2,-0.744132,0.96632,0.769923,1.151204,0
2,2,0,0,2,-0.436872,0.96632,-0.463369,0.851351,0
3,3,0,0,2,-0.744132,0.96632,-0.009518,0.563788,0
4,4,0,0,2,-0.744132,0.96632,0.296339,-0.603671,0


In [28]:
# Predictions and evaluation
y_pred = clf.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Accuracy: 0.9982935153583617
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1075
           1       1.00      0.98      0.99        97

    accuracy                           1.00      1172
   macro avg       1.00      0.99      0.99      1172
weighted avg       1.00      1.00      1.00      1172



In [29]:
import joblib

# Save the model
joblib.dump(clf, "random_forest_model.pkl")

['random_forest_model.pkl']