In [1]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")  # Make sure the file path is correct

# Encode categorical columns
label_encoders = {}
for col in ['Reservoir_name', 'SUBDIVISION']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target variable
X = df.drop(columns=['Level'])  # Features
y = df['Level']                 # Target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score (Accuracy): {r2 * 100:.2f}%")


Mean Squared Error: 361.46
R² Score (Accuracy): 99.12%


In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Convert 'Level' (continuous) into categories for classification:
# Example thresholds: Low < 50, Medium 50–150, High > 150
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)  # Convert to integer for classification

# Split the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy * 100:.2f}%\n")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 89.53%

Confusion Matrix:
[[  1   5   0]
 [  0  54  37]
 [  0  32 578]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.17      0.29         6
           1       0.59      0.59      0.59        91
           2       0.94      0.95      0.94       610

    accuracy                           0.90       707
   macro avg       0.84      0.57      0.61       707
weighted avg       0.90      0.90      0.89       707



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
# Low: <50, Medium: 50–150, High: >150
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train KNN classifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy * 100:.2f}%\n")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 90.66%

Confusion Matrix:
[[  1   5   0]
 [  0  56  35]
 [  0  26 584]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.17      0.29         6
           1       0.64      0.62      0.63        91
           2       0.94      0.96      0.95       610

    accuracy                           0.91       707
   macro avg       0.86      0.58      0.62       707
weighted avg       0.91      0.91      0.90       707



In [11]:

import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
  df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 85.43%

Confusion Matrix:
[[  6   0   0]
 [  2  88   1]
 [  0 100 510]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       0.47      0.97      0.63        91
           2       1.00      0.84      0.91       610

    accuracy                           0.85       707
   macro avg       0.74      0.93      0.80       707
weighted avg       0.93      0.85      0.87       707



In [12]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Features and target
X = df.drop(columns=['Level'])
y = df['Level']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features and target
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()

# Train SVR model
model = SVR(kernel='rbf')  # You can also try 'linear' or 'poly'
model.fit(X_train_scaled, y_train_scaled)

# Predict and inverse transform
y_pred_scaled = model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")


Mean Squared Error: 5240.69
R² Score: 0.87


In [13]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 99.29%
Confusion Matrix:
[[  2   4   0]
 [  0  91   0]
 [  0   1 609]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         6
           1       0.95      1.00      0.97        91
           2       1.00      1.00      1.00       610

    accuracy                           0.99       707
   macro avg       0.98      0.78      0.82       707
weighted avg       0.99      0.99      0.99       707



In [14]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree without pruning (allow full depth)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 99.29%
Confusion Matrix:
[[  2   4   0]
 [  0  91   0]
 [  0   1 609]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         6
           1       0.95      1.00      0.97        91
           2       1.00      1.00      1.00       610

    accuracy                           0.99       707
   macro avg       0.98      0.78      0.82       707
weighted avg       0.99      0.99      0.99       707



In [15]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Dummy Classifier
dummy = DummyClassifier(strategy="most_frequent", random_state=42)
dummy.fit(X_train, y_train)

# Predict and evaluate
y_pred = dummy.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 86.28%
Confusion Matrix:
[[  0   0   6]
 [  0   0  91]
 [  0   0 610]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00        91
           2       0.86      1.00      0.93       610

    accuracy                           0.86       707
   macro avg       0.29      0.33      0.31       707
weighted avg       0.74      0.86      0.80       707



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv("C:\\Users\\Rashmi\\Desktop\\EDA\\FinalDataset.csv")

# Encode categorical variables
for col in ['Reservoir_name', 'SUBDIVISION']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Create classification labels from 'Level'
df['Level_Class'] = pd.cut(df['Level'], bins=[-float('inf'), 50, 150, float('inf')], labels=[0, 1, 2])

# Define features and target
X = df.drop(columns=['Level', 'Level_Class'])
y = df['Level_Class'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN with a "bad" value of K (e.g. 1 = overfitting or 100 = underfitting)
bad_k = 1  # Try also with 100
model = KNeighborsClassifier(n_neighbors=bad_k)
model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_scaled)

print(f"K = {bad_k}")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


K = 1
Accuracy: 94.06%
Confusion Matrix:
[[  0   6   0]
 [  0  71  20]
 [  0  16 594]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.76      0.78      0.77        91
           2       0.97      0.97      0.97       610

    accuracy                           0.94       707
   macro avg       0.58      0.58      0.58       707
weighted avg       0.93      0.94      0.94       707



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
