# Exploratory Data Analysis (EDA)

## Import Data & Overview

### Import Data

In [None]:
# Standard library
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the water-potability CSV.
# The original absolute path is kept as the default so existing runs behave
# exactly as before; set the WATER_POTABILITY_CSV environment variable to load
# the file from somewhere else without editing the notebook.
DEFAULT_DATA_PATH = "C:/Users/MRE/Documents/GitHub/Water_Qualityy/water_potability.csv"
DATA_PATH = os.environ.get("WATER_POTABILITY_CSV", DEFAULT_DATA_PATH)

data = pd.read_csv(DATA_PATH)

# Preview the first rows of the loaded frame
data.head()

### Initial Overview

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape

### Column Names and Data Types

In [None]:
# dtype of every column
data.dtypes

### Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
data.describe()

### Checking for Missing Values

In [None]:
# Number of missing entries in each column
data.isna().sum()

### Handling Missing Values

In [None]:
# Fill the gaps in these three columns with the respective column mean.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/test split performed further down — confirm this is intended.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

# Re-count missing entries per column after imputation
data.isnull().sum()

### Feature Distribution

In [None]:
# One 30-bin histogram per column, laid out on a single large figure
hist_axes = data.hist(figsize=(20, 15), bins=30)
plt.tight_layout()
plt.show()


### Outlier Analysis

In [None]:
# Box plots of every column drawn side by side on one shared axis
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=data, ax=ax)
plt.xticks(rotation=90)  # turn the column names vertical
plt.show()

### Outlier Analysis for Individual Features

In [None]:
# One box plot per column, each on its own subplot.
features = data.columns

# Derive the grid from the number of columns instead of assuming a fixed 4x3,
# so the cell keeps working if columns are added or removed.
n_cols = 3
n_rows = -(-len(features) // n_cols)  # ceiling division without importing math

plt.figure(figsize=(20, 15))

for i, feature in enumerate(features, 1):
    plt.subplot(n_rows, n_cols, i)
    # Pass the series by keyword: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn releases (x vs. data), which
    # changes the orientation and can emit warnings.
    sns.boxplot(x=data[feature])
    plt.title(feature)

plt.tight_layout()
plt.show()

### Bivariate Analysis

In [None]:
# Distribution of each predictor split by the target class.
target = 'Potability'

# Select predictors by name rather than by position, so this cell neither
# depends on the target being the last column nor on `features` having been
# defined by an earlier cell.
predictors = [column for column in data.columns if column != target]

# Size the grid to the number of predictors (no empty trailing row).
n_cols = 3
n_rows = -(-len(predictors) // n_cols)  # ceiling division

plt.figure(figsize=(20, 15))
for i, feature in enumerate(predictors, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(x=target, y=feature, data=data)
    plt.title(f'Potability vs {feature}')
plt.tight_layout()
plt.show()

## Correlation Analysis

### Correlation Matrix

In [None]:
# Pairwise correlation of the DataFrame's columns
corr_matrix = data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

### Train-Test Split & Normalization

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the 'Potability' column; every other column is a feature
y = data['Potability']
X = data.drop(columns='Potability')

# 80/20 hold-out split with a fixed seed
# NOTE(review): the split is not stratified on y — confirm that is intended,
# given the class imbalance discussed at the end of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the shapes of the four resulting pieces
for caption, part in (
    ('X_train shape:', X_train),
    ('X_test shape:', X_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(caption, part.shape)

## Modelling

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline: a seeded decision tree with default hyperparameters,
# fitted on the scaled training data
dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred = dt_model.predict(X_test)

# Evaluate against the test labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nAccuracy Score:", accuracy_score(y_test, y_pred), sep="\n")

### Hyperparameter Tuning with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate hyperparameter values for the decision tree
param_grid = {
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'criterion': ['gini', 'entropy'],
}

# Fresh, seeded estimator for the search to clone
dt_model = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid: 5-fold CV, scored by accuracy, all cores
grid_search = GridSearchCV(
    dt_model,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score}")

# Predict the held-out test rows with the best estimator found by the search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate against the test labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nAccuracy Score:", accuracy_score(y_test, y_pred), sep="\n")

<p>Despite the expanded hyperparameter grid, we see that the best parameters and model performance are the same as previous results. This indicates that the current hyperparameter range is sufficient and we should try other methods to further improve the performance of the model.</p>

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate hyperparameter values for the random forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Seeded random forest for the search to clone
rf_model = RandomForestClassifier(random_state = 42)

# Exhaustive search: 5-fold CV, scored by accuracy.
# n_jobs = -1 runs the candidate fits in parallel on all cores, matching the
# decision-tree search above. This grid has 216 combinations x 5 folds, which
# was previously evaluated serially (n_jobs = 1). Every fit is seeded, so the
# search selects the same model either way.
grid_search_rf = GridSearchCV(
    estimator = rf_model,
    param_grid = param_grid_rf,
    cv = 5,
    n_jobs = -1,
    scoring = 'accuracy'
)

# Run the search on the training data only
grid_search_rf.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print(f"Best Parameters: {best_params_rf}")
print(f"Best Cross-Validation Accuracy: {best_score_rf}")

### Model Evaluation with Best Model

In [None]:
# Best random forest found by the grid search, applied to the held-out test rows
best_model_rf = grid_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)

# Performance evaluation against the test labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_rf), sep="\n")
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_rf), sep="\n")

<p> <b>Conclusion:</b>
Improvement: Random Forest model performed better compared to Decision Tree model. The overall accuracy rate and the rate of correctly predicting potable water (recall) have increased.
Weakness: The rate of correctly predicting potable water is still low, but the Random Forest model gave better results than Decision Tree in this regard.</p>

### Expanding Hyperparameter Settings

In [None]:
# Wider grid, currently disabled and kept for reference only.
# NOTE(review): presumably switched off because of its runtime — confirm.
# param_grid_rf = {
#     'n_estimators': [100, 200, 300, 500],
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10, 15],
#     'min_samples_leaf': [1, 2, 4, 6],
#     'criterion': ['gini', 'entropy']
# }

# Grid that is actually searched in this cell
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini']
}

# Seeded random forest for the search to clone
rf_model = RandomForestClassifier(random_state = 42)

# 5-fold CV scored by accuracy; n_jobs = -1 parallelises the candidate fits,
# matching the decision-tree search (every fit is seeded, so the selection is
# unaffected).
grid_search_rf = GridSearchCV(
    estimator = rf_model,
    param_grid = param_grid_rf,
    cv = 5,
    n_jobs = -1,
    scoring = 'accuracy'
)

# Run the search on the training data only
grid_search_rf.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
# Fix: the score used to be stored in `best_scores_rf` while the print below
# read `best_score_rf`, so the cell reported the stale score left over from the
# previous random-forest search instead of this one.
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_
best_scores_rf = best_score_rf  # old name kept as an alias for any later use

print(f"Best Parameters: {best_params_rf}")
print(f"Best Cross-Validation Accuracy: {best_score_rf}")

# Predict the held-out test rows with the best estimator of this search
best_model_rf = grid_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)

# Performance evaluation against the test labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred_rf))

<p>When we examine the results, we see that they are the same as the previous results. This shows that the performance of our model is limited even with the extended hyperparameter grid.</p>

## Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Seeded logistic regression; the liblinear solver is used with both the
# l1 and l2 penalties searched below
log_reg_model = LogisticRegression(solver='liblinear', random_state=42)

# Candidate regularisation strengths and penalty types
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}

# Exhaustive search: 5-fold CV, scored by accuracy, single process
grid_search_lr = GridSearchCV(
    log_reg_model,
    param_grid_lr,
    scoring='accuracy',
    n_jobs=1,
    cv=5,
)

# Run the search on the training data only
grid_search_lr.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params_lr, best_scores_lr = grid_search_lr.best_params_, grid_search_lr.best_score_

print(f"Best Parameters: {best_params_lr}")
print(f"Best Cross-Validation Accuracy: {best_scores_lr}")

# Predict the held-out test rows with the best estimator of the search
best_model_lr = grid_search_lr.best_estimator_
y_pred_lr = best_model_lr.predict(X_test)

# Evaluate against the test labels
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_lr), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_lr), sep="\n")
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_lr), sep="\n")


<p>These results show that the Logistic Regression model cannot accurately predict potable water (Class 1) in the data set.</p>

<p>General evaluation:</p>
<p>Decision Tree Classifier: Performed moderately. It remained weak at identifying potable water.</p>
<p>Random Forest Classifier: Achieved the highest accuracy of the three models, but still performed poorly at identifying potable water.</p>
<p>Logistic Regression: Potable water was never identified due to class imbalance. This model showed poor performance.</p>