In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = 'waterQuality1.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line of output.
data.info()
print(data.describe())


In [None]:
# Distribution overview: DataFrame.hist draws the histograms for `data`
# onto a single 20x20-inch figure.
hist_figsize = (20, 20)
data.hist(figsize=hist_figsize)
plt.show()


In [None]:
# Histogram grid for `data` on a 20x20-inch canvas.
# NOTE(review): this cell draws the same plot as the histogram cell directly
# before it; confirm whether both are needed.
pd.DataFrame.hist(data, figsize=(20, 20))
plt.show()


In [None]:
# Correlation matrix.
# Only numeric columns are passed to corr(): the cleaning cells further down
# treat 'is_safe' and 'ammonia' as containing non-numeric placeholders such as
# '#NUM!', and on pandas >= 2.0 DataFrame.corr() raises on string columns
# instead of silently skipping them.
corr_matrix = data.select_dtypes(include='number').corr()

# Heatmap of the correlation matrix, annotated with two-decimal coefficients
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()



In [None]:
# Pairwise scatter-plot grid built by seaborn from the columns of `data`
sns.pairplot(data=data)
plt.show()


In [None]:
import numpy as np

# Replace the '#NUM!' placeholder in 'is_safe' with NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['is_safe']: the in-place form acts on an
# intermediate Series, which raises a chained-assignment warning on pandas 2.x
# and leaves `data` untouched once Copy-on-Write is active.
data['is_safe'] = data['is_safe'].replace('#NUM!', np.nan)


In [None]:
# Parse 'is_safe' as numbers; anything that cannot be parsed is coerced to NaN
is_safe_numeric = pd.to_numeric(data['is_safe'], errors='coerce')
data['is_safe'] = is_safe_numeric


In [None]:
# Drop rows whose 'is_safe' or 'ammonia' value is missing.
# 'ammonia' is coerced to numeric first (on a copy, `data` itself is left
# unchanged) so that unparseable entries become NaN and are actually removed by
# dropna; without the coercion such entries are ordinary strings and survive.
# This matches the later cleaning cell, which converts 'ammonia' the same way.
data_cleaned = (
    data
    .assign(ammonia=pd.to_numeric(data['ammonia'], errors='coerce'))
    .dropna(subset=['is_safe', 'ammonia'])
)


In [None]:
# Compare row counts before and after the NaN filter, then summarise the result
n_rows_before, n_rows_after = len(data), len(data_cleaned)
rows_dropped = n_rows_before - n_rows_after
print(f"Number of rows dropped: {rows_dropped}")
data_cleaned.info()


In [None]:
import pandas as pd

# Load the dataset.
# Note: this rebinds `data` to a fresh read of the CSV, so any column
# conversions applied to the previous `data` object are not carried over.
file_path = 'waterQuality1.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset.
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line of output.
data.info()
print(data.describe())


In [None]:
import matplotlib.pyplot as plt

# Histogram grid for the freshly loaded `data`, drawn on a 20x20-inch figure
hist_figsize = (20, 20)
data.hist(figsize=hist_figsize)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix.
# Only numeric columns are passed to corr(): `data` was just re-read from the
# CSV, and the cleaning cells treat 'is_safe' and 'ammonia' as containing
# non-numeric placeholders such as '#NUM!'. On pandas >= 2.0 DataFrame.corr()
# raises on string columns instead of silently skipping them.
corr_matrix = data.select_dtypes(include='number').corr()

# Heatmap of the correlation matrix, annotated with two-decimal coefficients
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()


In [None]:
import seaborn as sns

# Pairwise scatter-plot grid built by seaborn from the columns of `data`
sns.pairplot(data=data)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [None]:
# Read the CSV that feeds the modelling cells below into `data`
file_path = 'waterQuality1.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Replace '#NUM!' with NaN in 'is_safe' column.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['is_safe']: the in-place form acts on an
# intermediate Series, which raises a chained-assignment warning on pandas 2.x
# and leaves `data` untouched once Copy-on-Write is active.
data['is_safe'] = data['is_safe'].replace('#NUM!', np.nan)

# Convert 'is_safe' column to numeric (unparseable entries become NaN)
data['is_safe'] = pd.to_numeric(data['is_safe'], errors='coerce')

# Convert 'ammonia' column to numeric (unparseable entries become NaN)
data['ammonia'] = pd.to_numeric(data['ammonia'], errors='coerce')

# Drop rows with NaNs in 'is_safe' and 'ammonia' columns; dropna returns a new
# frame, so `data` keeps every row
data_cleaned = data.dropna(subset=['is_safe', 'ammonia'])


In [None]:
# Names of the columns that are still stored with object dtype
categorical_cols = data_cleaned.select_dtypes(include=['object']).columns

# One-hot encode exactly those columns, dropping the first level of each
data_cleaned = pd.get_dummies(
    data=data_cleaned,
    columns=categorical_cols,
    drop_first=True,
)


In [None]:
# Target is the 'is_safe' column; every other column is used as a feature
y = data_cleaned['is_safe']
X = data_cleaned.drop(columns='is_safe')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline random forest: hyperparameters spelled out explicitly, seed fixed
# at 42 so repeated runs build the same forest
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)


In [None]:
# Fit the baseline forest on the training split
rf_model.fit(X=X_train, y=y_train)


In [None]:
# Class predictions of the baseline forest for the held-out rows
y_pred = rf_model.predict(X=X_test)


In [None]:
# Score the baseline predictions with each metric, in the order the names
# on the left-hand side are listed
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1-Score: {f1}',
    sep='\n',
)

# Confusion matrix as an annotated heatmap (integer cell counts)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(data=cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 table
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)


In [None]:
# Pair each feature column name with the importance stored on the fitted forest
features = X.columns
feature_importances = rf_model.feature_importances_

# Build the table and rank it from most to least important in one chain
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.show()


In [None]:
# Candidate values per hyperparameter; the grid search tries every combination
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search with 5-fold cross-validation, using all available cores
# (n_jobs=-1) and per-fit progress logging (verbose=2)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Keep the winning hyperparameters and the estimator fitted with them
best_rf_model, best_params = grid_search.best_estimator_, grid_search.best_params_

print(f'Best Parameters: {best_params}')


In [None]:
# Class predictions of the tuned forest for the held-out rows
y_pred_best = best_rf_model.predict(X=X_test)

# Score the tuned predictions with each metric, in the order the names on
# the left-hand side are listed
accuracy_best, precision_best, recall_best, f1_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
print(
    f'Optimized Accuracy: {accuracy_best}',
    f'Optimized Precision: {precision_best}',
    f'Optimized Recall: {recall_best}',
    f'Optimized F1-Score: {f1_best}',
    sep='\n',
)
