In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data from the single CSV file
data = pd.read_csv("water_potability.csv")

# Split the data into features (X) and target variable (y)
X = data.drop("Potability", axis=1)
y = data["Potability"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
X_train_imputed = imputer.fit_transform(X_train_scaled)

# Transform the test data using the trained imputer
X_test_imputed = imputer.transform(X_test_scaled)

# Initialize the model
model = LogisticRegression()

# Train the model using the imputed features
model.fit(X_train_imputed, y_train)

# Make predictions using the imputed features
y_pred = model.predict(X_test_imputed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Optionally, you can save the training and testing sets into separate CSV files
X_train.to_csv("train.csv", index=False)
X_test.to_csv("test.csv", index=False)
y_train.to_csv("train_labels.csv", index=False)
y_test.to_csv("test_labels.csv", index=False)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Display the first few rows of the datasets
print("Train Data:")
display(train_data.head())

print("\nTest Data:")
display(test_data.head())


Train Data:


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,6.623614,203.030141,17167.301297,6.049601,311.726288,410.243247,15.9145,65.021229,2.915166
1,6.6847,193.840931,34157.184474,9.876574,344.535407,498.063996,8.818757,66.659352,4.03066
2,6.83606,205.667718,18321.327502,6.712854,297.837188,494.484249,13.808923,70.714225,4.952508
3,,183.488839,12675.938962,9.777807,319.870584,482.445026,13.309723,46.85341,3.240419
4,6.406798,182.885137,17851.064021,7.462758,332.486731,398.779746,17.301617,64.070236,4.573968



Test Data:


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,,183.521107,20461.25271,7.333212,333.119476,356.369022,20.179029,67.019903,4.886634
1,6.643159,188.913541,32873.820022,6.791509,333.848842,336.561501,14.70681,67.844849,4.562198
2,7.846058,224.058877,23264.109968,5.922367,300.40262,387.971336,13.406737,43.075186,2.487969
3,7.160467,183.08931,6743.346066,3.803036,277.599099,428.036344,9.799625,90.035374,3.884891
4,6.61535,179.240661,26392.863612,9.30916,,496.363562,12.786595,78.262369,4.453443


In [None]:
# Basic information about the dataset
print("Train Data Info:")
train_data.info()

print("\nTest Data Info:")
test_data.info()


Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2225 non-null   float64
 1   Hardness         2620 non-null   float64
 2   Solids           2620 non-null   float64
 3   Chloramines      2620 non-null   float64
 4   Sulfate          1989 non-null   float64
 5   Conductivity     2620 non-null   float64
 6   Organic_carbon   2620 non-null   float64
 7   Trihalomethanes  2493 non-null   float64
 8   Turbidity        2620 non-null   float64
dtypes: float64(9)
memory usage: 184.3 KB

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               560 non-null    float64
 1   Hardness         656 non-null    float64
 2   Solids           65

In [None]:
# Summary statistics
print("Train Data Description:")
display(train_data.describe())

print("\nTest Data Description:")
display(test_data.describe())


Train Data Description:


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
count,2225.0,2620.0,2620.0,2620.0,1989.0,2620.0,2620.0,2493.0,2620.0
mean,7.088976,196.448911,22066.597807,7.107867,333.653302,426.420949,14.25186,66.355132,3.959903
std,1.58725,32.579075,8835.078941,1.588035,41.550603,81.189543,3.285042,16.245992,0.779162
min,0.0,73.492234,728.75083,0.352,129.0,181.483754,4.371899,8.175876,1.492207
25%,6.10676,176.832962,15736.815715,6.108402,307.987458,365.712099,12.054236,55.6974,3.429438
50%,7.021617,196.901673,20921.203012,7.107117,332.759029,422.033283,14.155598,66.621027,3.940678
75%,8.058136,216.690772,27203.70718,8.097686,360.086901,481.179357,16.519916,77.373813,4.497655
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739



Test Data Description:


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
count,560.0,656.0,656.0,656.0,506.0,656.0,656.0,621.0,656.0
mean,7.048289,196.052321,21804.391558,7.179827,334.257205,425.343073,14.417209,66.561534,3.994277
std,1.623135,34.077903,8501.105626,1.563033,40.923971,79.402849,3.398272,15.898702,0.785234
min,1.844538,47.432,320.942611,1.390871,214.460834,201.619737,2.2,0.738,1.45
25%,6.020104,177.318237,15412.316896,6.248315,306.756576,365.818716,12.113514,56.769915,3.471754
50%,7.127318,197.103467,20960.706296,7.19233,334.366456,420.334966,14.39729,66.623944,4.009908
75%,8.064724,216.474883,27673.298701,8.199597,359.344836,483.840022,16.70153,77.142104,4.513977
max,13.54124,317.338124,55334.702799,12.653362,475.73746,652.537592,27.006707,112.41221,6.494749


In [None]:
# Check for missing values
print("Missing values in train data:")
print(train_data.isnull().sum())

print("\nMissing values in test data:")
print(test_data.isnull().sum())


Missing values in train data:
ph                 395
Hardness             0
Solids               0
Chloramines          0
Sulfate            631
Conductivity         0
Organic_carbon       0
Trihalomethanes    127
Turbidity            0
dtype: int64

Missing values in test data:
ph                  96
Hardness             0
Solids               0
Chloramines          0
Sulfate            150
Conductivity         0
Organic_carbon       0
Trihalomethanes     35
Turbidity            0
dtype: int64


In [None]:
# Fill missing values with mean
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Verify missing values are handled
print("Missing values after imputation in train data:")
print(train_data.isnull().sum())

print("\nMissing values after imputation in test data:")
print(test_data.isnull().sum())


Missing values after imputation in train data:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
dtype: int64

Missing values after imputation in test data:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
dtype: int64


In [None]:
# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)


NameError: name 'X_train_scaled' is not defined

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
