In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the raw water-potability table; `df` keeps the untouched file contents.
file_path = 'water_potability.csv'
df = pd.read_csv(file_path)
print(df.dtypes)

# Work on a cleaned frame: drop every row with a missing value and renumber
# the remaining rows 0..n-1.
data = df.dropna().reset_index(drop=True)

# Target column, taken from the cleaned frame so it lines up with the features.
y = data['Potability']

# Every column except the target is treated as a numeric feature.
variable_list = data.columns.tolist()
print(variable_list)
variable_list.remove('Potability')
print(variable_list)

numeric_data = data[variable_list]

# Min-max scale each feature column to the [0, 1] range.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Reuse the cleaned frame's index so the label assignment below aligns
# row-for-row by construction rather than by coincidence.
df_scaled = pd.DataFrame(scaled_data, columns=variable_list, index=data.index)
df_scaled['Potability'] = y

print(df_scaled)

unique_counts = df_scaled['Potability'].value_counts()
print("Number of occurrences for each unique value in 'Potability' column:")
print(unique_counts)

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object
['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']
['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
            ph  Hardness    Solids  Chloramines   Sulfate  Conductivity  \
0     0.587349  0.577747  0.386298     0.568199  0.647347      0.292985   
1     0.643654  0.441300  0.314381     0.439304  0.514545      0.356685   
2     0.388934  0.470876  0.506122     0.524364  0.561537      0.142913   
3     0.725820  0.715942  0.506141     0.521683  0.751819      0.148683   
4     0.610517  0.532588  0.237701     0.270288  0.495155      0.494792   
...       

In [4]:
# Number of rows drawn from each Potability class.
num_samples_per_class = 700

# Draw the same number of rows from Potability == 0 and Potability == 1,
# each with a fixed seed so the selection is repeatable.
diabetes_class_0 = df_scaled.loc[df_scaled['Potability'].eq(0)].sample(
    n=num_samples_per_class, random_state=42
)
diabetes_class_1 = df_scaled.loc[df_scaled['Potability'].eq(1)].sample(
    n=num_samples_per_class, random_state=42
)

# Stack both class samples, shuffle the rows so the classes are interleaved,
# and renumber the index from zero.
random_sample = (
    pd.concat([diabetes_class_0, diabetes_class_1], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Separate the balanced sample into the Potability target and the remaining feature columns.
y = random_sample['Potability']
X = random_sample.drop(columns='Potability')

In [8]:
# Feature columns whose value distributions are tabulated below.
# (Deliberately not named `list`, which would shadow the builtin for every later cell.)
feature_columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Edges of twenty 0.05-wide intervals covering 0.00 through 1.00.
bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]

for column in feature_columns:
    print(column)

    # Tag every value of this feature with the interval it falls into;
    # include_lowest=True makes the first interval include its left edge.
    binned = X[[column]].assign(
        Interval=pd.cut(X[column], bins=bins, include_lowest=True)
    )

    # Grouping on the categorical interval column already yields the bins in
    # ascending order, and observed=False keeps empty bins with a count of 0,
    # so no extra sorting step is required.
    bin_counts = binned.groupby('Interval', observed=False).size().reset_index(name='Count')

    # Render the intervals as plain strings for printing.
    bin_counts['Interval'] = bin_counts['Interval'].astype(str)

    print(bin_counts)

ph
          Interval  Count
0   (-0.001, 0.05]      1
1      (0.05, 0.1]      2
2      (0.1, 0.15]      1
3      (0.15, 0.2]      7
4      (0.2, 0.25]     10
5      (0.25, 0.3]     31
6      (0.3, 0.35]     71
7      (0.35, 0.4]    133
8      (0.4, 0.45]    195
9      (0.45, 0.5]    286
10     (0.5, 0.55]    240
11     (0.55, 0.6]    182
12     (0.6, 0.65]    119
13     (0.65, 0.7]     62
14     (0.7, 0.75]     39
15     (0.75, 0.8]     14
16     (0.8, 0.85]      6
17     (0.85, 0.9]      0
18     (0.9, 0.95]      0
19     (0.95, 1.0]      1
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000215A5579FA0>
Hardness
          Interval  Count
0   (-0.001, 0.05]      3
1      (0.05, 0.1]      3
2      (0.1, 0.15]      5
3      (0.15, 0.2]     19
4      (0.2, 0.25]     24
5      (0.25, 0.3]     36
6      (0.3, 0.35]     87
7      (0.35, 0.4]    124
8      (0.4, 0.45]    163
9      (0.45, 0.5]    202
10     (0.5, 0.55]    235
11     (0.55, 0.6]    180
12     (0.6, 0.65]    143
13

In [9]:
# Ground-truth Potability labels of the balanced sample; passed to the purity calculation below.
true_labels = y

def calculate_purity_and_sizes(cluster_labels, true_labels, weighted=False):
    """Summarise how well each cluster matches a single true class.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster id assigned to each sample.
    true_labels : array-like
        Ground-truth class of each sample, matched to ``cluster_labels`` by
        position. Values must be non-negative integers because they are
        counted with ``np.bincount``. A pandas Series, NumPy array or plain
        list is accepted.
    weighted : bool, default False
        If False, the overall purity is the plain mean of the per-cluster
        purities, so every cluster counts equally regardless of its size.
        If True, each cluster's purity is weighted by its size, which equals
        the fraction of all samples that belong to their cluster's majority
        class.

    Returns
    -------
    overall_purity : float
        Aggregate purity (see ``weighted``); NaN when there are no samples.
    cluster_purities : list of float
        Share of the majority class in each cluster, ordered by ascending
        cluster id.
    cluster_sizes : list of int
        Number of samples in each cluster, in the same order.
    class_counts : list of numpy.ndarray
        Per-cluster count of every class. All arrays have the same length
        (largest true label + 1), so index ``k`` always refers to class ``k``.

    Raises
    ------
    ValueError
        If the two label sequences differ in length.
    """
    # Convert to arrays so the lookup is purely positional and does not depend
    # on a pandas index (or on any global feature matrix).
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)
    if len(cluster_labels) != len(true_labels):
        raise ValueError(
            "cluster_labels and true_labels must have the same length "
            f"({len(cluster_labels)} != {len(true_labels)})"
        )

    # Pad every count vector to the same width so a class that is absent from a
    # cluster is reported as 0 instead of being missing from the array.
    n_classes = int(true_labels.max()) + 1 if true_labels.size else 0

    cluster_purities = []
    cluster_sizes = []
    class_counts = []
    for cluster_label in np.unique(cluster_labels):
        members = true_labels[cluster_labels == cluster_label]
        label_counts = np.bincount(members, minlength=n_classes)
        cluster_purities.append(label_counts.max() / members.size)
        cluster_sizes.append(int(members.size))
        class_counts.append(label_counts)

    if not cluster_purities:
        # No samples at all: report NaN directly rather than averaging an empty list.
        overall_purity = float('nan')
    elif weighted:
        overall_purity = np.average(cluster_purities, weights=cluster_sizes)
    else:
        overall_purity = np.mean(cluster_purities)
    return overall_purity, cluster_purities, cluster_sizes, class_counts

# Run KMeans for every cluster count from 3 through 9 and keep the purity
# statistics of each run in parallel lists (one entry per cluster count).
cluster_numbers = [3, 4, 5, 6, 7, 8, 9]
overall_purities, cluster_purities_iterations = [], []
cluster_sizes_iterations, class_counts_iterations = [], []

for n_clusters in cluster_numbers:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X)

    # Score this partition against the ground-truth labels.
    overall_purity, cluster_purities, cluster_sizes, class_counts = calculate_purity_and_sizes(cluster_labels, true_labels)

    overall_purities.append(overall_purity)
    cluster_purities_iterations.append(cluster_purities)
    cluster_sizes_iterations.append(cluster_sizes)
    class_counts_iterations.append(class_counts)

# Report, per cluster count, the overall purity followed by one line per
# cluster: "<cluster id>, <purity>, <size>, <class 0 count>/<class 1 count>".
for n_clusters, run_purity, run_purities, run_sizes, run_counts in zip(
    cluster_numbers,
    overall_purities,
    cluster_purities_iterations,
    cluster_sizes_iterations,
    class_counts_iterations,
):
    print(f"Overall Purity for {n_clusters} clusters: {run_purity:.2f}")
    for cluster_label, purity in enumerate(run_purities):
        size = run_sizes[cluster_label]
        class_count = run_counts[cluster_label]

        # A count vector may be shorter than two entries when a class is absent
        # from the cluster; treat a missing entry as zero.
        class_0_count = 0
        class_1_count = 0
        if len(class_count) > 0:
            class_0_count = class_count[0]
        if len(class_count) > 1:
            class_1_count = class_count[1]

        print(f"{cluster_label}, {purity:.2f}, {size}, {class_0_count}/{class_1_count}")

Overall Purity for 3 clusters: 0.52
0, 0.52, 503, 262/241
1, 0.50, 501, 248/253
2, 0.52, 396, 190/206
Overall Purity for 4 clusters: 0.52
0, 0.50, 372, 185/187
1, 0.51, 364, 184/180
2, 0.54, 310, 142/168
3, 0.53, 354, 189/165
Overall Purity for 5 clusters: 0.53
0, 0.53, 309, 146/163
1, 0.52, 298, 155/143
2, 0.56, 266, 118/148
3, 0.54, 286, 154/132
4, 0.53, 241, 127/114
Overall Purity for 6 clusters: 0.52
0, 0.51, 254, 124/130
1, 0.50, 271, 135/136
2, 0.54, 249, 115/134
3, 0.53, 204, 109/95
4, 0.55, 211, 115/96
5, 0.52, 211, 102/109
Overall Purity for 7 clusters: 0.53
0, 0.51, 220, 113/107
1, 0.54, 204, 93/111
2, 0.51, 202, 98/104
3, 0.51, 177, 91/86
4, 0.55, 209, 115/94
5, 0.51, 191, 98/93
6, 0.53, 197, 92/105
Overall Purity for 8 clusters: 0.54
0, 0.54, 154, 71/83
1, 0.51, 168, 82/86
2, 0.55, 166, 91/75
3, 0.54, 250, 134/116
4, 0.51, 156, 79/77
5, 0.57, 179, 77/102
6, 0.58, 148, 86/62
7, 0.55, 179, 80/99
Overall Purity for 9 clusters: 0.54
0, 0.54, 147, 68/79
1, 0.51, 188, 96/92
2, 0.

found 0 physical cores < 1
  File "C:\Users\katha\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [10]:
# Fit KMeans with five clusters on the feature matrix.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Copy of the features with each row's cluster id attached; X itself is left unchanged.
X_8 = X.assign(cluster=cluster_labels)
print(X_8)

# Feature-only rows of clusters 1 and 4 (the 'cluster' column is not carried over).
cluster_1_df, cluster_4_df = (
    X_8.loc[
        X_8['cluster'] == cluster_id,
        ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity'],
    ]
    for cluster_id in (1, 4)
)

            ph  Hardness    Solids  Chloramines   Sulfate  Conductivity  \
0     0.251547  0.529104  0.162350     0.762837  0.523076      0.619996   
1     0.655317  0.387739  0.889449     0.384640  0.813710      0.359572   
2     0.591670  0.716193  0.489663     0.486502  0.608066      0.420193   
3     0.339734  0.411272  0.257536     0.379417  0.393605      0.758945   
4     0.518600  0.336904  0.337604     0.344590  0.769651      0.343840   
...        ...       ...       ...          ...       ...           ...   
1395  0.350716  0.263269  0.194208     0.427785  1.000000      0.685988   
1396  0.665504  0.580527  0.360907     0.741187  0.574776      0.444888   
1397  0.390715  0.813588  0.771961     0.235722  0.675698      0.451814   
1398  0.490666  0.245811  0.384984     0.612364  0.602902      0.606568   
1399  0.430879  0.530108  0.212020     0.863379  0.554066      0.258728   

      Organic_carbon  Trihalomethanes  Turbidity  cluster  
0           0.533573         0.490614  

In [15]:
# Feature columns whose value distributions inside cluster 4 are tabulated below.
# (Deliberately not named `list`, which would shadow the builtin for every later cell.)
feature_columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Edges of twenty 0.05-wide intervals covering 0.00 through 1.00.
bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]

for column in feature_columns:
    print(column)

    # Tag every cluster-4 value of this feature with the interval it falls into;
    # include_lowest=True makes the first interval include its left edge.
    binned = cluster_4_df[[column]].assign(
        Interval=pd.cut(cluster_4_df[column], bins=bins, include_lowest=True)
    )

    # Grouping on the categorical interval column already yields the bins in
    # ascending order, and observed=False keeps empty bins with a count of 0,
    # so no extra sorting step is required.
    bin_counts = binned.groupby('Interval', observed=False).size().reset_index(name='Count')

    # Render the intervals as plain strings for printing.
    bin_counts['Interval'] = bin_counts['Interval'].astype(str)

    print(bin_counts)

ph
          Interval  Count
0   (-0.001, 0.05]      0
1      (0.05, 0.1]      0
2      (0.1, 0.15]      0
3      (0.15, 0.2]      0
4      (0.2, 0.25]      0
5      (0.25, 0.3]      2
6      (0.3, 0.35]     11
7      (0.35, 0.4]     15
8      (0.4, 0.45]     22
9      (0.45, 0.5]     53
10     (0.5, 0.55]     41
11     (0.55, 0.6]     41
12     (0.6, 0.65]     25
13     (0.65, 0.7]     16
14     (0.7, 0.75]     11
15     (0.75, 0.8]      2
16     (0.8, 0.85]      1
17     (0.85, 0.9]      0
18     (0.9, 0.95]      0
19     (0.95, 1.0]      1
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000215A556DEB0>
Hardness
          Interval  Count
0   (-0.001, 0.05]      0
1      (0.05, 0.1]      0
2      (0.1, 0.15]      0
3      (0.15, 0.2]      0
4      (0.2, 0.25]      0
5      (0.25, 0.3]      3
6      (0.3, 0.35]      3
7      (0.35, 0.4]      6
8      (0.4, 0.45]     22
9      (0.45, 0.5]     24
10     (0.5, 0.55]     42
11     (0.55, 0.6]     40
12     (0.6, 0.65]     37
13