In [152]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/water_potability.csv')

In [132]:
features = data[['Hardness', 'Solids', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Potability']]
print(features.head())

     Hardness       Solids     Sulfate  Conductivity  Organic_carbon  \
0  204.890456  20791.31898  368.516441    564.308654       10.379783   
1  129.422921  18630.05786         NaN    592.885359       15.180013   
2  224.236259  19909.54173         NaN    418.606213       16.868637   
3  214.373394  22018.41744  356.886136    363.266516       18.436525   
4  181.101509  17978.98634  310.135738    398.410813       11.558279   

   Potability  
0           0  
1           0  
2           0  
3           0  
4           0  


In [133]:
train_data = data[data['Sulfate'].notnull()][['Hardness', 'Solids', 'Conductivity', 'Organic_carbon', 'Potability']]

print(train_data.head())

     Hardness       Solids  Conductivity  Organic_carbon  Potability
0  204.890456  20791.31898    564.308654       10.379783           0
3  214.373394  22018.41744    363.266516       18.436525           0
4  181.101509  17978.98634    398.410813       11.558279           0
5  188.313324  28748.68774    280.467916        8.399735           0
6  248.071735  28749.71654    283.651634       13.789695           0


In [134]:
train_label = data[data['Sulfate'].notnull()]['Sulfate']
print(train_label.head())

0    368.516441
3    356.886136
4    310.135738
5    326.678363
6    393.663395
Name: Sulfate, dtype: float64


In [135]:
#5
test_data = data[data['Sulfate'].isnull()][['Hardness', 'Solids', 'Conductivity', 'Organic_carbon', 'Potability']]
print(test_data.head())

      Hardness       Solids  Conductivity  Organic_carbon  Potability
1   129.422921  18630.05786    592.885359       15.180013           0
2   224.236259  19909.54173    418.606213       16.868637           0
11  218.693300  18767.65668    364.098230       14.525746           0
14  205.344982  28388.00489    444.645352       13.228311           0
16  211.049406  30980.60079    315.141267       20.397022           0


In [136]:
#6
min_values = train_data.min()
max_values = train_data.max()

scaler = MinMaxScaler(feature_range=(0,1))
train_data_normalized = scaler.fit_transform(train_data)

train_data_normalized = pd.DataFrame(train_data_normalized, columns=train_data.columns)
print(train_data_normalized.head())

print("\nMinimum values:")
print(min_values)
print("\nMaximum values:")
print(max_values)


   Hardness    Solids  Conductivity  Organic_carbon  Potability
0  0.583382  0.336096      0.657375        0.329741         0.0
1  0.618517  0.356244      0.292985        0.654522         0.0
2  0.495244  0.289922      0.356685        0.377248         0.0
3  0.521964  0.466746      0.142913        0.249922         0.0
4  0.743369  0.466763      0.148683        0.467200         0.0

Minimum values:
Hardness           47.432000
Solids            320.942611
Conductivity      201.619737
Organic_carbon      2.200000
Potability          0.000000
dtype: float64

Maximum values:
Hardness            317.338124
Solids            61227.196010
Conductivity        753.342620
Organic_carbon       27.006707
Potability            1.000000
dtype: float64


In [137]:
#7
min_values = {'Hardness': 80, 'Solids': 320.94, 'Conductivity': 181.48, 'Organic_carbon': 2.2, 'Potability': 0}
max_values = {'Hardness': 323, 'Solids': 61227, 'Conductivity': 753, 'Organic_carbon': 28.3, 'Potability': 1}

for column in test_data.columns:
    test_data[column] = (test_data[column] - min_values[column]) / (max_values[column] - min_values[column])

print(test_data.head())

    Hardness    Solids  Conductivity  Organic_carbon  Potability
1   0.203387  0.300612      0.719844        0.497319         0.0
2   0.593565  0.321620      0.414904        0.562017         0.0
11  0.570754  0.302872      0.319531        0.472251         0.0
14  0.515823  0.460825      0.460466        0.422541         0.0
16  0.539298  0.503393      0.233870        0.697204         0.0


In [138]:
#8
test_data_normalized = scaler.transform(test_data)

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(train_data_normalized, train_label)

class_result = knn.predict(test_data_normalized)

print(class_result)

[332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  332.4985678  332.4985678  332.4985678
 332.4985678  332.4985678  381.01892203 381.01892203 381.01892203
 381.01892203 381.01892203 381.01892203 381.01892203 381.01892203
 381.01892203 381.01892203 381.01892203 381.01892203 381.01892203
 381.01892203 381.0189



In [139]:
#9
data.loc[data['Sulfate'].isnull(), 'Sulfate'] = knn.predict(test_data_normalized)

print(data)

            ph    Hardness       Solids  Chloramines     Sulfate  \
0          NaN  204.890456  20791.31898     7.300212  368.516441   
1     3.716080  129.422921  18630.05786     6.635246  332.498568   
2     8.099124  224.236259  19909.54173     9.275884  332.498568   
3     8.316766  214.373394  22018.41744     8.059332  356.886136   
4     9.092223  181.101509  17978.98634     6.546600  310.135738   
...        ...         ...          ...          ...         ...   
3271  4.668102  193.681736  47580.99160     7.166639  359.948574   
3272  7.808856  193.553212  17329.80216     8.061362  381.018922   
3273  9.419510  175.762646  33155.57822     7.350233  381.018922   
3274  5.126763  230.603758  11983.86938     6.303357  381.018922   
3275  7.874671  195.102299  17404.17706     7.509306  381.018922   

      Conductivity  Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       564.308654       10.379783        86.990970   2.963135           0  
1       592.885359       15.1



In [141]:
#10
X_train, X_test, y_train, y_test = train_test_split(train_data_normalized, train_label, test_size=0.2, random_state=42)

In [142]:
#11
train_data = X_train[['Hardness', 'Solids', 'Conductivity', 'Organic_carbon']]

print(train_data.head())

      Hardness    Solids  Conductivity  Organic_carbon
461   0.464724  0.234863      0.817290        0.322980
109   0.642507  0.352703      0.334939        0.638345
1951  0.511185  0.230231      0.376083        0.505558
354   0.579274  0.556677      0.387799        0.565212
266   0.524435  0.191187      0.546237        0.470296


In [143]:
#12
train_label = X_train[['Potability']]
print(train_label.head())

      Potability
461          0.0
109          0.0
1951         0.0
354          0.0
266          1.0


In [144]:
#13
test_data =X_test[['Hardness', 'Solids', 'Conductivity', 'Organic_carbon']]
print(test_data.head())

      Hardness    Solids  Conductivity  Organic_carbon
2290  0.559664  0.736014      0.408896        0.537360
1858  0.403461  0.297367      0.157202        0.389891
902   0.546811  0.280127      0.330193        0.360672
2236  0.607696  0.179179      0.586289        0.519963
1100  0.438609  0.094367      0.437115        0.359506


In [145]:
#14
test_label = X_test[['Potability']]

In [146]:
#15
min_values = train_data.min()
max_values = train_data.max()

scaler = MinMaxScaler(feature_range=(0,1))
train_data_normalized = scaler.fit_transform(train_data)

train_data_normalized = pd.DataFrame(train_data_normalized, columns=train_data.columns)
print(train_data_normalized.head())

print("\nMinimum values:")
print(min_values)
print("\nMaximum values:")
print(max_values)

   Hardness    Solids  Conductivity  Organic_carbon
0  0.407519  0.234863      0.817290        0.368329
1  0.604301  0.352703      0.334939        0.727975
2  0.458945  0.230231      0.376083        0.576544
3  0.534310  0.556677      0.387799        0.644573
4  0.473611  0.191187      0.546237        0.536330

Minimum values:
Hardness          0.096553
Solids            0.000000
Conductivity      0.000000
Organic_carbon    0.000000
dtype: float64

Maximum values:
Hardness          1.000000
Solids            1.000000
Conductivity      1.000000
Organic_carbon    0.876878
dtype: float64


In [148]:
#16
min_values = {'Hardness': 0.096553, 'Solids': 0.0, 'Conductivity': 0.0, 'Organic_carbon': 0.0}
max_values = {'Hardness': 1.0, 'Solids': 1.0, 'Conductivity': 1.0, 'Organic_carbon': 0.876878}

for column in test_data.columns:
    test_data[column] = (test_data[column] - min_values[column]) / (max_values[column] - min_values[column])

print(test_data.head())

      Hardness    Solids  Conductivity  Organic_carbon
2290  0.460515  0.736014      0.408896        0.698854
1858  0.269141  0.297367      0.157202        0.507066
902   0.444768  0.280127      0.330193        0.469066
2236  0.519362  0.179179      0.586289        0.676230
1100  0.312204  0.094367      0.437115        0.467550


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[column] = (test_data[column] - min_values[column]) / (max_values[column] - min_values[column])


In [153]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, train_label)

class_result = knn.predict(X_test)

print(class_result)

[0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1.
 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.
 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0.

  return self._fit(X, y)


In [155]:
error_count = sum(class_result != test_label)

print("Jumlah kesalahan:", error_count)

  error_count = sum(class_result != test_label)


ValueError: Unable to coerce to Series, length must be 1: given 499