Attributes are formatted as
 1  Q-E        (input flow to plant)  
 2  ZN-E       (input Zinc to plant)
 3  PH-E       (input pH to plant) 
 4  DBO-E      (input Biological demand of oxygen to plant) 
 5  DQO-E      (input chemical demand of oxygen to plant)
 6  SS-E       (input suspended solids to plant)  
 7  SSV-E      (input volatile supended solids to plant)
 8  SED-E      (input sediments to plant) 
 9  COND-E     (input conductivity to plant) 
10  PH-P       (input pH to primary settler)
11  DBO-P      (input Biological demand of oxygen to primary settler)
12  SS-P       (input suspended solids to primary settler)
13  SSV-P      (input volatile supended solids to primary settler)
14  SED-P      (input sediments to primary settler) 
15  COND-P     (input conductivity to primary settler)
16  PH-D       (input pH to secondary settler) 
17  DBO-D      (input Biological demand of oxygen to secondary settler)
18  DQO-D      (input chemical demand of oxygen to secondary settler)
19  SS-D       (input suspended solids to secondary settler)
20  SSV-D      (input volatile supended solids to secondary settler)
21  SED-D      (input sediments to secondary settler)  
22  COND-D     (input conductivity to secondary settler) 
23  PH-S       (output pH)   
24  DBO-S      (output Biological demand of oxygen)
25  DQO-S      (output chemical demand of oxygen)
26  SS-S       (output suspended solids)
27  SSV-S      (output volatile supended solids) 
28  SED-S      (output sediments) 
29  COND-S     (output conductivity)
30  RD-DBO-P   (performance input Biological demand of oxygen in primary settler)
31  RD-SS-P    (performance input suspended solids to primary settler)
32  RD-SED-P   (performance input sediments to primary settler)
33  RD-DBO-S   (performance input Biological demand of oxygen to secondary settler)
34  RD-DQO-S   (performance input chemical demand of oxygen to secondary settler)
35  RD-DBO-G   (global performance input Biological demand of oxygen)
36  RD-DQO-G   (global performance input chemical demand of oxygen)
37  RD-SS-G    (global performance input suspended solids) 
38  RD-SED-G   (global performance input sediments)

In [239]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# find the csv file
csv_file = 'water-treatment.csv'

# read the file into a data frame
df = pd.read_csv(csv_file, engine='python', on_bad_lines='skip')
# prevent errors when reading a '?' (because its not a valid number)
# all ?s are replaced with -1
df = df.replace("?", -1)
df = df.replace('?', -1)
df = df.apply(pd.to_numeric, errors='ignore')
print(df.head())

       date    Q-E  ZN-E  PH-E  DBO-E  DQO-E  SS-E  SSV-E  SED-E  COND-E  ...  \
0  D-1/3/90  44101   1.5   7.8     -1    407   166   66.3    4.5    2110  ...   
1  D-2/3/90  39024   3.0   7.7     -1    443   214   69.2    6.5    2660  ...   
2  D-4/3/90  32229   5.0   7.6     -1    528   186   69.9    3.4    1666  ...   
3  D-5/3/90  35023   3.5   7.9    205    588   192   65.6    4.5    2430  ...   
4  D-6/3/90  36924   1.5   8.0    242    496   176   64.8    4.0    2110  ...   

   COND-S  RD-DBO-P  RD-SS-P  RD-SED-P  RD-DBO-S  RD-DQO-S  RD-DBO-G  \
0    2000      -1.0     58.8      95.5      -1.0      70.0      -1.0   
1    2590      -1.0     60.7      94.8      -1.0      80.8      -1.0   
2    1888      -1.0     58.2      95.6      -1.0      52.9      -1.0   
3    1840      33.1     64.2      95.3      87.3      72.3      90.2   
4    2120      -1.0     62.7      95.6      -1.0      71.0      92.1   

   RD-DQO-G  RD-SS-G  RD-SED-G  
0      79.4     87.3      99.6  
1      79.5   

  df = df.apply(pd.to_numeric, errors='ignore')


In [240]:
# add a new column called 'SAFE-PH-S' in df
# it should be "yes" if the pH is within the range of 6.5 to 8.5, and "no" otherwise
def classify_ph(ph_value):
    if ph_value >= 6.5 and ph_value <= 8.5:
        return 'yes'
    else:
        return 'no'
df['SAFE-PH-S'] = df.iloc[:, 23].apply(classify_ph)
print(df.iloc[:, 23])
print(df["SAFE-PH-S"].value_counts())

0      7.3
1      7.5
2      7.6
3      7.6
4      7.6
      ... 
522    7.9
523    7.9
524    7.7
525    7.7
526    7.3
Name: PH-S, Length: 527, dtype: float64
SAFE-PH-S
yes    525
no       2
Name: count, dtype: int64


In [241]:
# compare column 3 with column 23, 4 with 24, etc
pairs = [ (2, 22),
         (3, 23),
         (4, 24),
         (5, 25), 
         (6, 26),
         (7, 27), 
         (8, 28)
]

In [242]:
# turn it into 1 data frame
combined_rows = []
for x_col, y_col in pairs:
    temp = pd.DataFrame({ 'input': df.iloc[:, x_col], 'output': df.iloc[:, y_col] })
    combined_rows.append(temp)
combined_df = pd.concat(combined_rows, ignore_index=True)
print(combined_df)

      input   output
0       1.5  2010.00
1       3.0  2700.00
2       5.0  1742.00
3       3.5  2060.00
4       1.5  2250.00
...     ...      ...
3684    2.3     0.01
3685    4.0     0.00
3686    3.0     0.02
3687    4.0     0.01
3688   -1.0    -1.00

[3689 rows x 2 columns]


In [243]:
# find correlation
correlation = combined_df.corr()
print(correlation)
# get training parameters
x = df.iloc[:, [22]]
y = df["SAFE-PH-S"]
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42 , stratify=y)

           input    output
input   1.000000 -0.273159
output -0.273159  1.000000


In [244]:
# train model 1
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model1accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model1report = classification_report(y_test, y_pred)
scores1 = cross_val_score(model1, x, y, cv=5, scoring="accuracy")
print(scores1)

Accuracy: 1.0
              precision    recall  f1-score   support

         yes       1.00      1.00      1.00       106

    accuracy                           1.00       106
   macro avg       1.00      1.00      1.00       106
weighted avg       1.00      1.00      1.00       106

[0.99056604 0.98113208 1.         1.         1.        ]




In [245]:
# train model 2
model2 = RandomForestClassifier()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model2accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model2report = classification_report(y_test, y_pred)
scores2 = cross_val_score(model1, x, y, cv=5, scoring="accuracy")
print(scores2)

Accuracy: 1.0
              precision    recall  f1-score   support

         yes       1.00      1.00      1.00       106

    accuracy                           1.00       106
   macro avg       1.00      1.00      1.00       106
weighted avg       1.00      1.00      1.00       106

[0.99056604 0.98113208 1.         1.         1.        ]




In [246]:
# output the results to output.txt
with open("output.txt", "w") as f:
    f.write(f"Model 1:\n")
    f.write(f"Accuracy: {model1accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model1report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores1}\n\n")
    f.write("Model 2:\n")
    f.write(f"Accuracy: {model2accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model2report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores2}\n\n")