In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# find the csv file
csv_file = 'WaterAtlas-ManySites.csv'

# read the file into a data frame
df = pd.read_csv(csv_file, engine='python', on_bad_lines='skip')
print(df.head())

                                  DataSourceName  DataSourceCode  \
0  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
1  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
2  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
3  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
4  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   

           StationID ActualStationID  Latitude_DD  Longitude_DD  \
0  Bugg Springs-Lake             NaN     28.75361     -81.90444   
1  Bugg Springs-Lake             NaN     28.75361     -81.90444   
2  Bugg Springs-Lake             NaN     28.75361     -81.90444   
3  Bugg Springs-Lake             NaN     28.75361     -81.90444   
4  Bugg Springs-Lake             NaN     28.75361     -81.90444   

                SampleDate SampleTime  ActivityDepth ActivityDepthUnit  \
0  1991-08-18 00:00:00.000   00:00:00            NaN               NaN   
1  1991-08-18 00:00:00.000   00:00:00   

In [2]:
data_ph = []
# filter df for ph data
for index, row in df.iterrows():
    if row['Characteristic'] == 'pH':
        data_ph.append(row)
# create a new data frame for ph data
df_ph = pd.DataFrame(data_ph)
print(df_ph.head())

                                   DataSourceName  DataSourceCode  \
0   LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
13  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
40  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
59  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
70  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   

            StationID ActualStationID  Latitude_DD  Longitude_DD  \
0   Bugg Springs-Lake             NaN     28.75361     -81.90444   
13  Bugg Springs-Lake             NaN     28.75361     -81.90444   
40  Bugg Springs-Lake             NaN     28.75361     -81.90444   
59        Church-Lake             NaN     28.64625     -81.84342   
70        Turkey-Lake             NaN     28.70128     -81.85039   

                 SampleDate SampleTime  ActivityDepth ActivityDepthUnit  \
0   1991-08-18 00:00:00.000   00:00:00            NaN               NaN   
13  1991-03-10 00:00:00.00

In [None]:
# add a new column called 'SAFE-PH' in df_ph
# it should be "yes" if the pH is within the range of 6.5 to 8.5, and "no" otherwise
def classify_ph(ph_value):
    if ph_value >= 6.5 and ph_value <= 8.5:
        return "yes"
    else:
        return "no"
df_ph['SAFE-PH'] = df_ph['ResultValue'].apply(classify_ph)
print(df_ph.head())

# print filtered output to a new csv file
df.to_csv("filtered_output.csv", index=False)

                                   DataSourceName  DataSourceCode  \
0   LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
13  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
40  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
59  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   
70  LAKEWATCH Supplemental Water Quality Sampling  LAKEWATCH_SUPP   

            StationID ActualStationID  Latitude_DD  Longitude_DD  \
0   Bugg Springs-Lake             NaN     28.75361     -81.90444   
13  Bugg Springs-Lake             NaN     28.75361     -81.90444   
40  Bugg Springs-Lake             NaN     28.75361     -81.90444   
59        Church-Lake             NaN     28.64625     -81.84342   
70        Turkey-Lake             NaN     28.70128     -81.85039   

                 SampleDate SampleTime  ActivityDepth ActivityDepthUnit  \
0   1991-08-18 00:00:00.000   00:00:00            NaN               NaN   
13  1991-03-10 00:00:00.00

In [4]:
# get training parameters
x = df_ph[['ResultValue']]
y = df_ph['SAFE-PH']
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42 )

In [5]:
# train model 1
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model1accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model1report = classification_report(y_test, y_pred)
scores1 = cross_val_score(model1, x, y, cv=5, scoring="accuracy")
print(scores1)

Accuracy: 1.0
              precision    recall  f1-score   support

          no       1.00      1.00      1.00        15
         yes       1.00      1.00      1.00        80

    accuracy                           1.00        95
   macro avg       1.00      1.00      1.00        95
weighted avg       1.00      1.00      1.00        95

[1.        1.        0.9893617 0.9787234 1.       ]


In [6]:
# train model 2
model2 = RandomForestClassifier()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model2accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model2report = classification_report(y_test, y_pred)
scores2 = cross_val_score(model1, x, y, cv=5, scoring="accuracy")
print(scores2)

Accuracy: 1.0
              precision    recall  f1-score   support

          no       1.00      1.00      1.00        15
         yes       1.00      1.00      1.00        80

    accuracy                           1.00        95
   macro avg       1.00      1.00      1.00        95
weighted avg       1.00      1.00      1.00        95

[1.        1.        0.9893617 0.9787234 1.       ]


In [7]:
# output the results to output.txt
with open("output.txt", "w") as f:
    f.write(f"Model 1:\n")
    f.write(f"Accuracy: {model1accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model1report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores1}\n\n")
    f.write("Model 2:\n")
    f.write(f"Accuracy: {model2accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model2report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores2}\n\n")