In [1]:
import numpy as np 
import pandas as pd 

In [2]:
from sklearn.model_selection import train_test_split

# Load the water potability data (not Iris) and keep only rows without missing values.
# The chained .dropna() returns a new frame instead of mutating one in place,
# so re-running this cell always starts again from the file contents.
water_data = pd.read_csv('water_potability.csv').dropna()

# Extract features (X) and target labels (y) as NumPy arrays via .values
y = water_data['Potability'].values
X = water_data.drop('Potability', axis=1).values

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets to verify the split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1608, 9)
Shape of X_test: (403, 9)
Shape of y_train: (1608,)
Shape of y_test: (403,)


In [3]:
from sklearn import datasets

# Swap in the small Iris dataset.
# NOTE(review): this rebinds X, y and the four split variables, so every later cell
# that reads X_train / y_train / X_test / y_test works on Iris rather than on the
# water potability split created earlier.
iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the container type, then one "Shape of ...: (...)" line per split
print(type(X_train))
print(
    "\n".join(
        f"{caption} {split.shape}"
        for caption, split in (
            ("Shape of X_train:", X_train),
            ("Shape of X_test:", X_test),
            ("Shape of y_train:", y_train),
            ("Shape of y_test:", y_test),
        )
    )
)


<class 'numpy.ndarray'>
Shape of X_train: (120, 4)
Shape of X_test: (30, 4)
Shape of y_train: (120,)
Shape of y_test: (30,)


## Training a simple Decision Tree model to predict water potability

In [4]:
from dt_exer import DecisionTreeClassifier
from sklearn.metrics import f1_score

# NOTE(review): X_train / y_train / X_test / y_test are whichever split was assigned
# most recently; in the recorded run that is the Iris split (120 training rows),
# not the water potability data named in the section heading.

# Build the from-scratch classifier with max_depth=2
clf = DecisionTreeClassifier(max_depth=2)
print("Made classifier")

# Fit on the training split
print("fitting....")
clf.fit(X_train, y_train)

# Predict labels for the held-out split
print("predicting....")
y_pred = clf.predict(X_test)

# Score predictions against the true labels with the weighted-average F1
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

Predictions: [1, 1, 0]
Made classifier
fitting....
predicting....
F1 Score: 0.9664109121909632


### Results of this basic first test:
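- A single decision tree has an F1 score of 0.52 on the water potability data (the 0.97 printed above appears to come from a run on the Iris split loaded in cell 3)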
- Training the decision tree takes around 1 minute
  - This is probably because there are multiple features to search through and the class
    that I wrote from scratch is not optimized

## Training a Random Forest model to predict water potability

In [5]:
# Reload the water potability data as a DataFrame, keeping only complete rows
water_data_df = pd.read_csv('water_potability.csv').dropna()

# Target stays a Series and the features stay a DataFrame (no .values here)
y = water_data_df['Potability']
X = water_data_df.drop(columns='Potability')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece to verify the split
print(
    "\n".join(
        f"{caption} {split.shape}"
        for caption, split in (
            ("Shape of X_train:", X_train),
            ("Shape of X_test:", X_test),
            ("Shape of y_train:", y_train),
            ("Shape of y_test:", y_test),
        )
    )
)

Shape of X_train: (1608, 9)
Shape of X_test: (403, 9)
Shape of y_train: (1608,)
Shape of y_test: (403,)


In [6]:
from sklearn import datasets

# Load Iris again and split it.
# NOTE(review): this rebinds X, y and the four split variables, replacing the
# DataFrame-based water potability split from the previous cell; the random forest
# cell below therefore runs on Iris.
iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the container type, then one "Shape of ...: (...)" line per split
print(type(X_train))
print(
    "\n".join(
        f"{caption} {split.shape}"
        for caption, split in (
            ("Shape of X_train:", X_train),
            ("Shape of X_test:", X_test),
            ("Shape of y_train:", y_train),
            ("Shape of y_test:", y_test),
        )
    )
)


<class 'numpy.ndarray'>
Shape of X_train: (120, 4)
Shape of X_test: (30, 4)
Shape of y_train: (120,)
Shape of y_test: (30,)


In [7]:
from rf import RandomForest
from sklearn.metrics import f1_score

# NOTE(review): X_train / y_train / X_test / y_test are whichever split was assigned
# most recently; in the recorded run that is the Iris split from the preceding cell.

print("Making classifier")
# Instantiate the from-scratch random forest
clf = RandomForest(max_depth=1, max_features=3, n_trees=20)

print("fitting....")
# Train the classifier
clf.fit(X_train, y_train)

print("predicting....")
# Make predictions on the test data
y_pred = clf.predict(X_test)

# The metric computed below is the weighted-average F1, not accuracy,
# so the progress message names it accordingly.
print("Calculating F1 score...")
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

Making classifier
fitting....
row index:  [ 20  45  41  99  42  79  70 111   2  31  72  48  72  74  48  60  12  75
  63  41 114  93  34  25  29  18  21  15  70  81  34  22  36  94  32 107
   3  63 111   5   7  72  10  96  65  82 103  11  52   3 106  73  36  66
  73  56   3 104  97  30  30   0  76   5  51  78 118  32  48  66 104  82
   3  76 104  65 119  16 109 116 109 111  92 102  79  30  79  26  91   0
 115  57  51 109  38  48  30 101 101  54   9  92 108  71  20   8  91 110
 115  89  56  52  41   2 106  94  87  51  37  42]
feature index:  [1 2 3]
[[2.9 4.2 1.3]
 [3.  4.5 1.5]
 [4.2 1.4 0.2]
 [2.4 3.3 1. ]
 [3.  4.8 1.8]
 [2.9 4.7 1.4]
 [3.1 1.5 0.2]
 [2.9 4.3 1.3]
 [3.1 4.4 1.4]
 [3.4 1.9 0.2]
 [3.  1.3 0.2]
 [3.9 1.7 0.4]
 [3.  1.3 0.2]
 [3.4 5.6 2.4]
 [3.9 1.7 0.4]
 [2.5 3.  1.1]
 [3.1 4.7 1.5]
 [3.4 1.4 0.3]
 [2.6 5.6 1.4]
 [4.2 1.4 0.2]
 [3.4 1.7 0.2]
 [2.6 4.4 1.2]
 [2.7 4.2 1.3]
 [2.7 4.1 1. ]
 [2.  3.5 1. ]
 [3.  4.4 1.4]
 [3.  6.6 2.1]
 [2.4 3.7 1. ]
 [3.1 1.5 0.2]
 [2.8 4.8 1