## Load the Data

In [190]:
import pandas as pd 
import os

# function to load data
def load_data(data_path, file_name):
    csv_path = os.path.join(data_path, file_name)
    return pd.read_csv(csv_path, low_memory=False)

ocean = load_data("data", "Data_Level5_BAH_OceanCleanup.csv")

In [202]:
ocean['Cleanup Type'].value_counts()

Land (beach, shoreline and inland)                  36923
Watercraft (powerboat, sailboat, kayak or canoe)      560
Underwater                                            182
Name: Cleanup Type, dtype: int64

## Seperate Training and the Test Set

In [193]:
# drop nan
ocean.dropna(subset=["Cleanup Type"], inplace=True)
ocean.dropna(subset=["Zone"], inplace=True)

# seperate labels and predicators
X = ocean.drop(['Zone', 'Cleanup ID', 'State', 'Country', 'GPS', 'Cleanup Type',
                       'Cleanup Date', 'Group Name'], axis=1) 
y = ocean["Cleanup Type"].copy()

In [194]:
from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 42 <3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(len(X_train))
print(len(X_test))

print(len(y_train))
print(len(y_test))

30132
7533
30132
7533


## ML Pipeline

In [195]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# numerical values pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


# prepare the data
X_train = num_pipeline.fit_transform(X_train)
X_test = num_pipeline.transform(X_test)

In [196]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

## Train Models and Evaluate on the Training Set¶

In [233]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# function to print out classification model report
def classification_report(model_name, test, pred, label="1"):
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.4f}'.format(accuracy_score(test, pred)))
    print("     Precision: ", '{:,.4f}'.format(precision_score(test, pred, pos_label=label, average='weighted')))
    print("        Recall: ", '{:,.4f}'.format(recall_score(test, pred, pos_label=label, average='weighted')))
    print("      F1 score: ", '{:,.4f}'.format(f1_score(test, pred, pos_label=label, average='weighted')))


## Multiclass Classification with KNN classifier

In [234]:
from sklearn.neighbors import KNeighborsClassifier

knnc = KNeighborsClassifier(weights='distance', n_neighbors=4)
knnc.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='distance')

In [235]:
y_knn_pred = knnc.predict(X_test)
classification_report("Test data - KNN classifier report, Clean-up Type", y_test, y_knn_pred, "Underwater")

Test data - KNN classifier report, Clean-up Type :

Accuracy Score:  0.9796
     Precision:  0.9707
        Recall:  0.9796
      F1 score:  0.9727


