In [1]:
# Import necessary Python, sklearn and/or tensorflow/keras modules for loading the dataset
import os
import pandas as pd
import numpy as np


#Load the data
DATA_FILE = 'Data_Level5_BAH_OceanCleanup.csv'
if not os.path.exists(DATA_FILE):
    raise Exception('Data file not found. Make sure that the file is located in the same directory as the notebook')

df = pd.read_csv(DATA_FILE, sep=',', header=0, index_col=False)

# Basic overview of data shape, size, and type
df.info()

# Print data shape via built-in methods of sklearn, pandas or tensorflow/keras (or other modules)
print('\nDataframe shape: ', df.shape)

df.iloc[df['Miles'].argmax()]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37904 entries, 0 to 37903
Data columns (total 61 columns):
Cleanup ID                                   37904 non-null object
Zone                                         37665 non-null object
State                                        37577 non-null object
Country                                      37903 non-null object
GPS                                          37903 non-null object
Cleanup Type                                 37903 non-null object
Cleanup Date                                 37903 non-null object
Group Name                                   26636 non-null object
Adults                                       37903 non-null float64
Children                                     37903 non-null float64
People                                       37903 non-null float64
Pounds                                       37903 non-null float64
Miles                                        37903 non-null float64
# of bags      

  interactivity=interactivity, compiler=compiler, result=result)
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


Cleanup ID                                              71463
Zone                              Santa Clara County, CA, USA
State                                         California, USA
Country                                         United States
GPS                                          37.422, -122.084
                                             ...             
Personal Hygiene (Clean Swell)                              0
Foam Pieces                                                 0
Glass Pieces                                                0
Plastic Pieces                                              0
Total Items Collected                                       1
Name: 29152, Length: 61, dtype: object

In [2]:
for col in df:
    print(df[col].describe(), '\n\n')

count     37904
unique    37904
top       71820
freq          1
Name: Cleanup ID, dtype: int64 


count                           37665
unique                            900
top       Los Angeles County, CA, USA
freq                             2499
Name: Zone, dtype: object 


count            37577
unique              53
top       Florida, USA
freq             11319
Name: State, dtype: object 


count             37903
unique                1
top       United States
freq              37903
Name: Country, dtype: object 


count                   37903
unique                  33030
top       27.73249, -82.74755
freq                      218
Name: GPS, dtype: object 


count                                  37903
unique                                     3
top       Land (beach, shoreline and inland)
freq                                   37116
Name: Cleanup Type, dtype: object 


count         37903
unique         1730
top       9/15/2018
freq           3152
Name: Cleanup Date, dtype:

In [13]:
trash_types = df.loc[:, 'Cigarette Butts':'Plastic Pieces']  # isolate trash types
trash_types['Max'] = trash_types.loc[:].idxmax(axis=1)  # find most frequent trash item

cl_df = df.drop(df.loc[:,'Cigarette Butts':'Plastic Pieces'].head(0).columns, axis=1)  # separate trash classifier df
cl_df['most_freq_trash'] = trash_types['Max']

cl_df.head()
print(len(trash_types.columns) - 1)

46


In [16]:
#  Used sklearn libraries

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

In [5]:
import numpy as np

# Get county and state separately
cl_df['Zone'] = df['Zone'].str.split(',', n = 1, expand=True)
cl_df['State'] = df['State'].str.split(',', n = 1, expand=True)

# Change format of 'date' from mm/dd/yyyy to yyyy and rename to 'year' 
cl_df['Cleanup Date'] = df['Cleanup Date'].str.split('/', n = 2, expand=False)
cl_df['year'] = cl_df['Cleanup Date'].str[2].astype('float')
#cl_df = cl_df[cl_df['year'] < 1990]

# Get GPS x and y
cl_df['GPS'] = cl_df['GPS'].str.split(',', n=2, expand=False)
cl_df['gps_y'] = cl_df['GPS'].str[0].astype('float')
cl_df['gps_x'] = cl_df['GPS'].str[1].astype('float')

cl_df.head()
#cl_df.info()

Unnamed: 0,Cleanup ID,Zone,State,Country,GPS,Cleanup Type,Cleanup Date,Group Name,Adults,Children,People,Pounds,Miles,# of bags,Total Items Collected,most_freq_trash,year,gps_y,gps_x
0,86,Cuyahoga County,Ohio,United States,"[41.49742, -81.56505]","Land (beach, shoreline and inland)","[12, 22, 2014]",,1.0,0.0,1.0,0.04,0.009,0.0,3.0,"Straws, Stirrers",2014.0,41.49742,-81.56505
1,87,Fairfax County,Virginia,United States,"[38.87456, -77.24428]","Land (beach, shoreline and inland)","[12, 24, 2014]",,1.0,0.0,1.0,0.73,1.3903,0.0,10.0,Bottle Caps (Plastic),2014.0,38.87456,-77.24428
2,92,Washington,District of Columbia,United States,"[38.90733, -77.04405]","Land (beach, shoreline and inland)","[2, 19, 2015]",,1.0,0.0,1.0,0.63,0.0214,0.0,4.0,Bottle Caps (Plastic),2015.0,38.90733,-77.04405
3,93,Washington,District of Columbia,United States,"[38.90733, -77.0441]","Land (beach, shoreline and inland)","[2, 19, 2015]",,1.0,0.0,1.0,0.06,0.0302,0.0,14.0,Bottle Caps (Plastic),2015.0,38.90733,-77.0441
4,94,Washington,District of Columbia,United States,"[38.90733, -77.0441]","Land (beach, shoreline and inland)","[2, 19, 2015]",,1.0,0.0,1.0,0.06,0.0302,0.0,14.0,Bottle Caps (Plastic),2015.0,38.90733,-77.0441


In [6]:
# Drop poor features
cl_df.drop(['Cleanup ID', 
            'GPS',
            'Group Name',
            'Cleanup Date',
            'Country'], 
             axis=1,
             inplace=True)


for col in cl_df:
    cl_df.dropna(subset=[col], inplace=True)

In [7]:
X = cl_df.drop(columns=['most_freq_trash'])
y = cl_df['most_freq_trash']

cat_df_list = list(X.select_dtypes(include=['object']))
num_df_list = list(X.select_dtypes(include=['float64', 'int64']))
cl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37509 entries, 0 to 37902
Data columns (total 14 columns):
Zone                     37509 non-null object
State                    37509 non-null object
Cleanup Type             37509 non-null object
Adults                   37509 non-null float64
Children                 37509 non-null float64
People                   37509 non-null float64
Pounds                   37509 non-null float64
Miles                    37509 non-null float64
# of bags                37509 non-null float64
Total Items Collected    37509 non-null float64
most_freq_trash          37509 non-null object
year                     37509 non-null float64
gps_y                    37509 non-null float64
gps_x                    37509 non-null float64
dtypes: float64(10), object(4)
memory usage: 4.3+ MB


In [8]:
# pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    
])

# pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])

X_pipeline = ColumnTransformer([
    ('cat', cat_pipeline, cat_df_list),
    ('num', num_pipeline, num_df_list),
])

In [17]:
# Split to train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prep data with pipeline
X_prepared = X_pipeline.fit_transform(X)  # Whole set ran through pipeline for cross-val
X_train_prepared = X_pipeline.transform(X_train)
X_test_prepared = X_pipeline.transform(X_test)

oe = OrdinalEncoder()
oe.fit_transform(y)

y.head()

ValueError: Expected 2D array, got 1D array instead:
array=['Straws, Stirrers' 'Bottle Caps (Plastic)' 'Bottle Caps (Plastic)' ...
 'Cigarette Butts' 'Plastic Pieces' 'Cigarette Butts'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [15]:
import tensorflow as tf
from sklearn import metrics

def build_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(50, input_dim=X_train_prepared.shape[1], activation = 'relu', activity_regularizer=tf.keras.regularizers.l1(0.001)))
    model.add(tf.keras.layers.Dense(25, activation = 'relu', activity_regularizer=tf.keras.regularizers.l1(0.001)))
    model.add(tf.keras.layers.Dense(len(trash_types.columns) - 1, activation = 'softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
    
# # Get AUC from cross-val
# model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=build_model, epochs=80, verbose=0)
# nn_cv_score = cross_val_score(model, X_train_prepared, y_train.values, cv=10, scoring='roc_auc')
# print('Mean AUC Score - Neural Network: ', nn_cv_score.mean())

# Train and evaluate on test data
nn = build_model()
nn_history = nn.fit(X_train_prepared, y_train.values, epochs=80, verbose=True)

nn_preds = nn.predict(X_test_prepared)
nn_fpr, nn_tpr, nn_thresholds = metrics.roc_curve(y_test.values, nn_preds)
nn_roc_auc = metrics.auc(nn_fpr, nn_tpr)

nn_output = nn.evaluate(X_test_prepared, y_test.values, verbose=False)
print('Test Batch NN AUC: ', nn_roc_auc)
print('Test Batch NN Accuracy: ', nn_output[1])

Train on 30007 samples
Epoch 1/80


ValueError: could not convert string to float: 'Plastic Pieces'