In [1]:
# Import necessary Python, sklearn and/or tensorflow/keras modules for loading the dataset
import os
import pandas as pd
import numpy as np


#Load the data
DATA_FILE = 'Data_Level5_BAH_OceanCleanup.csv'
if not os.path.exists(DATA_FILE):
    raise Exception('Data file not found. Make sure that the file is located in the same directory as the notebook')

df = pd.read_csv(DATA_FILE, sep=',', header=0, index_col=False)

# Basic overview of data shape, size, and type
df.info()

# Print data shape via built-in methods of sklearn, pandas or tensorflow/keras (or other modules)
print('\nDataframe shape: ', df.shape)

df.iloc[df['Miles'].argmax()]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37904 entries, 0 to 37903
Data columns (total 61 columns):
Cleanup ID                                   37904 non-null object
Zone                                         37665 non-null object
State                                        37577 non-null object
Country                                      37903 non-null object
GPS                                          37903 non-null object
Cleanup Type                                 37903 non-null object
Cleanup Date                                 37903 non-null object
Group Name                                   26636 non-null object
Adults                                       37903 non-null float64
Children                                     37903 non-null float64
People                                       37903 non-null float64
Pounds                                       37903 non-null float64
Miles                                        37903 non-null float64
# of bags      

  interactivity=interactivity, compiler=compiler, result=result)
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


Cleanup ID                                                                71463
Zone                                                Santa Clara County, CA, USA
State                                                           California, USA
Country                                                           United States
GPS                                                            37.422, -122.084
Cleanup Type                                 Land (beach, shoreline and inland)
Cleanup Date                                                          4/22/2019
Group Name                                                                  NaN
Adults                                                                        1
Children                                                                      0
People                                                                        1
Pounds                                                                     0.01
Miles                                   

In [2]:
for col in df:
    print(df[col].describe(), '\n\n')

count     37904
unique    37904
top       71820
freq          1
Name: Cleanup ID, dtype: int64 


count                           37665
unique                            900
top       Los Angeles County, CA, USA
freq                             2499
Name: Zone, dtype: object 


count            37577
unique              53
top       Florida, USA
freq             11319
Name: State, dtype: object 


count             37903
unique                1
top       United States
freq              37903
Name: Country, dtype: object 


count                   37903
unique                  33030
top       27.73249, -82.74755
freq                      218
Name: GPS, dtype: object 


count                                  37903
unique                                     3
top       Land (beach, shoreline and inland)
freq                                   37116
Name: Cleanup Type, dtype: object 


count         37903
unique         1730
top       9/15/2018
freq           3152
Name: Cleanup Date, dtype:

count     37904.000000
mean         18.597932
std        1810.697419
min           0.000000
25%           0.000000
50%           1.000000
75%           8.000000
max      352468.000000
Name: Other Trash (Clean Swell), dtype: float64 


count    37904.000000
mean         0.808727
std         78.887605
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      15327.000000
Name: Condoms, dtype: float64 


count    37904.000000
mean         0.761344
std         74.457778
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      14429.000000
Name: Diapers, dtype: float64 


count    37904.000000
mean         0.598459
std         58.585591
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      11342.000000
Name: Syringes, dtype: float64 


count    37904.000000
mean         1.355266
std        132.347617
min          0.000000
25%          0.000000
50%          0.000000
75%  

In [36]:
trash_types = df.loc[:, 'Cigarette Butts':'Plastic Pieces']  # isolate trash types
trash_types['Max'] = trash_types.loc[:].idxmax(axis=1)  # find most frequent trash item

cl_df = df.drop(df.loc[:,'Cigarette Butts':'Plastic Pieces'].head(0).columns, axis=1)  # separate trash classifier df
cl_df['most_freq_trash'] = trash_types['Max']

cl_df.head()
cl_df['most_freq_trash'].value_counts()

Cigarette Butts                              11796
Plastic Pieces                               10307
Food Wrappers (candy, chips, etc.)            2908
Other Trash (Clean Swell)                     2892
Beverage Bottles (Plastic)                    2026
Bottle Caps (Plastic)                         1602
Foam Pieces                                    845
Beverage Cans                                  760
Grocery Bags (Plastic)                         574
Straws, Stirrers                               574
Beverage Bottles (Glass)                       491
Glass Pieces                                   336
Other Packaging (Clean Swell)                  325
Take Out/Away Containers (Plastic)             281
Fishing Gear (Clean Swell)                     249
Tires                                          219
Toys                                           207
Balloons                                       201
Cups, Plates (Plastic)                         180
Lids (Plastic)                 

In [4]:
#  Used sklearn libraries

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

In [31]:
# Change format of 'date' from mm/dd/yyyy to yyyy and rename to 'year' 
cl_df['Cleanup Date'] = df['Cleanup Date'].str.split('/', n = 2, expand=False)
cl_df['year'] = cl_df['Cleanup Date'].str[2].astype('int')
cl_df = cl_df[cl_df['year'] > 1990]

# Get GPS x and y
cl_df[['gps_y','gps_x']] = cl_df.GPS.str.split(expand=True)
cl_df[['gps_y']] = cl_df['gps_y'].str.slice(0, -1)
cl_df[['gps_y', 'gps_x']] = cl_df[['gps_y', 'gps_x']].astype('float')

# Change format of 'date' from mm/dd/yyyy to yyyy and rename to 'year' 
cl_df['County'] = df['Cleanup Date'].str.split(',', n = 1, expand=False)
cl_df['year'] = cl_df['Cleanup Date'].str[2].astype('int')
cl_df = cl_df[cl_df['year'] > 1990]

cat_df_list = list(df.select_dtypes(include=['object']))
num_df_list = list(df.select_dtypes(include=['float64']))

cl_df.head()
#cl_df.info()

ValueError: cannot convert float NaN to integer

In [None]:
# Drop poor features
cl_df.drop(['Cleanup ID', 
            'GPS',
            'Group Name'], 
             axis=1,
             inplace=True)

In [6]:
# test = cl_df[cl_df['Cleanup Date'].split('/') != 'Draw']
# test.head()

AttributeError: 'Series' object has no attribute 'split'

In [None]:
# pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    
])

# pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])

X_pipeline = ColumnTransformer([
    ('cat', cat_pipeline, cat_df_list),
    ('num', num_pipeline, num_df_list),
])