EDA on trawler dataset as a proxy for the more general dataset

In [1]:
import pandas as pd
import numpy as np
import time

import geopy.distance

import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interactive, FloatSlider

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, plot_roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

import eda

RANDOM = 42

In [2]:
fishing_path = "./data/fishing boats/"

trawler_raw_df = pd.read_csv(fishing_path + "trawlers.csv", parse_dates=['timestamp'], date_parser=lambda col: pd.to_datetime(col, unit='s'))

In [3]:
boats_raw_df = pd.read_csv(fishing_path + "fishing-vessels-v1.csv")

In [4]:
boats_raw_df

Unnamed: 0,mmsi,flag,geartype,length,tonnage,engine_power,active_2012,active_2013,active_2014,active_2015,active_2016
0,603100157,AGO,trawlers,32.808468,299.003814,733.826977,False,False,False,True,True
1,603100137,AGO,trawlers,34.568782,395.683171,864.960188,False,False,False,True,True
2,603100161,AGO,trawlers,28.822140,263.849149,651.809642,False,False,False,True,True
3,603100174,AGO,trawlers,30.721429,299.700916,703.796086,False,False,False,True,True
4,603100164,AGO,trawlers,37.479248,405.967747,850.976640,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...
73004,601764000,ZAF,drifting_longlines,17.032258,52.550920,207.818629,False,True,True,True,True
73005,601089100,ZAF,drifting_longlines,22.915622,180.506028,527.507463,False,False,False,True,True
73006,601849000,ZAF,drifting_longlines,19.692080,86.485661,362.553279,False,False,True,True,True
73007,601183700,ZAF,drifting_longlines,31.872860,307.446773,977.330342,False,False,True,True,True


In [5]:
trawler_raw_df

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,1.252340e+12,2012-01-01 00:00:38,0.000000,0.000000,0.0,153.000000,52.458649,4.581200,-1.0,gfw
1,1.252340e+12,2012-01-01 00:33:13,0.000000,0.000000,0.0,153.000000,52.458668,4.581167,-1.0,gfw
2,1.252340e+12,2012-01-01 00:43:14,0.000000,0.000000,0.0,153.000000,52.458633,4.581183,-1.0,gfw
3,1.252340e+12,2012-01-01 00:59:55,0.000000,0.000000,0.0,153.000000,52.458649,4.581234,-1.0,gfw
4,1.252340e+12,2012-01-01 01:16:00,0.000000,0.000000,0.0,153.000000,52.458649,4.581183,-1.0,gfw
...,...,...,...,...,...,...,...,...,...,...
4369096,1.838128e+14,2016-11-24 23:36:39,172998.640625,239263.859375,11.2,166.800003,-42.847645,175.192535,-1.0,false_positives
4369097,1.838128e+14,2016-11-24 23:38:37,173595.859375,240246.062500,11.7,163.100006,-42.853741,175.194473,-1.0,false_positives
4369098,1.838128e+14,2016-11-24 23:40:39,174995.703125,241416.937500,11.2,168.699997,-42.860050,175.196487,-1.0,false_positives
4369099,1.838128e+14,2016-11-24 23:41:38,174995.703125,241416.937500,11.5,169.100006,-42.863056,175.197464,-1.0,false_positives


In [6]:
trawler_raw_df.is_fishing.value_counts()

-1.000000    4191707
 0.000000     112999
 1.000000      61930
 0.666667       1639
 0.333333        760
 0.750000         34
 0.250000         32
Name: is_fishing, dtype: int64

In [7]:
trawler_df = trawler_raw_df[(trawler_raw_df.is_fishing == 1) | (trawler_raw_df.is_fishing == 0)]

In [8]:
trawler_df

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
60646,1.252340e+12,2015-01-01 05:08:23,0.000000,0.000000,0.0,128.000000,52.458717,4.581316,0.0,gfw
60647,1.252340e+12,2015-01-01 05:20:34,0.000000,0.000000,0.0,128.000000,52.458733,4.581316,0.0,gfw
60648,1.252340e+12,2015-01-01 05:32:53,0.000000,0.000000,0.0,128.000000,52.458698,4.581267,0.0,gfw
60649,1.252340e+12,2015-01-01 05:45:23,0.000000,0.000000,0.0,128.000000,52.458698,4.581234,0.0,gfw
60650,1.252340e+12,2015-01-01 05:57:24,0.000000,0.000000,0.0,128.000000,52.458683,4.581183,0.0,gfw
...,...,...,...,...,...,...,...,...,...,...
4324371,1.838128e+14,2015-08-28 10:30:09,23344.662109,54560.550781,10.3,123.300003,-40.340950,172.884033,0.0,false_positives
4324372,1.838128e+14,2015-08-28 10:46:14,21212.683594,54376.968750,10.2,120.400002,-40.362152,172.927811,0.0,false_positives
4324373,1.838128e+14,2015-08-28 11:04:38,18681.083984,55315.910156,10.1,124.400002,-40.393635,172.991333,0.0,false_positives
4324374,1.838128e+14,2015-08-28 11:21:16,16999.582031,56637.820312,10.4,121.099998,-40.420013,173.044662,0.0,false_positives


In [9]:
trawler_df.groupby('mmsi').mean().count()

distance_from_shore    45
distance_from_port     45
speed                  45
course                 45
lat                    45
lon                    45
is_fishing             45
dtype: int64

In [10]:
trawler_df.source.value_counts()

dalhousie_trawl    89974
gfw                80896
false_positives     4059
Name: source, dtype: int64

In [11]:
trawler_df[trawler_df.source == 'false_positives'].is_fishing.value_counts()

0.0    4059
Name: is_fishing, dtype: int64

In [12]:
trawler_df['year'] = pd.DatetimeIndex(trawler_df['timestamp']).year
trawler_df['month'] = pd.DatetimeIndex(trawler_df['timestamp']).month
trawler_df['day'] = pd.DatetimeIndex(trawler_df['timestamp']).day
trawler_df['hour'] = pd.DatetimeIndex(trawler_df['timestamp']).hour
trawler_df['minute'] = pd.DatetimeIndex(trawler_df['timestamp']).minute

In [13]:
trawler_df.groupby(['mmsi', 'year', 'month', 'day', 'hour', 'minute']).mean().is_fishing.value_counts()

0.0    106364
1.0     50511
Name: is_fishing, dtype: int64

In [14]:
trawler_df.groupby(['mmsi', 'year', 'month', 'day', 'hour', 'minute']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
mmsi,year,month,day,hour,minute,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.252340e+12,2015,1,1,5,8,0.000000,0.000000,0.0,128.000000,52.458717,4.581316,0.0
1.252340e+12,2015,1,1,5,20,0.000000,0.000000,0.0,128.000000,52.458733,4.581316,0.0
1.252340e+12,2015,1,1,5,32,0.000000,0.000000,0.0,128.000000,52.458698,4.581267,0.0
1.252340e+12,2015,1,1,5,45,0.000000,0.000000,0.0,128.000000,52.458698,4.581234,0.0
1.252340e+12,2015,1,1,5,57,0.000000,0.000000,0.0,128.000000,52.458683,4.581183,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2.775153e+14,2015,12,29,15,22,11045.089844,17463.820312,3.4,291.000000,43.636147,-7.092262,1.0
2.775153e+14,2015,12,29,19,33,0.000000,11401.474609,0.0,275.600006,43.656952,-7.351518,1.0
2.775153e+14,2015,12,30,0,4,16999.582031,29832.136719,3.2,89.800003,43.677372,-6.855945,0.0
2.775153e+14,2015,12,30,12,0,9999.754883,21259.769531,3.1,66.699997,43.635574,-6.906517,0.0


In [15]:
trawler_df.year.value_counts()

2015    127461
2014     23155
2013     16980
2012      7229
2016       104
Name: year, dtype: int64

In [16]:
trawler_mmsi = set(trawler_df.mmsi.values)

In [17]:
trawler_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4369101 entries, 0 to 4369100
Data columns (total 10 columns):
 #   Column               Dtype         
---  ------               -----         
 0   mmsi                 float64       
 1   timestamp            datetime64[ns]
 2   distance_from_shore  float64       
 3   distance_from_port   float64       
 4   speed                float64       
 5   course               float64       
 6   lat                  float64       
 7   lon                  float64       
 8   is_fishing           float64       
 9   source               object        
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 333.3+ MB


In [18]:
trawler_mmsi.isdisjoint(set(boats_raw_df.mmsi.values))

True

In [19]:
trawler_df['is_new_mmsi'] = (trawler_df.mmsi.shift(1) != trawler_df.mmsi.shift(0))

In [20]:
prev_list = ['speed', 'lat', 'lon']

for col in prev_list:
    trawler_df['prev_' + col] = trawler_df[col].shift(1)
    trawler_df['prev_' + col][trawler_df.is_new_mmsi] = np.nan

In [21]:
trawler_df[trawler_df.mmsi == list(trawler_mmsi)[-1]][['lat', 'prev_lat']]

Unnamed: 0,lat,prev_lat
231509,53.866112,
231510,53.857353,53.866112
231511,53.851585,53.857353
231514,53.840481,53.851585
231515,53.837486,53.840481
...,...,...
235625,53.522789,53.522785
235626,53.522827,53.522789
235627,53.522808,53.522827
235628,53.522808,53.522808


In [22]:
def getdist(df):
    if np.isnan(df.prev_lat):
        return np.nan
    else:
        return geopy.distance.distance((df.lat, df.lon), (df.prev_lat, df.prev_lon)).km

In [23]:
trawler_df['dist_moved'] = trawler_df.apply(getdist, axis=1)

60646            NaN
60647       0.001698
60648       0.005095
60649       0.002269
60650       0.003803
             ...    
4324371     5.202731
4324372     4.401632
4324373     6.428128
4324374     5.391887
4324375    21.120160
Name: dist_moved, Length: 174929, dtype: float64

In [24]:
trawler_df['time_taken'] = n. trawler_df.timestamp.diff(1)
trawler_df['time_taken'][trawler_df.is_new_mmsi] = np.nan

NameError: name 'n' is not defined

In [25]:
trawler_df[trawler_df.mmsi == list(trawler_mmsi)[-1]][['timestamp', 'time_taken']]

KeyError: "['time_taken'] not in index"

In [26]:
trawler_df.columns

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'is_fishing', 'source', 'year',
       'month', 'day', 'hour', 'minute', 'is_new_mmsi', 'prev_speed',
       'prev_lat', 'prev_lon', 'dist_moved'],
      dtype='object')

In [27]:
np.random.seed(42)
df_mask = np.random.rand(len(trawler_df)) < 0.8

trawler_train_df = trawler_df[df_mask].dropna()
trawler_test_df = trawler_df[~df_mask].dropna()

In [28]:
feature_cols = ['distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'year',
       'month', 'day', 'hour', 'minute', 'prev_speed',
       'dist_moved']

train_X = trawler_train_df[feature_cols]
train_y = trawler_train_df['is_fishing']

test_X = trawler_test_df[feature_cols]
test_y = trawler_test_df['is_fishing']

train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139843 entries, 60648 to 4324375
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   distance_from_shore  139843 non-null  float64
 1   distance_from_port   139843 non-null  float64
 2   speed                139843 non-null  float64
 3   course               139843 non-null  float64
 4   lat                  139843 non-null  float64
 5   lon                  139843 non-null  float64
 6   year                 139843 non-null  int64  
 7   month                139843 non-null  int64  
 8   day                  139843 non-null  int64  
 9   hour                 139843 non-null  int64  
 10  minute               139843 non-null  int64  
 11  prev_speed           139843 non-null  float64
 12  dist_moved           139843 non-null  float64
dtypes: float64(8), int64(5)
memory usage: 14.9 MB


In [29]:
model_knn = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}

#use gridsearch to test all values for n_neighbors
model_knn_gscv = GridSearchCV(model_knn, param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM))

#fit model to data
model_knn_gscv.fit(train_X, train_y)
model_knn_gscv.best_params_

{'n_neighbors': 1}

In [30]:
model_knn_gscv.best_score_

0.8960191235752758

In [31]:
knn_y =  model_knn_gscv.predict(train_X)

In [32]:
make_confusion_matrix(model_knn_gscv, train_X, train_y, threshold=0.5)

NameError: name 'make_confusion_matrix' is not defined

Perfect fit!?

In [33]:
interactive(lambda threshold: make_confusion_matrix(model_knn_gscv, train_X, train_y, threshold=threshold, show_metrics=True), threshold=(0.0,1.0,0.01))

interactive(children=(FloatSlider(value=0.5, description='threshold', max=1.0, step=0.01), Output()), _dom_cla…

In [34]:
model_logreg_cv = LogisticRegressionCV(cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM))
model_logreg_cv.fit(train_X, train_y)

LogisticRegressionCV(Cs=10, class_weight=None,
                     cv=KFold(n_splits=5, random_state=42, shuffle=True),
                     dual=False, fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=None, max_iter=100, multi_class='auto',
                     n_jobs=None, penalty='l2', random_state=None, refit=True,
                     scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [35]:
interactive(lambda threshold: make_confusion_matrix(model_logreg_cv, test_X, test_y, threshold=threshold, show_metrics=True), threshold=(0.0,1.0,0.01))

interactive(children=(FloatSlider(value=0.5, description='threshold', max=1.0, step=0.01), Output()), _dom_cla…

In [36]:
threshold = 0.28

logreg_y = (model_logreg_cv.predict_proba(train_X)[:, 1] >= threshold)

f1_score(train_y, logreg_y)

0.7279276394781077

In [37]:
make_confusion_matrix(model_knn_gscv, test_X, test_y, threshold=0.5, show_metrics=True)

NameError: name 'make_confusion_matrix' is not defined

In [38]:
make_confusion_matrix(model_logreg_cv, test_X, test_y, threshold=0.28, show_metrics=True)

NameError: name 'make_confusion_matrix' is not defined

KNN likely overfitted. Let's try with neighbours > 1

In [39]:
model_knn2 = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(2, 25)}

#use gridsearch to test all values for n_neighbors
model_knn2_gscv = GridSearchCV(model_knn2, param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM))

#fit model to data
model_knn2_gscv.fit(train_X, train_y)
model_knn2_gscv.best_params_

{'n_neighbors': 3}

In [40]:
knn2_y = model_knn2_gscv.predict(train_X)

In [41]:
make_confusion_matrix(model_knn2_gscv, train_X, train_y, threshold=0.50, show_metrics=True)

NameError: name 'make_confusion_matrix' is not defined

In [42]:
make_confusion_matrix(model_knn2_gscv, test_X, test_y, threshold=0.50, show_metrics=True)

NameError: name 'make_confusion_matrix' is not defined

Better performance