# Exploratory Data Analysis

In [1]:
%%html
<!-- Keep DataFrame table cells on a single line (no text wrapping). -->
<style>.dataframe td {white-space: nowrap;}</style>

In [2]:
# import modules
import pandas as pd
import numpy as np

# full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)

# plotting modules
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib as mpl
%matplotlib inline
mpl.rcParams["axes.formatter.min_exponent"] = 20  # no scientific notation in graphs
plt.rcParams['axes.titley'] = 1.0    # y is in axes-relative coordinates.
plt.rcParams['axes.titlepad'] = 10   # pad is in points... default is 6
import seaborn as sns
sns.set_theme()

In [3]:
%%time
# import data and functions
# NOTE: the next cell reads the data tables (GSCAT, GSDET, GSINF, MISSIONS,
# SPECIES, df) straight off this module. Presumably the data is loaded while
# the module is imported, which would explain the wall time -- confirm in
# rv_functions.
import rv_functions as rv

Wall time: 11.1 s


In [4]:
# rename imported data to remove rv prefix
# The attributes are deleted from `rv` after being copied, so re-running this
# cell raises AttributeError on the first lookup and leaves the notebook-level
# names from the first run untouched.
try:
    GSCAT, GSDET, GSINF, MISSIONS, SPECIES, df = (
        rv.GSCAT, rv.GSDET, rv.GSINF, rv.MISSIONS, rv.SPECIES, rv.df
    )
    del rv.GSCAT, rv.GSDET, rv.GSINF, rv.MISSIONS, rv.SPECIES, rv.df
except AttributeError:
    # Only the "already moved" case is expected here. Any other failure
    # (e.g. `rv` was never imported) should surface instead of being
    # reported as completed.
    print('This step was already completed')

In [5]:
# list imported functions
# Public names exposed by `rv`: skip anything starting with an underscore and
# the 'pd' / 'px' entries (presumably library aliases imported by rv_functions).
list_of_imported_functions = [
    name
    for name in dir(rv)
    if name[0] != '_' and name not in ('pd', 'px')
]

list_of_imported_functions

['aggregate_by_geo',
 'average_geo',
 'describe_species',
 'filter_by_min_species',
 'filter_by_species',
 'filter_dates',
 'filtered_monthly',
 'filtered_yearly',
 'format_monthly',
 'format_yearly',
 'get_species',
 'graph_species',
 'map_species',
 'print_species_data',
 'scatterplot_species',
 'search_species_by_name',
 'species_codes_by_percentile',
 'species_counts',
 'top_species_by_attribute',
 'top_x_species']

# Explore Species

### Most Common Species

### Deepest Species

In [6]:
%%time
rv.top_species_by_attribute(df, min_species=None)

Wall time: 69.8 ms


Unnamed: 0_level_0,NAME,MEAN_DEPTH,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
186,TAANINGICHTHYS MINIMUS,1_806,1
38,SWALLOWERS,1_806,1
750,CARDINALFISH,1_806,1
800,POROMITRA CRASSICEPS,1_667,1
494,SCOPELOBERYX ROBUSTUS,1_667,2
1032,BATHYTROCTES MICROLEPIS,1_667,1
749,SLICKHEAD,1_650,48
109,DAINTY MORA,1_640,23
1052,ROULEINA SP.,1_618,2
1019,ILYOPHIS BRUNNEUS,1_614,6


In [7]:
%%time
rv.top_x_species(df, 20)

Wall time: 29.9 ms


Unnamed: 0_level_0,NAME,COUNT
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1
11,HADDOCK,369782
14,SILVER HAKE,251446
23,REDFISH UNSEPARATED,179947
40,AMERICAN PLAICE,168269
10,COD(ATLANTIC),119753
42,YELLOWTAIL FLOUNDER,114973
60,HERRING(ATLANTIC),100350
4511,SHORT-FIN SQUID,71117
300,LONGHORN SCULPIN,69345
220,SPINY DOGFISH,62639


In [8]:
%%time
rv.top_species_by_attribute(df, attribute=None, min_species=None)

Wall time: 26.9 ms


Unnamed: 0_level_0,NAME,COUNT
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1
11,HADDOCK,369782
14,SILVER HAKE,251446
23,REDFISH UNSEPARATED,179947
40,AMERICAN PLAICE,168269
10,COD(ATLANTIC),119753
42,YELLOWTAIL FLOUNDER,114973
60,HERRING(ATLANTIC),100350
4511,SHORT-FIN SQUID,71117
300,LONGHORN SCULPIN,69345
220,SPINY DOGFISH,62639


In [9]:
# Rank by FWT; the default aggregation yields a MEAN_FWT column.
rv.top_species_by_attribute(
    df,
    attribute='FWT',
)

Unnamed: 0_level_0,NAME,MEAN_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
72,SWORDFISH,110_500,2
592,SHARK (NS),75_000,1
230,"PORBEAGLE,MACKEREL SHARK",45_101,3
730,OCEAN SUNFISH,40_000,1
243,AMERICAN ATLANT STURGEON,22_000,10
216,ATLANTIC TORPEDO,15_650,20
900,SEALS (NS),11_240,3
25,TILE FISH,9_530,3
30,HALIBUT(ATLANTIC),3_752,5896
1072,CATAETYX LATICEPS,3_620,1


In [10]:
# Rank by the maximum FWT per species (MAX_FWT column).
rv.top_species_by_attribute(
    df,
    attribute='FWT',
    aggregation='max',
)

Unnamed: 0_level_0,NAME,MAX_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
72,SWORDFISH,200_000,2
30,HALIBUT(ATLANTIC),92_490,5896
230,"PORBEAGLE,MACKEREL SHARK",90_000,3
592,SHARK (NS),75_000,1
216,ATLANTIC TORPEDO,46_850,20
730,OCEAN SUNFISH,40_000,1
10,COD(ATLANTIC),38_000,119753
243,AMERICAN ATLANT STURGEON,35_000,10
204,WINTER SKATE,33_615,46213
400,"MONKFISH,GOOSEFISH,ANGLER",27_200,6625


In [11]:
# Rank by the summed FWT per species (SUM_FWT column).
rv.top_species_by_attribute(
    df,
    attribute='FWT',
    aggregation='sum',
)

Unnamed: 0_level_0,NAME,SUM_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,COD(ATLANTIC),176_332_809,119753
11,HADDOCK,134_681_054,369782
16,POLLOCK,63_361_391,38193
220,SPINY DOGFISH,45_114_798,62639
12,WHITE HAKE,35_809_586,49453
201,THORNY SKATE,28_884_245,46668
23,REDFISH UNSEPARATED,27_407_574,179947
40,AMERICAN PLAICE,26_408_410,168269
204,WINTER SKATE,24_150_224,46213
30,HALIBUT(ATLANTIC),18_454_458,5896


In [13]:
# Maximum FWT per species with date_min='2000' and min_species=1000.
rv.top_species_by_attribute(
    df,
    attribute='FWT',
    aggregation='max',
    date_min='2000',
    min_species=1000,
)

Unnamed: 0_level_0,NAME,MAX_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,HALIBUT(ATLANTIC),92_490,5896
204,WINTER SKATE,33_615,46213
10,COD(ATLANTIC),24_750,119753
12,WHITE HAKE,20_200,49453
200,BARNDOOR SKATE,16_040,1853
400,"MONKFISH,GOOSEFISH,ANGLER",14_330,6625
15,CUSK,13_840,2877
50,STRIPED ATLANTIC WOLFFISH,13_000,7274
201,THORNY SKATE,10_880,46668
16,POLLOCK,10_710,38193


In [16]:
# Maximum FWT per species between date_min='2000' and date_max='2001'.
# Resolved question: was the date filter being undone by the count filter?
#   Yes -- recorded as fixed.
# TODO: maybe default min_species to None (to avoid confusion).
rv.top_species_by_attribute(
    df,
    attribute='FWT',
    aggregation='max',
    date_min='2000',
    date_max='2001',
)

Unnamed: 0_level_0,NAME,MAX_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,HALIBUT(ATLANTIC),49_300,5896
10,COD(ATLANTIC),24_750,119753
12,WHITE HAKE,20_150,49453
50,STRIPED ATLANTIC WOLFFISH,9_810,7274
16,POLLOCK,9_340,38193
400,"MONKFISH,GOOSEFISH,ANGLER",8_000,6625
51,SPOTTED WOLFFISH,7_340,47
204,WINTER SKATE,6_815,46213
201,THORNY SKATE,5_940,46668
11,HADDOCK,5_200,369782


In [15]:
# Maximum FWT per species with min_species=100.
rv.top_species_by_attribute(
    df,
    attribute='FWT',
    aggregation='max',
    min_species=100,
)

Unnamed: 0_level_0,NAME,MAX_FWT,COUNTS
SPEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,HALIBUT(ATLANTIC),92_490,5896
10,COD(ATLANTIC),38_000,119753
204,WINTER SKATE,33_615,46213
400,"MONKFISH,GOOSEFISH,ANGLER",27_200,6625
12,WHITE HAKE,25_760,49453
200,BARNDOOR SKATE,19_000,1853
16,POLLOCK,17_000,38193
50,STRIPED ATLANTIC WOLFFISH,16_000,7274
201,THORNY SKATE,14_000,46668
15,CUSK,14_000,2877
