# Exploratory Data Analysis

In [1]:
%%html
<!-- Keep the text of rendered DataFrame table cells on one line (no wrapping). -->
<style>.dataframe td {white-space: nowrap;}</style>

In [2]:
# import modules
import pandas as pd
import numpy as np

# full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)

# plotting modules
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib as mpl
%matplotlib inline
mpl.rcParams["axes.formatter.min_exponent"] = 20  # no scientific notation in graphs
plt.rcParams['axes.titley'] = 1.0    # y is in axes-relative coordinates.
plt.rcParams['axes.titlepad'] = 10   # pad is in points... default is 6
import seaborn as sns
sns.set_theme()

In [3]:
%%time
# import data and functions
# The data tables are later read as attributes of this module (rv.GSCAT, rv.df, ...).
# %%time reports how long the import takes (about 11 s in the recorded run).
import rv_functions as rv

Wall time: 11.3 s


In [4]:
# rename imported data to remove rv prefix
# The names are deleted from rv after being copied, so re-running this cell finds
# them missing: attribute access raises AttributeError and only the message is
# printed. Catching AttributeError alone (instead of a bare except) keeps any
# other failure visible rather than reporting it as "already completed".
try:
    GSCAT, GSDET, GSINF, MISSIONS, SPECIES, df = rv.GSCAT, rv.GSDET, rv.GSINF, rv.MISSIONS, rv.SPECIES, rv.df
    del rv.GSCAT, rv.GSDET, rv.GSINF, rv.MISSIONS, rv.SPECIES, rv.df
except AttributeError:
    print('This step was already completed')

In [5]:
# list the public names exposed by rv
# (names starting with an underscore and the names 'pd' / 'px' are left out)
list_of_imported_functions = [
    name
    for name in dir(rv)
    if not name.startswith('_') and name not in {'pd', 'px'}
]

list_of_imported_functions

['aggregate_by_geo',
 'average_geo',
 'describe_species',
 'filter_by_min_species',
 'filter_by_species',
 'filter_dates',
 'filtered_monthly',
 'filtered_yearly',
 'format_monthly',
 'format_yearly',
 'get_species',
 'graph_species',
 'map_species',
 'print_species_data',
 'scatterplot_species',
 'search_species_by_name',
 'species_codes_by_percentile',
 'species_counts']

# Explore Species

### Most Common Species

In [6]:
# NOTE: counting SPEC gives the correct number
# it is the same number of rows as the number TOTNO if dropping duplicate SETNO

how_many = 10  # most common species

# Name the counts and the index explicitly instead of renaming a 'SPEC' column:
# what value_counts() calls its result differs between pandas versions
# ('SPEC' before 2.0, 'count' from 2.0 on), so the old rename could silently
# do nothing and the 'Count' column would then be missing.
top_species = (
    df.SPEC.value_counts()
    .head(how_many)
    .rename('Count')
    .rename_axis('Code')
    .to_frame()
)
# look up the species name for every code and place it before the counts
top_species.insert(0, 'Name', top_species.index.to_series().apply(rv.get_species))
top_species

Unnamed: 0_level_0,Name,Count
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
11,HADDOCK,369782
14,SILVER HAKE,251446
23,REDFISH UNSEPARATED,179947
40,AMERICAN PLAICE,168269
10,COD(ATLANTIC),119753
42,YELLOWTAIL FLOUNDER,114973
60,HERRING(ATLANTIC),100350
4511,SHORT-FIN SQUID,71117
300,LONGHORN SCULPIN,69345
220,SPINY DOGFISH,62639


In [7]:
# for i in range(5):
#     species = top_species.index[i]
#     color='DEPTH'
#     date_min=None
#     date_max=None

#     rv.map_species(df, species, color=color, aggregate_data=True, verbose=False, date_min=date_min, date_max=date_max)

### Deepest Species

In [22]:
# mean DEPTH per SPEC code, largest first, limited to the first `how_many` rows
deep_species = (
    df.groupby('SPEC')['DEPTH']
    .mean()
    .sort_values(ascending=False)
    .head(how_many)
    .to_frame()
)

deep_species

Unnamed: 0_level_0,DEPTH
SPEC,Unnamed: 1_level_1
186,1_806
38,1_806
750,1_806
800,1_667
494,1_667
1032,1_667
749,1_650
109,1_640
1052,1_618
1019,1_614


In [52]:
how_many = 10  # deepest species

# Table indexed by species code with the species name, the mean DEPTH and the
# number of rows per species (counts are aligned to the table on the code index).
deep_species = (
    df.groupby('SPEC')['DEPTH']
    .mean()
    .sort_values(ascending=False)
    .head(how_many)
    .rename_axis('Code')
    .to_frame('Mean Depth')
    .assign(
        Name=lambda frame: frame.index.to_series().apply(rv.get_species),
        Counts=df.SPEC.value_counts(),
    )
    [['Name', 'Mean Depth', 'Counts']]
)
deep_species

Unnamed: 0_level_0,Name,Mean Depth,Counts
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
186,TAANINGICHTHYS MINIMUS,1_806,1
38,SWALLOWERS,1_806,1
750,CARDINALFISH,1_806,1
800,POROMITRA CRASSICEPS,1_667,1
494,SCOPELOBERYX ROBUSTUS,1_667,2
1032,BATHYTROCTES MICROLEPIS,1_667,1
749,SLICKHEAD,1_650,48
109,DAINTY MORA,1_640,23
1052,ROULEINA SP.,1_618,2
1019,ILYOPHIS BRUNNEUS,1_614,6


In [53]:
# sanity check: counting the rows for species 749 directly should reproduce
# the Counts value shown for that code in the table above
df.loc[df.SPEC == 749, 'SPEC'].value_counts()

749    48
Name: SPEC, dtype: Int64

In [58]:
# deepest species (min_species = 0, so the result matches the unfiltered table above)

how_many = 10
min_species = 0  # min specimens per species, ie, ignore rare species

temp_df = rv.filter_by_min_species(df, min_species=min_species)

# mean DEPTH per species code in the filtered data, deepest first,
# with the species name and the per-species row count from the full df
deep_species = (
    temp_df.groupby('SPEC')['DEPTH']
    .mean()
    .sort_values(ascending=False)
    .head(how_many)
    .rename_axis('Code')
    .to_frame('Mean Depth')
    .assign(
        Name=lambda frame: frame.index.to_series().apply(rv.get_species),
        Counts=df.SPEC.value_counts(),
    )
    [['Name', 'Mean Depth', 'Counts']]
)
deep_species

Unnamed: 0_level_0,Name,Mean Depth,Counts
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
186,TAANINGICHTHYS MINIMUS,1_806,1
38,SWALLOWERS,1_806,1
750,CARDINALFISH,1_806,1
800,POROMITRA CRASSICEPS,1_667,1
494,SCOPELOBERYX ROBUSTUS,1_667,2
1032,BATHYTROCTES MICROLEPIS,1_667,1
749,SLICKHEAD,1_650,48
109,DAINTY MORA,1_640,23
1052,ROULEINA SP.,1_618,2
1019,ILYOPHIS BRUNNEUS,1_614,6


In [56]:
# deepest species among those with at least `min_species` specimens

how_many = 10
min_species = 100  # min specimens per species, ie, ignore rare species

temp_df = rv.filter_by_min_species(df, min_species=min_species)

# mean DEPTH per species code in the filtered data, deepest first,
# with the species name and the per-species row count from the full df
deep_species = (
    temp_df.groupby('SPEC')['DEPTH']
    .mean()
    .sort_values(ascending=False)
    .head(how_many)
    .rename_axis('Code')
    .to_frame('Mean Depth')
    .assign(
        Name=lambda frame: frame.index.to_series().apply(rv.get_species),
        Counts=df.SPEC.value_counts(),
    )
    [['Name', 'Mean Depth', 'Counts']]
)
deep_species

Unnamed: 0_level_0,Name,Mean Depth,Counts
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1028,HALOSAUROPSIS MACROCHIR,1_614,117
594,"SMOOTHHEAD,AGASSIZ'S",1_263,644
795,BEANS BLUEBACK,1_242,134
526,BATHYLAGUS BERICOIDES,1_223,167
113,BLUE ANTIMORA/HAKE,1_212,231
146,LAMPANYCTUS MACDONALDI,1_142,257
247,LONGNOSE CHIMERA,1_105,142
613,STOUT SAWPALATE,1_068,287
176,GOITRE BLACKSMELT,1_057,138
223,PORTUGUESE SHARK,1_048,203


In [57]:
# deepest common species

how_many = 10  # deepest species
min_species = 1000  # min specimens per species, ie, ignore rare species

temp_df = rv.filter_by_min_species(df, min_species=min_species)

# mean DEPTH per species code in the filtered data, deepest first,
# with the species name and the per-species row count from the full df
deep_species = (
    temp_df.groupby('SPEC')['DEPTH']
    .mean()
    .sort_values(ascending=False)
    .head(how_many)
    .rename_axis('Code')
    .to_frame('Mean Depth')
    .assign(
        Name=lambda frame: frame.index.to_series().apply(rv.get_species),
        Counts=df.SPEC.value_counts(),
    )
    [['Name', 'Mean Depth', 'Counts']]
)
deep_species

Unnamed: 0_level_0,Name,Mean Depth,Counts
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
602,GRAY'S CUTTHROAT EEL,977,1861
414,ROUNDNOSE GRENADIER,904,1225
159,BOA DRAGONFISH,602,1585
221,BLACK DOGFISH,520,7511
150,LANTERNFISH (NS),430,2619
410,MARLIN-SPIKE GRENADIER,426,6099
712,WHITE BARRACUDINA,425,1789
112,LONGFIN HAKE,319,15282
31,"TURBOT,GREENLAND HALIBUT",297,12315
19,OFF-SHORE HAKE,277,1158
