# import modules

In [80]:
# imports
import pandas as pd
import numpy as np
import filecmp

# full-width display
# `display` / `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# import and organise data

In [81]:
# One directory of csv exports per season.
# The order is significant: later cells index file_paths by position and
# all_files is built by iterating over it.
file_paths = ['4VSW_csv', 'FALL_csv', 'SPRING_csv', 'SUMMER_csv']

# individual names kept for convenience
path_4VSW, path_FALL, path_SPRING, path_SUMMER = file_paths

In [82]:
# compare data dictionaries
dct_4vs = f'./{file_paths[0]}/DataDictionary_202008.xlsx'
dct_fal = f'./{file_paths[1]}/DataDictionary_202008.xlsx'
dct_spr = f'./{file_paths[2]}/DataDictionary_202008.xlsx'
dct_sum = f'./{file_paths[3]}/DataDictionary_202008.xlsx'

# shallow=False forces a comparison of the file contents. With the default
# (shallow=True) filecmp treats two files as equal as soon as their os.stat()
# signatures (type, size, modification time) match, without reading them.
# Comparing every dictionary against the first one is sufficient because
# content equality is transitive.
other_dictionaries = [dct_fal, dct_spr, dct_sum]
if all(filecmp.cmp(dct_4vs, other, shallow=False) for other in other_dictionaries):
    print('All data dictionaries are the same.')
else:
    print('There are differences in the data dictionaries.')

All data dictionaries are the same.


In [83]:
# import data dictionary as dataframe
# (all four copies were compared above, so the 4VSW one is used)
# dropna() discards every row that contains an empty / NaN cell
data_dictionary = pd.read_excel(dct_4vs).dropna()

# data_dictionary.drop('FRANÇAIS', axis=1)

In [84]:
# file names: table name -> file-name suffix used inside each season directory
# NOTE: the GSMISSIONS suffix has no underscore after the year, unlike the others
_suffix_by_table = {
    'GSCAT': '2020_GSCAT.csv',
    'GSDET': '2020_GSDET.csv',
    'GSINF': '2020_GSINF.csv',
    'GSSPECIES': '2020_GSSPECIES.csv',
    'GSMISSIONS': '2020GSMISSIONS.csv',
}

# file_sfx and file_names are parallel lists (same order, same length)
file_sfx = list(_suffix_by_table.values())
file_names = list(_suffix_by_table.keys())

# individual names kept for convenience
GSCAT, GSDET, GSINF, GSSPECIES, GSMISSIONS = file_sfx

In [85]:
# list of all files: every file suffix for the first season directory,
# then every suffix for the second, and so on
# (path[:-4] removes the last four characters of the directory name, i.e. the
# trailing "_csv", to recover the season prefix used in the file names)
all_files = [
    f"./{path}/{path[:-4]}_{file}"
    for path in file_paths
    for file in file_sfx
]

# investigate and tidy data

### helper functions for viewing data

In [86]:
# helper functions to print information about files
# these functions presuppose the naming scheme "file_names", "all_files"

def get_season(file_number):
    """Return the season label of the file at position ``file_number`` in ``all_files``.

    The label is the second '/'-separated component of the path with its last
    four characters removed, e.g. './FALL_csv/FALL_2020_GSCAT.csv' -> 'FALL'.
    """
    season_directory = all_files[file_number].split('/')[1]
    return season_directory[:-4]

def get_filename(file_number):
    """Return the table name (entry of ``file_names``) for position ``file_number`` in ``all_files``.

    ``all_files`` repeats the same sequence of tables for every season, so the
    table is found by wrapping the position around the number of tables. The
    modulus is taken from ``len(file_names)`` rather than a hard-coded 5 so the
    helper stays correct if a table is added or removed.
    """
    return file_names[file_number % len(file_names)]

def print_information(file_number, describe=False):
    """Print a short summary of the csv file at position ``file_number`` in ``all_files``.

    Prints the table name and season, the column names, the shape and the
    ``DataFrame.info()`` report.

    Parameters
    ----------
    file_number : int
        Position of the file in ``all_files``.
    describe : bool, default False
        When True, also print ``DataFrame.describe()``.
    """
    # read the file once and reuse the dataframe for every line of the summary
    df = pd.read_csv(all_files[file_number])

    print(get_filename(file_number), f'({get_season(file_number)})')
    print()
    print('columns:', list(df.columns))
    print('shape:', df.shape)
    print('\ninfo:\n')
    # info() prints its own report and returns None, so wrapping it in print()
    # would only add a stray "None" line
    df.info()
    if describe:
        print('\ndescribe:\n')
        print(df.describe())
    print('\n\n')
    

### helper function for combining seasons

In [87]:
def combine_seasons(list_of_files, dtype=None):
    """
    Read several files from ``all_files`` and stack them into one dataframe.

    requires the all_files / file_names naming structure defined above

    Parameters
    ----------
    list_of_files : iterable of int
        Positions in ``all_files`` of the files to combine.
    dtype : dict or None, default None
        Passed straight to ``pd.read_csv``. Needed for GSDET (SPRING), which
        otherwise raises "DtypeWarning: Columns (5,6) have mixed types".
        An explicit dtype argument was preferred over **kwargs as less
        error-prone.

    Returns
    -------
    pandas.DataFrame
        All rows of the requested files with a fresh 0..n-1 index, plus two
        bookkeeping columns: SEASON (season label taken from the directory
        name) and FILE (table name). An empty dataframe is returned when
        ``list_of_files`` is empty.
    """
    frames = []

    for file_number in list_of_files:
        dftemp = pd.read_csv(all_files[file_number], dtype=dtype)

        # add SEASON and FILE columns (temporarily) to keep track of data origins
        # NOTE: a column already named SEASON or FILE in the csv is overwritten here
        dftemp['SEASON'] = all_files[file_number].split('/')[1][:-4]
        dftemp['FILE'] = file_names[file_number % len(file_names)]

        frames.append(dftemp)

    # pd.concat() raises on an empty list, so keep the "nothing to combine" case explicit
    if not frames:
        return pd.DataFrame()

    # concatenate once instead of growing the dataframe inside the loop
    return pd.concat(frames, ignore_index=True)

### GSSPECIES (species info)

In [88]:
# investigate GSSPECIES
# all_files lists every table for one season before moving to the next season,
# so a given table recurs every len(file_names) positions, once per season
filetype_index = file_names.index('GSSPECIES')
species_files = [filetype_index + len(file_names)*n for n in range(len(file_paths))]

for i in species_files:
    print_information(i)

GSSPECIES (4VSW)

columns: ['SPEC', 'COMM', 'CODE', 'TSN']
shape: (329, 4)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SPEC    329 non-null    object 
 1   COMM    329 non-null    object 
 2   CODE    329 non-null    int64  
 3   TSN     0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 10.4+ KB
None



GSSPECIES (FALL)

columns: ['SPEC', 'COMM', 'CODE', 'TSN']
shape: (121, 4)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SPEC    121 non-null    object 
 1   COMM    121 non-null    object 
 2   CODE    121 non-null    int64  
 3   TSN     0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.9+ KB
None



GSSPECIES (SPRING)

columns: ['SPEC', 'CO

In [89]:
# combine GSSPECIES files
df_species = combine_seasons(species_files)
df_species.shape

(1737, 6)

In [90]:
# check to see if SPEC and CODE always match:
# collect every CODE that does not map to exactly one distinct SPEC
list_of_species = list(df_species.CODE.unique())
inconsistencies = [
    code
    for code in list_of_species
    if len(df_species.loc[df_species.CODE == code, 'SPEC'].unique()) != 1
]

# looks good - all species have consistent codes, can merge and drop SEASON/FILE
inconsistencies

[]

In [91]:
# confirm all species loop works:
# for every CODE with exactly one distinct SPEC, take the first common name listed
list_of_species = list(df_species.CODE.unique())
species_names = []

for species_code in list_of_species:
    rows_for_code = df_species[df_species.CODE == species_code]
    if len(rows_for_code.SPEC.unique()) != 1:
        continue
    species_names.append(rows_for_code.COMM.iloc[0])

# looks good
species_names[:10]

["GRAY'S CUTTHROAT EEL",
 'WOLF EELPOUT',
 'SNIPE EEL',
 'NORTHERN SAND LANCE',
 'STOUT SAWPALATE',
 'FISH DOCTOR',
 'COMMON WOLF EEL',
 'EELPOUT,NEWFOUNDLAND',
 "LAVAL'S EELPOUT",
 'SNAKE BLENNY']

In [92]:
# drop season and file columns and reindex
# errors='ignore' keeps the cell safe to re-run (the columns are already gone
# the second time) without a bare `except` that would hide every other error
df_species = df_species.drop(columns=['SEASON', 'FILE'], errors='ignore')

df_species = df_species.drop_duplicates().sort_values('CODE').reset_index(drop=True)
df_species.head()

Unnamed: 0,SPEC,COMM,CODE,TSN
0,BALISTES CAPRISCUS,GRAY TRIGGERFISH,3,
1,STEPHANOLEPIS HISPIDUS,PLANEHEAD FILEFISH,6,
2,BOTHUS SP.,BOTHUS SP.,8,
3,GADUS MORHUA,COD(ATLANTIC),10,
4,MELANOGRAMMUS AEGLEFINUS,HADDOCK,11,


In [93]:
# check that no codes are duplicated
sum(df_species.CODE.duplicated(keep=False))

0

In [94]:
# every single TSN value is empty
# confirmed in Excel for 4 seasons
sum(df_species.TSN.isna())

880

In [95]:
# drop TSN column
# errors='ignore' makes this a no-op if the cell is run twice, without a bare
# `except` that would hide every other error
df_species = df_species.drop(columns='TSN', errors='ignore')

In [96]:
# reindex on CODE (the CODE column itself is left in place for ease)
# and reorder the columns in the same step
df_species = df_species.set_index('CODE', drop=False)[['CODE', 'COMM', 'SPEC']]
df_species.index.name = 'index'

df_species.head()

Unnamed: 0_level_0,CODE,COMM,SPEC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,3,GRAY TRIGGERFISH,BALISTES CAPRISCUS
6,6,PLANEHEAD FILEFISH,STEPHANOLEPIS HISPIDUS
8,8,BOTHUS SP.,BOTHUS SP.
10,10,COD(ATLANTIC),GADUS MORHUA
11,11,HADDOCK,MELANOGRAMMUS AEGLEFINUS


### GSMISSIONS (Missions and Vessels)

In [97]:
# investigate GSMISSIONS
# all_files lists every table for one season before moving to the next season,
# so a given table recurs every len(file_names) positions, once per season
filetype_index = file_names.index('GSMISSIONS')
mission_files = [filetype_index + len(file_names)*n for n in range(len(file_paths))]

for i in mission_files:
    print_information(i)

GSMISSIONS (4VSW)

columns: ['MISSION', 'VESEL', 'CRUNO', 'YEAR', 'SEASON']
shape: (25, 5)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MISSION  25 non-null     object
 1   VESEL    25 non-null     object
 2   CRUNO    25 non-null     int64 
 3   YEAR     25 non-null     int64 
 4   SEASON   25 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.1+ KB
None



GSMISSIONS (FALL)

columns: ['MISSION', 'VESEL', 'CRUNO', 'YEAR', 'SEASON']
shape: (15, 5)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MISSION  15 non-null     object
 1   VESEL    15 non-null     object
 2   CRUNO    15 non-null     int64 
 3   YEAR     15 non-null     int64 
 4   SEASON   15 non-null     object
dtypes: int64(2), obje

In [98]:
# combine GSMISSIONS files
df_missions = combine_seasons(mission_files)
df_missions.shape

(188, 6)

In [99]:
df_missions.MISSION.value_counts().sort_values().tail()

NED2017020    1
NED2015017    1
NED2016016    1
NED2006036    1
NED1995217    2
Name: MISSION, dtype: int64

In [100]:
df_missions[df_missions.MISSION == 'NED1995217']

Unnamed: 0,MISSION,VESEL,CRUNO,YEAR,SEASON,FILE
9,NED1995217,N,217,1995,4VSW,GSMISSIONS
64,NED1995217,N,217,1995,SPRING,GSMISSIONS


In [101]:
# confirm that date and time are included in GSINF
print_information(2)

GSINF (4VSW)

columns: ['MISSION', 'SETNO', 'SDATE', 'TIME', 'STRAT', 'SLAT', 'SLONG', 'ELAT', 'ELONG', 'DUR', 'DIST', 'SPEED', 'DEPTH', 'SURF_TEMP', 'BOTT_TEMP', 'BOTT_SAL', 'GEARDESC']
shape: (2024, 17)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2024 entries, 0 to 2023
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MISSION    2024 non-null   object 
 1   SETNO      2024 non-null   int64  
 2   SDATE      2024 non-null   object 
 3   TIME       2024 non-null   int64  
 4   STRAT      2024 non-null   int64  
 5   SLAT       2024 non-null   float64
 6   SLONG      2024 non-null   float64
 7   ELAT       1708 non-null   float64
 8   ELONG      1708 non-null   float64
 9   DUR        2024 non-null   int64  
 10  DIST       2024 non-null   float64
 11  SPEED      2024 non-null   float64
 12  DEPTH      2024 non-null   float64
 13  SURF_TEMP  1701 non-null   float64
 14  BOTT_TEMP  1638 non-null   float64


* without the season information, there are no inconsistent data
* since date and time are included in GSINF, it is appropriate to:
    * drop the season and file columns
    * delete the duplicated row(s)

In [102]:
# drop season and file columns, drop duplicated missions
# NOTE: the GSMISSIONS csv files carry their own SEASON column, which
# combine_seasons() overwrote with the directory-derived label; dropping
# SEASON here therefore removes it from the table altogether.
# errors='ignore' keeps the cell safe to re-run without a bare `except`
# that would hide every other error.
df_missions = (
    df_missions
    .drop(columns=['SEASON', 'FILE'], errors='ignore')
    .drop_duplicates()
)

In [103]:
# sort and reset index
df_missions = df_missions.sort_values('MISSION').reset_index(drop=True)

df_missions.head()

Unnamed: 0,MISSION,VESEL,CRUNO,YEAR
0,ATC1970175,A,175,1970
1,ATC1970176,A,176,1970
2,ATC1971188,A,188,1971
3,ATC1971189,A,189,1971
4,ATC1972200,A,200,1972


### GSINF

In [104]:
# investigate GSINF
# all_files lists every table for one season before moving to the next season,
# so a given table recurs every len(file_names) positions, once per season
filetype_index = file_names.index('GSINF')
gsinf_files = [filetype_index + len(file_names)*n for n in range(len(file_paths))]

for i in gsinf_files:
    print_information(i)

GSINF (4VSW)

columns: ['MISSION', 'SETNO', 'SDATE', 'TIME', 'STRAT', 'SLAT', 'SLONG', 'ELAT', 'ELONG', 'DUR', 'DIST', 'SPEED', 'DEPTH', 'SURF_TEMP', 'BOTT_TEMP', 'BOTT_SAL', 'GEARDESC']
shape: (2024, 17)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2024 entries, 0 to 2023
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MISSION    2024 non-null   object 
 1   SETNO      2024 non-null   int64  
 2   SDATE      2024 non-null   object 
 3   TIME       2024 non-null   int64  
 4   STRAT      2024 non-null   int64  
 5   SLAT       2024 non-null   float64
 6   SLONG      2024 non-null   float64
 7   ELAT       1708 non-null   float64
 8   ELONG      1708 non-null   float64
 9   DUR        2024 non-null   int64  
 10  DIST       2024 non-null   float64
 11  SPEED      2024 non-null   float64
 12  DEPTH      2024 non-null   float64
 13  SURF_TEMP  1701 non-null   float64
 14  BOTT_TEMP  1638 non-null   float64


In [105]:
# combine GSINF files
df_gsinf = combine_seasons(gsinf_files)
df_gsinf.shape

(16645, 19)

In [106]:
# need to convert dates and times to pandas datetimes
df_gsinf.dtypes

MISSION       object
SETNO          int64
SDATE         object
TIME           int64
STRAT         object
SLAT         float64
SLONG        float64
ELAT         float64
ELONG        float64
DUR            int64
DIST         float64
SPEED        float64
DEPTH        float64
SURF_TEMP    float64
BOTT_TEMP    float64
BOTT_SAL     float64
GEARDESC      object
SEASON        object
FILE          object
dtype: object

In [107]:
df_gsinf.head()

Unnamed: 0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON,FILE
0,NED1997255,38,1997-03-15,614,402,45.541333,-58.1225,45.525167,-58.087333,30,1.78,3.56,131.67,0.02,1.61,32.67,Western IIA trawl,4VSW,GSINF
1,NED1997255,39,1997-03-15,814,402,45.545667,-57.9125,45.568833,-57.934,30,1.69,3.38,267.0,0.21,7.46,34.858,Western IIA trawl,4VSW,GSINF
2,NED1997255,40,1997-03-15,1005,400,45.573333,-57.796333,45.549167,-57.7755,30,1.71,3.42,409.65,0.16,5.39,34.875,Western IIA trawl,4VSW,GSINF
3,NED1997255,41,1997-03-15,1227,397,45.5335,-57.568833,45.509667,-57.539333,30,1.9,3.8,462.69,0.55,4.98,34.924,Western IIA trawl,4VSW,GSINF
4,NED1997255,42,1997-03-15,1513,400,45.416333,-57.670167,45.394333,-57.642333,30,1.78,3.56,427.94,,,,Western IIA trawl,4VSW,GSINF


In [108]:
df_gsinf[['SDATE', 'TIME']]

Unnamed: 0,SDATE,TIME
0,1997-03-15,614
1,1997-03-15,814
2,1997-03-15,1005
3,1997-03-15,1227
4,1997-03-15,1513
...,...,...
16640,2020-08-06,1036
16641,2020-08-07,30
16642,2020-08-07,218
16643,2020-08-07,651


In [109]:
# getting the proper datetime formatting to convert using pd.to_datetime
# TIME is an integer such as 614 or 1513: the hundreds are the hour,
# the remainder is the minute (left-padded to two digits)
hour_text = (df_gsinf.TIME // 100).astype(str)
minute_text = (df_gsinf.TIME % 100).astype(str).str.pad(width=2, side='left', fillchar='0')

df_gsinf['SDATE'].astype(str) + ' ' + hour_text + ':' + minute_text

0         1997-03-15 6:14
1         1997-03-15 8:14
2        1997-03-15 10:05
3        1997-03-15 12:27
4        1997-03-15 15:13
               ...       
16640    2020-08-06 10:36
16641     2020-08-07 0:30
16642     2020-08-07 2:18
16643     2020-08-07 6:51
16644    2020-08-07 10:52
Length: 16645, dtype: object

In [110]:
# index values using pandas datetime formatting
# build "<SDATE> <hour>:<minute>" text for every row, then parse it
timestamp_text = (
    df_gsinf['SDATE'].astype(str)
    + ' '
    + (df_gsinf.TIME // 100).astype(str)
    + ':'
    + (df_gsinf.TIME % 100).astype(str).str.pad(width=2, side='left', fillchar='0')
)
df_gsinf.index = pd.to_datetime(timestamp_text)
df_gsinf.index.name = 'date and time'

In [111]:
# sort by index (date and time)
df_gsinf.sort_index(inplace=True)
df_gsinf.head(10)

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON,FILE
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1970-07-06 18:25:00,ATC1970175,1,1970-07-06,1825,492,44.8,-66.7,,,30,2.3,4.6,85.95,8.7,6.78,32.25,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-07 01:45:00,ATC1970175,3,1970-07-07,145,492,44.983333,-65.916667,,,30,1.5,3.0,106.07,10.7,7.13,32.39,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-07 06:25:00,ATC1970175,4,1970-07-07,625,494,45.2,-65.416667,,,30,2.5,5.0,58.52,,8.69,31.26,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-07 15:25:00,ATC1970175,6,1970-07-07,1525,494,45.3,-65.15,,,30,1.7,3.4,58.52,10.4,9.37,30.99,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-07 18:15:00,ATC1970175,7,1970-07-07,1815,495,45.266667,-64.883333,,,30,1.0,2.0,51.21,10.7,6.72,30.91,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-07 22:45:00,ATC1970175,2,1970-07-07,2245,493,45.066667,-66.2,,,30,2.6,5.2,78.64,8.9,6.72,32.1,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-08 02:15:00,ATC1970175,9,1970-07-08,215,490,44.833333,-65.816667,,,30,1.5,3.0,91.44,12.1,6.81,32.39,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-08 06:45:00,ATC1970175,10,1970-07-08,645,492,44.716667,-66.466667,,,30,1.8,3.6,179.22,11.0,6.29,33.22,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-08 13:15:00,ATC1970175,12,1970-07-08,1315,491,44.483333,-66.4,,,30,1.6,3.2,186.54,9.6,6.42,33.4,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-08 16:55:00,ATC1970175,13,1970-07-08,1655,490,44.133333,-66.566667,,,30,1.7,3.4,89.61,10.1,7.61,32.88,Yankee #36 otter trawl,SUMMER,GSINF


In [112]:
# now we have datetime functionality with the dataframe
# https://pandas.pydata.org/docs/reference/arrays.html#datetime-data

# EXAMPLE: all GSINF for missions on '1970-07-10'
target_day = pd.to_datetime('1970-07-10').date()
on_target_day = df_gsinf.index.date == target_day
df_gsinf[on_target_day]

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON,FILE
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1970-07-10 03:05:00,ATC1970175,21,1970-07-10,305,483,42.466667,-67.333333,,,30,1.7,3.4,343.81,15.3,6.71,34.76,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-10 07:15:00,ATC1970175,22,1970-07-10,715,483,42.483333,-66.7,,,30,2.2,4.4,320.04,16.1,6.9,34.81,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-10 10:55:00,ATC1970175,23,1970-07-10,1055,482,42.45,-66.283333,,,30,1.8,3.6,246.89,13.9,7.11,34.69,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-10 14:25:00,ATC1970175,24,1970-07-10,1425,480,42.733333,-66.1,,,30,1.8,3.6,67.67,9.9,7.02,32.61,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-10 20:15:00,ATC1970175,26,1970-07-10,2015,480,42.6,-65.866667,,,30,2.0,4.0,84.12,13.1,5.6,32.57,Yankee #36 otter trawl,SUMMER,GSINF
1970-07-10 22:05:00,ATC1970175,20,1970-07-10,2205,482,42.633333,-66.833333,,,30,1.8,3.6,213.97,14.9,6.92,34.56,Yankee #36 otter trawl,SUMMER,GSINF


In [113]:
# UPDATED TO KEEP SEASON for DATABASE STEP (notebook 2)
## GSDET has a few null matches with GSINF
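## using an outer join keeps those unmatched rows (presumably - the join itself is done in notebook 2)

# there are no duplicates even when 'SEASON' and 'FILE' are ignored, so dropping them would be safe ('SEASON' is nevertheless kept, see next cell)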
sum(df_gsinf.drop(['SEASON', 'FILE'], axis=1).duplicated(keep=False))

0

In [114]:
# drop 'FILE' column (NOT 'SEASON')
# NOTE: not reindexed because the index is the datetime of the entry
# errors='ignore' makes this a no-op if the cell is run twice, without a bare
# `except` that would hide every other error
df_gsinf = df_gsinf.drop(columns='FILE', errors='ignore')

df_gsinf.head()

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1970-07-06 18:25:00,ATC1970175,1,1970-07-06,1825,492,44.8,-66.7,,,30,2.3,4.6,85.95,8.7,6.78,32.25,Yankee #36 otter trawl,SUMMER
1970-07-07 01:45:00,ATC1970175,3,1970-07-07,145,492,44.983333,-65.916667,,,30,1.5,3.0,106.07,10.7,7.13,32.39,Yankee #36 otter trawl,SUMMER
1970-07-07 06:25:00,ATC1970175,4,1970-07-07,625,494,45.2,-65.416667,,,30,2.5,5.0,58.52,,8.69,31.26,Yankee #36 otter trawl,SUMMER
1970-07-07 15:25:00,ATC1970175,6,1970-07-07,1525,494,45.3,-65.15,,,30,1.7,3.4,58.52,10.4,9.37,30.99,Yankee #36 otter trawl,SUMMER
1970-07-07 18:15:00,ATC1970175,7,1970-07-07,1815,495,45.266667,-64.883333,,,30,1.0,2.0,51.21,10.7,6.72,30.91,Yankee #36 otter trawl,SUMMER


### GSCAT 

In [115]:
# investigate GSCAT
# all_files lists every table for one season before moving to the next season,
# so a given table recurs every len(file_names) positions, once per season
filetype_index = file_names.index('GSCAT')
gscat_files = [filetype_index + len(file_names)*n for n in range(len(file_paths))]

for i in gscat_files:
    print_information(i)

GSCAT (4VSW)

columns: ['MISSION', 'SETNO', 'SPEC', 'TOTWGT', 'TOTNO']
shape: (26018, 5)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26018 entries, 0 to 26017
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   MISSION  26018 non-null  object 
 1   SETNO    26018 non-null  int64  
 2   SPEC     26018 non-null  int64  
 3   TOTWGT   26018 non-null  float64
 4   TOTNO    26018 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 1016.5+ KB
None



GSCAT (FALL)

columns: ['MISSION', 'SETNO', 'SPEC', 'TOTWGT', 'TOTNO']
shape: (11010, 5)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11010 entries, 0 to 11009
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MISSION  11010 non-null  object
 1   SETNO    11010 non-null  int64 
 2   SPEC     11010 non-null  int64 
 3   TOTWGT   11010 non-null  int64 
 4   TOTNO    11010 non-null 

In [116]:
# combine GSCAT files
df_gscat = combine_seasons(gscat_files)
df_gscat.shape

(232220, 7)

In [117]:
df_gscat.sort_values(['MISSION', 'SETNO']).head(10)

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO,SEASON,FILE
92804,ATC1970175,1,10,59.0,12,SUMMER,GSCAT
98347,ATC1970175,1,11,10.0,4,SUMMER,GSCAT
104229,ATC1970175,1,12,19.0,19,SUMMER,GSCAT
110099,ATC1970175,1,14,0.0,4,SUMMER,GSCAT
125894,ATC1970175,1,40,6.0,17,SUMMER,GSCAT
143380,ATC1970175,1,60,7.0,18,SUMMER,GSCAT
147365,ATC1970175,1,62,0.0,1,SUMMER,GSCAT
148953,ATC1970175,1,70,0.0,1,SUMMER,GSCAT
154714,ATC1970175,1,201,6.0,2,SUMMER,GSCAT
162606,ATC1970175,1,220,14.0,7,SUMMER,GSCAT


In [118]:
# check for duplicates
sum(df_gscat.drop(['SEASON', 'FILE'], axis=1).duplicated(keep=False))

0

In [119]:
df_gscat.head()

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO,SEASON,FILE
0,NED1988098,1,10,13.0,17,4VSW,GSCAT
1,NED1989117,1,10,5.0,65,4VSW,GSCAT
2,NED1992166,1,10,4.0,7,4VSW,GSCAT
3,NED1993182,1,10,1.0,10,4VSW,GSCAT
4,NED1999872,1,10,0.115,2,4VSW,GSCAT


In [120]:
# drop 'SEASON' and 'FILE' and reset index
# DataFrame.drop(..., inplace=True) returns None, so chaining .reset_index()
# onto it always raised an AttributeError that the bare `except` swallowed:
# the columns were dropped but the index was never reset. Dropping without
# inplace and re-assigning makes the chain work; errors='ignore' keeps the
# cell safe to re-run.
df_gscat = (
    df_gscat
    .drop(columns=['SEASON', 'FILE'], errors='ignore')
    .reset_index(drop=True)
)

df_gscat.head()

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO
0,NED1988098,1,10,13.0,17
1,NED1989117,1,10,5.0,65
2,NED1992166,1,10,4.0,7
3,NED1993182,1,10,1.0,10
4,NED1999872,1,10,0.115,2


### GSDET

In [121]:
# investigate GSDET
# all_files lists every table for one season before moving to the next season,
# so a given table recurs every len(file_names) positions, once per season
filetype_index = file_names.index('GSDET')
gsdet_files = [filetype_index + len(file_names)*n for n in range(len(file_paths))]

for i in gsdet_files:
    print_information(i)

GSDET (4VSW)

columns: ['MISSION', 'SETNO', 'SPEC', 'FLEN', 'FWT', 'MATURITY', 'SEX', 'AGE', 'SPECIMEN_ID']
shape: (187974, 9)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187974 entries, 0 to 187973
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   MISSION      187974 non-null  object 
 1   SETNO        187974 non-null  int64  
 2   SPEC         187974 non-null  int64  
 3   FLEN         187974 non-null  int64  
 4   FWT          96207 non-null   float64
 5   MATURITY     18922 non-null   object 
 6   SEX          148096 non-null  object 
 7   AGE          14053 non-null   float64
 8   SPECIMEN_ID  0 non-null       float64
dtypes: float64(3), int64(3), object(3)
memory usage: 12.9+ MB
None



GSDET (FALL)

columns: ['MISSION', 'SETNO', 'SPEC', 'FLEN', 'FWT', 'MATURITY', 'SEX', 'AGE', 'SPECIMEN_ID']
shape: (129685, 9)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129685 entries, 0 to 12968

  print_information(i)


columns: ['MISSION', 'SETNO', 'SPEC', 'FLEN', 'FWT', 'MATURITY', 'SEX', 'AGE', 'SPECIMEN_ID']
shape: (594518, 9)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594518 entries, 0 to 594517
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   MISSION      594518 non-null  object 
 1   SETNO        594518 non-null  int64  
 2   SPEC         594518 non-null  int64  
 3   FLEN         594518 non-null  int64  
 4   FWT          227982 non-null  float64
 5   MATURITY     118806 non-null  object 
 6   SEX          385788 non-null  object 
 7   AGE          81247 non-null   float64
 8   SPECIMEN_ID  200925 non-null  float64
dtypes: float64(3), int64(3), object(3)
memory usage: 40.8+ MB
None



GSDET (SUMMER)

columns: ['MISSION', 'SETNO', 'SPEC', 'FLEN', 'FWT', 'MATURITY', 'SEX', 'AGE', 'SPECIMEN_ID']
shape: (1385852, 9)

info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1385852 entries, 0 to 1385851
Data co

##### GSDET (SPRING): DtypeWarning: Columns (5,6) have mixed types.Specify dtype option on import or set low_memory=False.
    need to add a dtype kwarg to the combine_seasons() function
    could use **kwargs, but dtype={} seems less prone to error

In [122]:
# combine GSDET files
df_gsdet = combine_seasons(gsdet_files, dtype={'MATURITY':object, 'SEX':object})
df_gsdet.shape

(2298029, 11)

In [123]:
df_gsdet.head(10)

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON,FILE
0,NED1999872,90,14,15.0,19.0,,UNKNOWN,,,4VSW,GSDET
1,NED1995217,108,10,43.0,,,UNKNOWN,,,4VSW,GSDET
2,NED1986060,45,11,51.0,1300.0,,UNKNOWN,,,4VSW,GSDET
3,NED1986060,7,11,46.0,1000.0,,UNKNOWN,,,4VSW,GSDET
4,NED1987078,82,40,11.0,,,UNKNOWN,,,4VSW,GSDET
5,NED1987078,51,300,26.0,,,UNKNOWN,,,4VSW,GSDET
6,NED1994201,96,11,42.0,,,UNKNOWN,,,4VSW,GSDET
7,NED1997255,33,41,8.0,2.0,,UNKNOWN,,,4VSW,GSDET
8,NED2000966,10,41,14.0,,,UNKNOWN,,,4VSW,GSDET
9,NED1989117,36,60,33.0,,,UNKNOWN,,,4VSW,GSDET


In [124]:
# check for duplicates, ignoring the SEASON and FILE columns
sum(df_gsdet.drop(['SEASON', 'FILE'], axis=1).duplicated(keep=False))

151158

In [125]:
# check for duplicates with the SEASON and FILE columns included
sum(df_gsdet.duplicated(keep=False))

151158

### Many repeated rows... why? 
##### BEST GUESS: these are not duplicates, they are samples with the same characteristics

##### checked this, no matches:
    check individual files for repeated rows
        maybe these coincide with joins to other tables (database not completely normalised?)
            if so, deleting all of the repeats and then joining properly should result in the same number of total rows

##### how many duplicates in each file?

In [126]:
# first GSDET file on its own: number of rows that have an exact copy in the same file
gsdet_dtypes = {'MATURITY': object, 'SEX': object}
gsdet0 = pd.read_csv(all_files[gsdet_files[0]], dtype=gsdet_dtypes)
int(gsdet0.duplicated(keep=False).sum())

719

In [127]:
# second GSDET file on its own: number of rows that have an exact copy in the same file
gsdet_dtypes = {'MATURITY': object, 'SEX': object}
gsdet1 = pd.read_csv(all_files[gsdet_files[1]], dtype=gsdet_dtypes)
int(gsdet1.duplicated(keep=False).sum())

10975

In [128]:
# third GSDET file on its own: number of rows that have an exact copy in the same file
gsdet_dtypes = {'MATURITY': object, 'SEX': object}
gsdet2 = pd.read_csv(all_files[gsdet_files[2]], dtype=gsdet_dtypes)
int(gsdet2.duplicated(keep=False).sum())

45111

In [129]:
# fourth GSDET file on its own: number of rows that have an exact copy in the same file
gsdet_dtypes = {'MATURITY': object, 'SEX': object}
gsdet3 = pd.read_csv(all_files[gsdet_files[3]], dtype=gsdet_dtypes)
int(gsdet3.duplicated(keep=False).sum())

94353

##### what is getting duplicated?

In [130]:
df_gsdet[df_gsdet.duplicated(keep=False)].sort_values(df_gsdet.columns.tolist()).head(20)

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON,FILE
994886,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1004723,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1044091,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1047370,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1018839,ATC1970175,2,10,61.0,,,UNKNOWN,,,SUMMER,GSDET
1036336,ATC1970175,2,10,61.0,,,UNKNOWN,,,SUMMER,GSDET
1516956,ATC1970175,2,14,26.0,100.0,Ripening 1,FEMALE,,,SUMMER,GSDET
1778172,ATC1970175,2,14,26.0,100.0,Ripening 1,FEMALE,,,SUMMER,GSDET
952304,ATC1970175,2,41,52.0,1100.0,,UNKNOWN,,,SUMMER,GSDET
965084,ATC1970175,2,41,52.0,1100.0,,UNKNOWN,,,SUMMER,GSDET


##### Which other file duplicates the same entries?
    * not SPECIES
    * not MISSIONS
    * not GSINF
    * not GSCAT

In [131]:
# NOT SPECIES
df_species[df_species.CODE == 60]

Unnamed: 0_level_0,CODE,COMM,SPEC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
60,60,HERRING(ATLANTIC),CLUPEA HARENGUS


In [132]:
# NOT MISSIONS
df_missions[df_missions.MISSION == 'ATC1970175']

Unnamed: 0,MISSION,VESEL,CRUNO,YEAR
0,ATC1970175,A,175,1970


In [133]:
# NOT GSINF

df_gsinf[(df_gsinf['MISSION'] == 'ATC1970175') & (df_gsinf['SETNO'] == 1)]

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1970-07-06 18:25:00,ATC1970175,1,1970-07-06,1825,492,44.8,-66.7,,,30,2.3,4.6,85.95,8.7,6.78,32.25,Yankee #36 otter trawl,SUMMER


In [134]:
# GSCAT might be the answer (spoiler: no)

df_gscat[(df_gscat['MISSION'] == 'ATC1970175') & (df_gscat['SETNO'] == 1)]

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO
92804,ATC1970175,1,10,59.0,12
98347,ATC1970175,1,11,10.0,4
104229,ATC1970175,1,12,19.0,19
110099,ATC1970175,1,14,0.0,4
125894,ATC1970175,1,40,6.0,17
143380,ATC1970175,1,60,7.0,18
147365,ATC1970175,1,62,0.0,1
148953,ATC1970175,1,70,0.0,1
154714,ATC1970175,1,201,6.0,2
162606,ATC1970175,1,220,14.0,7


In [135]:
# why are there 4 repeated rows in GSDET, but 14 in GSCAT?
df_gscat[(df_gscat['MISSION'] == 'ATC1970175') & (df_gscat['SETNO'] == 1)].shape

(14, 5)

In [136]:
# check GSCAT also filtering by species
df_gscat[(df_gscat['MISSION'] == 'ATC1970175') & (df_gscat['SETNO'] == 1) & (df_gscat['SPEC'] == 60)]

# NOT GSCAT

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO
143380,ATC1970175,1,60,7.0,18


In [137]:
# how many rows of GSDET match the above query?
df_gsdet[(df_gsdet['MISSION'] == 'ATC1970175') & (df_gsdet['SETNO'] == 1)].shape

(89, 11)

In [138]:
df_gsdet[(df_gsdet['MISSION'] == 'ATC1970175') & (df_gsdet['SETNO'] == 1)]

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON,FILE
942393,ATC1970175,1,60,30.0,500.0,,UNKNOWN,,,SUMMER,GSDET
967115,ATC1970175,1,62,29.0,,,UNKNOWN,,,SUMMER,GSDET
976703,ATC1970175,1,621,24.0,60.0,,UNKNOWN,,,SUMMER,GSDET
981865,ATC1970175,1,60,30.0,460.0,,UNKNOWN,,,SUMMER,GSDET
986462,ATC1970175,1,320,34.0,860.0,,UNKNOWN,,,SUMMER,GSDET
...,...,...,...,...,...,...,...,...,...,...,...
1652413,ATC1970175,1,12,91.0,7100.0,,FEMALE,,,SUMMER,GSDET
1698089,ATC1970175,1,40,38.0,475.0,Resting,FEMALE,6.0,,SUMMER,GSDET
1803350,ATC1970175,1,11,67.0,2800.0,Resting,FEMALE,7.0,,SUMMER,GSDET
1835302,ATC1970175,1,220,55.0,,,FEMALE,,,SUMMER,GSDET


In [139]:
df_gsdet[(df_gsdet['MISSION'] == 'ATC1970175') & (df_gsdet['SETNO'] == 1) & (df_gsdet['SPEC'] == 60)]

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON,FILE
942393,ATC1970175,1,60,30.0,500.0,,UNKNOWN,,,SUMMER,GSDET
981865,ATC1970175,1,60,30.0,460.0,,UNKNOWN,,,SUMMER,GSDET
994886,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1004723,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER,GSDET
1004730,ATC1970175,1,60,29.0,380.0,,UNKNOWN,,,SUMMER,GSDET
1014568,ATC1970175,1,60,30.0,450.0,,UNKNOWN,,,SUMMER,GSDET
1021232,ATC1970175,1,60,33.0,570.0,,UNKNOWN,,,SUMMER,GSDET
1027684,ATC1970175,1,60,26.0,300.0,,UNKNOWN,,,SUMMER,GSDET
1030972,ATC1970175,1,60,27.0,280.0,,UNKNOWN,,,SUMMER,GSDET
1034248,ATC1970175,1,60,27.0,300.0,,UNKNOWN,,,SUMMER,GSDET


In [140]:
df_gsdet[(df_gsdet['MISSION'] == 'ATC1970175') & (df_gsdet['SETNO'] == 1) & (df_gsdet['SPEC'] == 60)].shape

(18, 11)

### MY HYPOTHESIS:
##### These rows aren't duplicates. There are multiple specimens with identical:
    SPEC	FLEN	FWT	MATURITY	SEX	AGE	SPECIMEN_ID
    
Often, several of these fields are NULL, which makes an exact match between two specimens more likely.

### I.E., THESE ARE NOT DUPLICATE ROWS (they just have the same value)

In [141]:
# final cleaning for GSDET,
# delete 'FILE' column
# KEEP 'SEASON' column for reference
# reindex, but do not delete duplicates

# The presence of 'FILE' marks a dataframe that has not been cleaned yet, so
# the drop / sort / reindex runs only once even if the cell is re-run. An
# explicit check replaces the bare `except`, which would also have hidden
# genuine errors (e.g. a misspelled sort column).
if 'FILE' in df_gsdet.columns:
    df_gsdet = (
        df_gsdet
        .drop(columns='FILE')
        .sort_values(['MISSION', 'SETNO', 'SPEC', 'FLEN', 'FWT'])
        .reset_index(drop=True)
    )

df_gsdet.head()

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON
0,ATC1970175,1,10,28.0,220.0,Immature,MALE,2.0,,SUMMER
1,ATC1970175,1,10,58.0,2000.0,Resting,FEMALE,,,SUMMER
2,ATC1970175,1,10,64.0,3000.0,Immature,FEMALE,,,SUMMER
3,ATC1970175,1,10,68.0,3600.0,Resting,MALE,4.0,,SUMMER
4,ATC1970175,1,10,69.0,3200.0,Resting,MALE,5.0,,SUMMER


In [142]:
# re-run the earlier MISSION / SETNO / SPEC filter on the cleaned frame to check the
# new index numbering and to look for duplicates -- appears to be working correctly
df_gsdet.query("MISSION == 'ATC1970175' and SETNO == 1 and SPEC == 60")

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON
56,ATC1970175,1,60,24.0,230.0,,UNKNOWN,,,SUMMER
57,ATC1970175,1,60,25.0,230.0,,UNKNOWN,,,SUMMER
58,ATC1970175,1,60,26.0,270.0,,UNKNOWN,,,SUMMER
59,ATC1970175,1,60,26.0,300.0,,UNKNOWN,,,SUMMER
60,ATC1970175,1,60,27.0,280.0,,UNKNOWN,,,SUMMER
61,ATC1970175,1,60,27.0,300.0,,UNKNOWN,,,SUMMER
62,ATC1970175,1,60,29.0,380.0,,UNKNOWN,,,SUMMER
63,ATC1970175,1,60,29.0,420.0,,UNKNOWN,,,SUMMER
64,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER
65,ATC1970175,1,60,30.0,440.0,,UNKNOWN,,,SUMMER


In [143]:
# (rows, columns) of GSDET after the final cleaning step
df_gsdet.shape

(2298029, 10)

# EXPORT TABLES

In [144]:
# SPECIES
# index=True is the to_csv default: the frame's index is written as the first CSV column
df_species.to_csv('SPECIES.csv', index=True)

In [145]:
# read the export back and restore the 'index' column as the frame index
testing = pd.read_csv('SPECIES.csv').set_index('index')
testing.head()

Unnamed: 0_level_0,CODE,COMM,SPEC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,3,GRAY TRIGGERFISH,BALISTES CAPRISCUS
6,6,PLANEHEAD FILEFISH,STEPHANOLEPIS HISPIDUS
8,8,BOTHUS SP.,BOTHUS SP.
10,10,COD(ATLANTIC),GADUS MORHUA
11,11,HADDOCK,MELANOGRAMMUS AEGLEFINUS


In [146]:
# in-memory frame, shown for comparison with the re-imported CSV in the previous cell
df_species.head()

Unnamed: 0_level_0,CODE,COMM,SPEC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,3,GRAY TRIGGERFISH,BALISTES CAPRISCUS
6,6,PLANEHEAD FILEFISH,STEPHANOLEPIS HISPIDUS
8,8,BOTHUS SP.,BOTHUS SP.
10,10,COD(ATLANTIC),GADUS MORHUA
11,11,HADDOCK,MELANOGRAMMUS AEGLEFINUS


In [147]:
# MISSIONS
# index=False: only the data columns are written, the row index is left out of the file
df_missions.to_csv('MISSIONS.csv', index=False)  # do not keep index

In [148]:
# read the export back (no index column was written, so pandas assigns a fresh one)
testing = pd.read_csv('MISSIONS.csv')
testing.head()

Unnamed: 0,MISSION,VESEL,CRUNO,YEAR
0,ATC1970175,A,175,1970
1,ATC1970176,A,176,1970
2,ATC1971188,A,188,1971
3,ATC1971189,A,189,1971
4,ATC1972200,A,200,1972


In [149]:
# in-memory frame, shown for comparison with the re-imported CSV in the previous cell
df_missions.head()

Unnamed: 0,MISSION,VESEL,CRUNO,YEAR
0,ATC1970175,A,175,1970
1,ATC1970176,A,176,1970
2,ATC1971188,A,188,1971
3,ATC1971189,A,189,1971
4,ATC1972200,A,200,1972


In [150]:
# GSCAT
# index=False: only the data columns are written, the row index is left out of the file
df_gscat.to_csv('GSCAT.csv', index=False)

In [151]:
# read the export back (no index column was written, so pandas assigns a fresh one)
testing = pd.read_csv('GSCAT.csv')
testing.head()

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO
0,NED1988098,1,10,13.0,17
1,NED1989117,1,10,5.0,65
2,NED1992166,1,10,4.0,7
3,NED1993182,1,10,1.0,10
4,NED1999872,1,10,0.115,2


In [152]:
# in-memory frame, shown for comparison with the re-imported CSV in the previous cell
df_gscat.head()

Unnamed: 0,MISSION,SETNO,SPEC,TOTWGT,TOTNO
0,NED1988098,1,10,13.0,17
1,NED1989117,1,10,5.0,65
2,NED1992166,1,10,4.0,7
3,NED1993182,1,10,1.0,10
4,NED1999872,1,10,0.115,2


In [153]:
# GSINF
# index=True is the to_csv default: the 'date and time' index is written as the first CSV column
df_gsinf.to_csv('GSINF.csv', index=True)

In [154]:
# Read the export back in. CSV stores the index as plain text, so parse it on the
# way in; otherwise it comes back as strings even though it prints the same.
# NOTE(review): assumes df_gsinf is indexed by datetimes -- confirm against the
# cell that builds the 'date and time' index.
testing = pd.read_csv('GSINF.csv', index_col='date and time', parse_dates=['date and time'])
testing.head()

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1970-07-06 18:25:00,ATC1970175,1,1970-07-06,1825,492,44.8,-66.7,,,30,2.3,4.6,85.95,8.7,6.78,32.25,Yankee #36 otter trawl,SUMMER
1970-07-07 01:45:00,ATC1970175,3,1970-07-07,145,492,44.983333,-65.916667,,,30,1.5,3.0,106.07,10.7,7.13,32.39,Yankee #36 otter trawl,SUMMER
1970-07-07 06:25:00,ATC1970175,4,1970-07-07,625,494,45.2,-65.416667,,,30,2.5,5.0,58.52,,8.69,31.26,Yankee #36 otter trawl,SUMMER
1970-07-07 15:25:00,ATC1970175,6,1970-07-07,1525,494,45.3,-65.15,,,30,1.7,3.4,58.52,10.4,9.37,30.99,Yankee #36 otter trawl,SUMMER
1970-07-07 18:15:00,ATC1970175,7,1970-07-07,1815,495,45.266667,-64.883333,,,30,1.0,2.0,51.21,10.7,6.72,30.91,Yankee #36 otter trawl,SUMMER


In [155]:
# in-memory frame, shown for comparison with the re-imported CSV in the previous cell
df_gsinf.head()

Unnamed: 0_level_0,MISSION,SETNO,SDATE,TIME,STRAT,SLAT,SLONG,ELAT,ELONG,DUR,DIST,SPEED,DEPTH,SURF_TEMP,BOTT_TEMP,BOTT_SAL,GEARDESC,SEASON
date and time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1970-07-06 18:25:00,ATC1970175,1,1970-07-06,1825,492,44.8,-66.7,,,30,2.3,4.6,85.95,8.7,6.78,32.25,Yankee #36 otter trawl,SUMMER
1970-07-07 01:45:00,ATC1970175,3,1970-07-07,145,492,44.983333,-65.916667,,,30,1.5,3.0,106.07,10.7,7.13,32.39,Yankee #36 otter trawl,SUMMER
1970-07-07 06:25:00,ATC1970175,4,1970-07-07,625,494,45.2,-65.416667,,,30,2.5,5.0,58.52,,8.69,31.26,Yankee #36 otter trawl,SUMMER
1970-07-07 15:25:00,ATC1970175,6,1970-07-07,1525,494,45.3,-65.15,,,30,1.7,3.4,58.52,10.4,9.37,30.99,Yankee #36 otter trawl,SUMMER
1970-07-07 18:15:00,ATC1970175,7,1970-07-07,1815,495,45.266667,-64.883333,,,30,1.0,2.0,51.21,10.7,6.72,30.91,Yankee #36 otter trawl,SUMMER


In [156]:
# GSDET
# index=False: only the data columns are written, the row index is left out of the file
df_gsdet.to_csv('GSDET.csv', index=False)

In [157]:
# read the export back (no index column was written, so pandas assigns a fresh one)
testing = pd.read_csv('GSDET.csv')
testing.head()

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON
0,ATC1970175,1,10,28.0,220.0,Immature,MALE,2.0,,SUMMER
1,ATC1970175,1,10,58.0,2000.0,Resting,FEMALE,,,SUMMER
2,ATC1970175,1,10,64.0,3000.0,Immature,FEMALE,,,SUMMER
3,ATC1970175,1,10,68.0,3600.0,Resting,MALE,4.0,,SUMMER
4,ATC1970175,1,10,69.0,3200.0,Resting,MALE,5.0,,SUMMER


In [158]:
# in-memory frame, shown for comparison with the re-imported CSV in the previous cell
df_gsdet.head()

Unnamed: 0,MISSION,SETNO,SPEC,FLEN,FWT,MATURITY,SEX,AGE,SPECIMEN_ID,SEASON
0,ATC1970175,1,10,28.0,220.0,Immature,MALE,2.0,,SUMMER
1,ATC1970175,1,10,58.0,2000.0,Resting,FEMALE,,,SUMMER
2,ATC1970175,1,10,64.0,3000.0,Immature,FEMALE,,,SUMMER
3,ATC1970175,1,10,68.0,3600.0,Resting,MALE,4.0,,SUMMER
4,ATC1970175,1,10,69.0,3200.0,Resting,MALE,5.0,,SUMMER
