In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# apply seaborn's default theme to the plots in this notebook
sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# NOTE(review): the HTML payload here was an empty string, which renders nothing.
# The rule below is the usual classic-Notebook full-width override — confirm it is
# the one that was intended (it has no effect in JupyterLab).
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
# pandas formatting: floats rendered with 3 decimals, no cap on the number of
# displayed columns, and up to 200 rows shown before truncation
pd.set_option(
    'display.float_format', '{:.3f}'.format,
    'display.max_columns', None,
    'display.max_rows', 200,
)

In [2]:
# Load the length-frequency master CSV into df_LF.
# The path uses forward slashes so it resolves on Windows as well as on
# Linux/macOS; a backslash-separated path only works on Windows.
df_LF = pd.read_csv('./provided data/Margaree Gaspereau lgth-freq_Master.csv')

In [3]:
# Column dtypes as inferred by read_csv
df_LF.dtypes

yy          int64
mm          int64
dd          int64
Time      float64
river      object
week        int64
site       object
loc        object
period     object
wt_lbs    float64
wt_kg     float64
lgth        int64
freq        int64
Flbin       int64
dtype: object

In [4]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each source column becomes one row of the summary
df_LF.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
yy,11440.0,,,,2003.929,9.465,1990.0,1995.0,2003.0,2013.0,2019.0
mm,11440.0,,,,5.427,0.495,5.0,5.0,5.0,6.0,6.0
dd,11440.0,,,,16.41,8.882,1.0,9.0,17.0,24.0,31.0
Time,1457.0,,,,1030.411,531.799,3.0,900.0,1115.0,1425.0,1850.0
river,10222.0,3.0,SW MARGAREE,6240.0,,,,,,,
week,11440.0,,,,4.908,1.909,1.0,4.0,5.0,6.0,9.0
site,11371.0,91.0,26,2402.0,,,,,,,
loc,11358.0,4.0,LOWER,6406.0,,,,,,,
period,11440.0,3.0,AM,6143.0,,,,,,,
wt_lbs,10222.0,,,,102.868,34.969,1.0,81.7,100.0,127.0,204.0


# Datetime

In [5]:
# Distinct values recorded in the Time column (used to see which formats occur)
df_LF.Time.unique()

array([  nan,    3.,    4.,    5., 1305., 1530., 1025., 1550., 1255.,
       1405., 1455., 1310., 1220., 1115., 1640., 1355., 1320., 1400.,
       1225., 1035., 1010., 1545., 1425., 1100., 1450., 1540., 1430.,
       1030., 1345., 1630., 1050.,  850.,  925., 1445.,  940.,  900.,
        840., 1500.,  905., 1713.,  835.,  845., 1000., 1145., 1015.,
       1625., 1745., 1315., 1655., 1020., 1210., 1755., 1215., 1110.,
       1605., 1130., 1235., 1300., 1200., 1850., 1700.])

In [7]:
# Build the DATETIME column from yy/mm/dd plus the numeric Time column.
#
# Time holds HHMM-style numbers (e.g. 1305.0). Values below 10 (3, 4, 5 in this
# data) are treated as bare hours and scaled to HHMM (3 -> 300).
#
# Hour and minute are derived arithmetically instead of parsing the float column
# with format='%H%M': that parse depends on how a given pandas version turns
# floats into text ('1305.0' does not match '%H%M'), and it needed a temporary
# column that had to be dropped again afterwards.
BARE_HOUR_CUTOFF = 10  # Time values below this are bare hours, not HHMM

raw_time = df_LF['Time']
hhmm = raw_time.mask(raw_time < BARE_HOUR_CUTOFF, raw_time * 100)
hour = hhmm // 100
minute = hhmm % 100

# Fail loudly on impossible clock values, as strict '%H%M' parsing would.
# Missing times are not an error; they simply stay missing.
invalid_time = hhmm.notna() & ~(
    hour.between(0, 23) & minute.between(0, 59) & (hhmm % 1 == 0)
)
if invalid_time.any():
    raise ValueError(
        f"Time values are not valid HHMM: {sorted(raw_time[invalid_time].unique())}"
    )

# Rows without a Time have NaN hour/minute, so their DATETIME is NaT even though
# the calendar date itself is known.
df_LF['DATETIME'] = pd.to_datetime(dict(
    year=df_LF['yy'],
    month=df_LF['mm'],
    day=df_LF['dd'],
    hour=hour,
    minute=minute,
), errors='coerce')

In [8]:
# Summarise DATETIME as a numeric quantity (count, mean, min, quartiles, max).
# pandas 2.0 removed the `datetime_is_numeric` keyword because datetimes are
# always summarised this way there, so fall back to the plain call when the
# keyword is rejected.
try:
    datetime_summary = df_LF['DATETIME'].describe(datetime_is_numeric=True)
except TypeError:
    datetime_summary = df_LF['DATETIME'].describe()
datetime_summary

count                             1457
mean     2007-02-28 06:23:05.353466112
min                1991-05-12 03:00:00
25%                2009-05-23 15:50:00
50%                2010-05-22 09:25:00
75%                2011-05-17 10:25:00
max                2012-06-21 17:00:00
Name: DATETIME, dtype: object

In [9]:
# share of rows whose DATETIME is missing, as a fraction between 0 and 1
df_LF['DATETIME'].isna().mean()

0.8726398601398602

In [10]:
# check to make sure hours are good: show the raw Time next to the hour and
# minute that ended up in DATETIME, for the rows that have a Time at all.
# The derived columns are renamed so the result does not carry two columns
# that are both labelled 'DATETIME'.
has_time = df_LF['Time'].notna()
pd.concat([
    df_LF['Time'],
    df_LF['DATETIME'].dt.hour.rename('hour'),
    df_LF['DATETIME'].dt.minute.rename('minute'),
], axis=1)[has_time]

Unnamed: 0,Time,DATETIME,DATETIME.1
948,3.000,3.000,0.000
949,3.000,3.000,0.000
950,3.000,3.000,0.000
951,3.000,3.000,0.000
952,3.000,3.000,0.000
...,...,...,...
7815,1700.000,17.000,0.000
7816,1700.000,17.000,0.000
7817,1700.000,17.000,0.000
7818,1700.000,17.000,0.000


In [11]:
# rows that have a recorded Time but still ended up without a DATETIME
# (an empty result means every recorded time was converted)
time_recorded = df_LF['Time'].notna()
datetime_missing = df_LF['DATETIME'].isna()
df_LF[time_recorded & datetime_missing]

Unnamed: 0,yy,mm,dd,Time,river,week,site,loc,period,wt_lbs,wt_kg,lgth,freq,Flbin,DATETIME


In [12]:
# Inspect the record at position 948, one of the rows whose Time is the bare value 3.0
df_LF.iloc[948]

yy                         1991
mm                            5
dd                           12
Time                      3.000
river                  MARGAREE
week                          3
site                         12
loc                       LOWER
period                       AM
wt_lbs                    1.000
wt_kg                     0.500
lgth                        265
freq                          1
Flbin                       265
DATETIME    1991-05-12 03:00:00
Name: 948, dtype: object