In [1]:
# Load system libraries.
import os
import re
import sys
import collections
import functools
import logging
# Route log records to stdout at INFO level, each prefixed with an HH:MM:SS
# timestamp.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(message)s',
)

# Libraries for data wrangling.
import numpy as np
import pandas as pd
import scipy as sc
import sklearn
import joblib
# Let pandas display up to 100 rows and 100 columns before truncating.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Machine learning.
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, Ridge, Lasso, LinearRegression
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor

from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.model_selection import cross_val_score, KFold

# Visualization libraries.
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
# Draw matplotlib figures inline in the notebook and use the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Optional seaborn styling, currently disabled.
#sns.set_context('poster')
#sns.set(rc={'figure.figsize': (3, 3)})
#sns.set_style('whitegrid')
# Rebind the name `tqdm` from the module to a partial of `tqdm.tqdm` with
# file=sys.stdout, position=0 and leave=True pre-filled.
# NOTE(review): after this line the name `tqdm` no longer refers to the module,
# so this statement only works when the `import tqdm` above runs first.
tqdm = functools.partial(tqdm.tqdm, file=sys.stdout, position=0, leave=True)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Local modules.
# Load IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
# NOTE(review): no local module is imported in this cell.
%load_ext autoreload
%autoreload 2

In [2]:
# Read the break-up records from the CSV and preview the first ten rows.
breakup_data_df = pd.read_csv(filepath_or_buffer='data/NenanaIceClassic_1917-2024.csv')
breakup_data_df.head(n=10)

Unnamed: 0,Year,Decimal Day of Year,Month,Day,Time (AKST),Unnamed: 5,Unnamed: 6
0,1917,120.4792,April,30,11:30,,
1,1918,131.3979,May,11,9:33,,
2,1919,123.6063,May,3,14:33,,
3,1920,132.4486,May,11,10:46,,
4,1921,131.2792,May,11,6:42,,
5,1922,132.5556,May,12,13:20,,
6,1923,129.0833,May,9,2:00,,
7,1924,132.6319,May,11,15:10,,
8,1925,127.7722,May,7,18:32,,
9,1926,116.6688,April,26,16:03,,


In [3]:
# Show the column labels as a plain Python list.
list(breakup_data_df.columns)

['Year',
 'Decimal Day of Year',
 'Month',
 'Day',
 'Time (AKST)',
 'Unnamed: 5',
 'Unnamed: 6']

In [4]:
# Discard the two unnamed trailing columns from the CSV (they show no values in
# the preview above). Both are dropped in one non-mutating call, and
# errors='ignore' means re-running this cell after they are gone is a no-op
# instead of a KeyError.
breakup_data_df = breakup_data_df.drop(
    columns=['Unnamed: 5', 'Unnamed: 6'],
    errors='ignore',
)
breakup_data_df.head()

Unnamed: 0,Year,Decimal Day of Year,Month,Day,Time (AKST)
0,1917,120.4792,April,30,11:30
1,1918,131.3979,May,11,9:33
2,1919,123.6063,May,3,14:33
3,1920,132.4486,May,11,10:46
4,1921,131.2792,May,11,6:42


In [5]:
# Percentage of missing values per column: the mean of the null mask is the
# missing fraction, scaled to percent.
breakup_data_df.isna().mean().mul(100)

Year                   0.0
Decimal Day of Year    0.0
Month                  0.0
Day                    0.0
Time (AKST)            0.0
dtype: float64

In [6]:
# Inspect the column dtypes before any conversion is applied.
breakup_data_df.dtypes

Year                     int64
Decimal Day of Year    float64
Month                   object
Day                      int64
Time (AKST)             object
dtype: object

In [7]:
# Relabel 'Decimal Day of Year' as 'Decimal Time of Year', modifying the frame in place.
breakup_data_df.rename(
    {'Decimal Day of Year': 'Decimal Time of Year'},
    axis='columns',
    inplace=True,
)

In [None]:
# Encode month names as month numbers.
# Only the 'Month' column is touched, so a matching string in any other column
# cannot be rewritten by accident. The explicit cast pins the result to int64
# instead of relying on replace() to infer an integer dtype. A month name that
# is missing from the mapping makes the cast raise, which flags that the
# mapping needs extending. Re-running the cell is safe: integer values match
# none of the string keys.
breakup_data_df['Month'] = (
    breakup_data_df['Month']
    .replace({'April': 4, 'May': 5})
    .astype('int64')
)

In [10]:
# Start the hour column as a copy of the raw clock-time strings; the hour
# component is extracted from them in a later cell.
breakup_data_df['Time in Hours(AKST)'] = breakup_data_df.loc[:, 'Time (AKST)'].copy(deep=True)

In [11]:
# Create the minutes column with None in every row.
# NOTE(review): the next cell assigns this same column again, so this
# placeholder looks redundant — confirm before removing.
breakup_data_df['Time in Minutes(AKST)'] = None

In [12]:
# Seed the minutes column with the raw "H:MM" strings, taken from the untouched
# source column. Copying from 'Time in Hours(AKST)' only worked while that
# column still held strings, i.e. before the hour-parsing cell had run.
# Reading 'Time (AKST)' gives the same values on a top-to-bottom run and stays
# correct when cells are re-executed.
breakup_data_df['Time in Minutes(AKST)'] = breakup_data_df['Time (AKST)'].copy()

In [13]:
# Extract the hour (the part before ':') as int64, reading from the raw
# 'Time (AKST)' strings. The column is no longer rewritten from its own
# previous contents, so re-running this cell gives the same result instead of
# failing on already-parsed integers.
breakup_data_df['Time in Hours(AKST)'] = (
    breakup_data_df['Time (AKST)'].str.split(':').str[0].astype('int64')
)

In [14]:
# Check dtypes after parsing the hour; the minutes column is parsed in the next cell.
breakup_data_df.dtypes

Year                       int64
Decimal Time of Year     float64
Month                      int64
Day                        int64
Time (AKST)               object
Time in Hours(AKST)        int64
Time in Minutes(AKST)     object
dtype: object

In [15]:
# Extract the minutes (the part after ':') as int64, reading from the raw
# 'Time (AKST)' strings. The column is no longer rewritten from its own
# previous contents, so re-running this cell gives the same result instead of
# failing on already-parsed values.
breakup_data_df['Time in Minutes(AKST)'] = (
    breakup_data_df['Time (AKST)'].str.split(':').str[1].astype('int64')
)

In [16]:
# Re-check dtypes now that the minutes column has been converted to integers.
breakup_data_df.dtypes

Year                       int64
Decimal Time of Year     float64
Month                      int64
Day                        int64
Time (AKST)               object
Time in Hours(AKST)        int64
Time in Minutes(AKST)      int64
dtype: object

In [17]:
# Express the minutes as a fraction of an hour (minutes / 60).
# The value is recomputed from the raw 'Time (AKST)' strings rather than by
# dividing the column by 60 in place, which would silently divide again on
# every re-run of this cell.
breakup_data_df['Time in Minutes(AKST)'] = (
    breakup_data_df['Time (AKST)'].str.split(':').str[1].astype('int64') / 60
)

In [18]:
# The column holds minutes divided by 60 at this point, so give it a name that says so.
breakup_data_df = breakup_data_df.rename(
    {'Time in Minutes(AKST)': 'Minutes in Decimal(AKST)'},
    axis='columns',
)

In [20]:
# Tally how often each distinct value occurs in every column, keyed by column
# name, then print one report per column.
freq = dict(
    (col, breakup_data_df[col].value_counts())
    for col in breakup_data_df.columns
)
for col in freq:
    counts = freq[col]
    print(f"Freq counts for col '{col}':\n{counts}\n")

Freq counts for col 'Year':
Year
1917    1
1985    1
1996    1
1995    1
1994    1
       ..
1949    1
1948    1
1947    1
1946    1
2024    1
Name: count, Length: 108, dtype: int64

Freq counts for col 'Decimal Time of Year':
Decimal Time of Year
120.4792    1
131.6083    1
126.5222    1
116.5569    1
119.9590    1
           ..
134.5271    1
134.4674    1
123.7451    1
125.6944    1
118.2208    1
Name: count, Length: 108, dtype: int64

Freq counts for col 'Month':
Month
5    70
4    38
Name: count, dtype: int64

Freq counts for col 'Day':
Day
30    10
8      9
29     9
1      8
6      7
5      7
4      5
11     5
12     4
10     4
27     4
2      4
20     4
9      3
23     3
24     3
14     3
3      3
28     3
7      3
13     2
26     2
16     1
15     1
25     1
Name: count, dtype: int64

Freq counts for col 'Time (AKST)':
Time (AKST)
20:14    2
11:30    1
14:36    1
12:32    1
13:22    1
        ..
16:14    1
12:39    1
11:13    1
17:53    1
5:18     1
Name: count, Length: 107, dty