In [1]:
!pip3 install pycaret[full]
!pip3 install tqdm

Collecting pycaret[full]
[?25l  Downloading https://files.pythonhosted.org/packages/da/99/18f151991b0f06107af9723417c64e304ae2133587f85ea734a90136b4ae/pycaret-2.3.1-py3-none-any.whl (261kB)
[K     |████████████████████████████████| 266kB 3.9MB/s 
[?25hCollecting yellowbrick>=1.0.1
[?25l  Downloading https://files.pythonhosted.org/packages/3a/15/58feb940b6a2f52d3335cccf9e5d00704ec5ba62782da83f7e2abeca5e4b/yellowbrick-1.3.post1-py3-none-any.whl (271kB)
[K     |████████████████████████████████| 276kB 22.4MB/s 
[?25hCollecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/03/a5/15a0da6b0150b8b68610cc78af80364a80a9a4c8b6dd5ee549b8989d4b60/pyLDAvis-3.3.1.tar.gz (1.7MB)
[K     |████████████████████████████████| 1.7MB 22.9MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  

In [2]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from tqdm import tqdm 
import requests
import io
import plotly.express as px

import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import HTML

from ipywidgets import interact
import ipywidgets as widgets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import scipy as sci

In [3]:
value_url = "https://gitlab.com/JIMBambao/data-analytics-finals/-/raw/master/value-fishes.csv"
volume_url = "https://gitlab.com/JIMBambao/data-analytics-finals/-/raw/master/volume-fishes.csv"


def _read_remote_csv(url):
  """Download the UTF-8 encoded CSV at `url` and parse it into a DataFrame.

  Raises requests.HTTPError when the server answers with an error status,
  so an HTTP error page is never parsed as if it were the dataset.
  """
  response = requests.get(url)
  response.raise_for_status()
  return pd.read_csv(io.StringIO(response.content.decode('utf-8')))


# Wide tables: one row per (Species, Geolocation), one column per quarter.
value = _read_remote_csv(value_url)
volume = _read_remote_csv(volume_url)

In [4]:
def preprocess(df):
  """Reshape the wide quarterly table into one row per series and quarter.

  Parameters
  ----------
  df : DataFrame
      Must contain 'Species' and 'Geolocation' columns. Every other column
      header is split on whitespace; its first token is read as the year and
      its third token as the quarter number (presumably headers look like
      "2002 Quarter 1" - confirm against the CSV). Cells equal to ".." are
      treated as missing.

  Returns
  -------
  DataFrame
      Columns Species, Geolocation, value (float64), year, quarter (int32)
      and date (timestamp of the first day of the quarter). Rows whose value
      is missing are dropped; the melted row index is kept.
  """
  data = df.melt(id_vars=['Species', 'Geolocation'])
  header_tokens = data['variable'].str.split()
  data['year'] = header_tokens.str.get(0).astype('int32')
  data['quarter'] = header_tokens.str.get(2).astype('int32')
  data['value'] = data['value'].replace("..", np.nan, regex=False).astype('float64')
  data = data.drop('variable', axis=1)
  data = data.dropna(subset=['value'])
  # Build the quarter-start timestamp from its calendar parts. This replaces
  # pd.PeriodIndex(year=..., quarter=...), a constructor form that newer
  # pandas releases deprecate, and yields the same dates (Q1 -> Jan 1,
  # Q2 -> Apr 1, ...). The result is a Series aligned on data's index.
  data['date'] = pd.to_datetime(pd.DataFrame({
      'year': data['year'].astype('int64'),
      'month': data['quarter'].astype('int64') * 3 - 2,
      'day': 1,
  }))
  return data

def filters_old(df, regions, species, yearStart, yearEnd):
  """Tidy `df` with preprocess() and keep the rows matching every criterion.

  A row is kept when its Geolocation is in `regions`, its Species is in
  `species`, and its year lies in the inclusive range [yearStart, yearEnd].
  """
  tidy = preprocess(df)
  wanted = (
      tidy.Geolocation.isin(regions)
      & tidy.Species.isin(species)
      & (tidy.year >= yearStart)
      & (tidy.year <= yearEnd)
  )
  return tidy.loc[wanted]

def filters(data, geolocation, species):
  """Tidy `data` with preprocess() and keep one (geolocation, species) series."""
  tidy = preprocess(data)
  is_selected = (tidy.Geolocation == geolocation) & (tidy.Species == species)
  return tidy.loc[is_selected]

In [5]:
#Preprocess DataFrame for Modeling
def modeling_preprocess(data, species, geolocation):
  """Build the modeling table for a single (species, geolocation) series.

  Parameters
  ----------
  data : DataFrame
      Raw wide table accepted by filters() / preprocess().
  species, geolocation : str
      Exact labels of the series to keep.

  Returns
  -------
  DataFrame
      Columns value, year, date, time_series, month, day_of_week and
      day_of_year, with a fresh 0..n-1 index. `time_series` is
      "<Species>_<Geolocation>".
  """
  # filters() returns a slice of the tidy frame; copy it so the column
  # assignments below write to an independent frame.
  data = filters(data, geolocation, species).copy()
  # 'date' is already produced by preprocess() (via filters), so it is not
  # recomputed here.
  data['time_series'] = data['Species'] + '_' + data['Geolocation']
  date_parts = data['date'].dt
  data['month'] = date_parts.month
  data['year'] = date_parts.year
  data['day_of_week'] = date_parts.dayofweek
  data['day_of_year'] = date_parts.dayofyear
  data = data.reset_index(drop=True)
  data = data.drop(['Species', 'Geolocation', 'quarter'], axis=1)

  return data

#Getting the best model
def get_best_model(data, species, geolocation):
  """Run pycaret's setup() on one series and return compare_models()' pick.

  The experiment targets 'value', keeps row order (no shuffling), uses the
  'timeseries' fold strategy with 3 folds, and ranks candidates by MAE.
  """
  training_frame = modeling_preprocess(data, species, geolocation)
  experiment_options = dict(
      target='value',
      train_size=0.95,
      data_split_shuffle=False,
      fold_strategy='timeseries',
      fold=3,
      ignore_features=['date', 'time_series'],
      numeric_features=['day_of_year', 'year'],
      categorical_features=['month', 'day_of_week'],
      silent=True,
      verbose=False,
      session_id=123,
  )
  setup(training_frame, **experiment_options)
  return compare_models(sort='MAE', turbo=True, verbose=False)

#Finalizing the model for predictions
def get_finalized_model(data, species, geolocation):
  """Pick the best model for the series and pass it through finalize_model()."""
  best_model = get_best_model(data, species, geolocation)
  return finalize_model(best_model)

#Preparation for prediction
def predict_date(year_end):
  """Build the scoring frame: one row per quarter start from 2002 to `year_end`.

  The range ends at January 1st of `year_end` (inclusive). Besides 'date',
  the frame carries the calendar columns month, year, day_of_week and
  day_of_year derived from each timestamp.
  """
  quarter_starts = pd.date_range(start='2002', end=str(year_end), freq='QS')
  score_df = pd.DataFrame()
  score_df['date'] = quarter_starts
  # (output column, Timestamp attribute) in the order the columns are added.
  calendar_fields = (
      ('month', 'month'),
      ('year', 'year'),
      ('day_of_week', 'dayofweek'),
      ('day_of_year', 'dayofyear'),
  )
  for column, attribute in calendar_fields:
    score_df[column] = [getattr(stamp, attribute) for stamp in score_df['date']]
  return score_df

#Predict future depending on the model
def predict_future(data, model,species, geolocation, year_end):
  """Score `model` on every quarter up to `year_end` and attach the history.

  Returns the predictions left-joined with the modeling table on
  ('date', 'time_series'); columns present on both sides get pandas'
  default _x / _y suffixes.
  """
  history = modeling_preprocess(data, species, geolocation)
  horizon = predict_date(year_end)
  per_series = []
  for series_name in tqdm(history['time_series'].unique()):
      forecast = predict_model(model, data=horizon)
      forecast['time_series'] = series_name
      per_series.append(forecast)
  stacked = pd.concat(per_series, axis=0)
  return pd.merge(stacked, history, how='left', on=['date', 'time_series'])

#Predict future with the use of the best model
def predict_future_best_model(data, species, geolocation, year_end):
  """Train and finalize the best model for the series, then forecast with it."""
  finalized = get_finalized_model(data, species, geolocation)
  return predict_future(data, finalized, species, geolocation, year_end)

# geolocation and species are lists of labels.
# A forecast is produced for every (species, geolocation) combination and the results are appended into one table.
def predict_cross_multiplication(data, geolocation, species, year_end):
    """Forecast every (species, geolocation) combination and stack the results.

    Parameters
    ----------
    data : DataFrame
        Raw wide table passed through to predict_future_best_model().
    geolocation, species : sequence of str
        Labels to combine; species is the outer loop, geolocation the inner.
    year_end : int
        Last year of the forecast horizon.

    Returns
    -------
    DataFrame
        The per-combination forecast tables concatenated row-wise, or an
        empty DataFrame when there is nothing to combine.
    """
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole table on every iteration.
    forecasts = [
        predict_future_best_model(data, one_species, one_geolocation, year_end)
        for one_species in species
        for one_geolocation in geolocation
    ]
    if not forecasts:
        # pd.concat refuses an empty list; keep the empty-table result.
        return pd.DataFrame()
    return pd.concat(forecasts)

# geolocation and species are parallel lists: species[i] is paired with geolocation[i] to form one time series.
def predict_multiple_pairs(data, geolocation, species, year_end):
    """Forecast the series (species[i], geolocation[i]) for each i and stack them.

    Parameters
    ----------
    data : DataFrame
        Raw wide table passed through to predict_future_best_model().
    geolocation, species : sequence of str
        Parallel sequences; entries of `geolocation` beyond len(species)
        are ignored.
    year_end : int
        Last year of the forecast horizon.

    Returns
    -------
    DataFrame
        The per-pair forecast tables concatenated row-wise, or an empty
        DataFrame when `species` is empty.

    Raises
    ------
    IndexError
        When `geolocation` is shorter than `species`. Raised up front so no
        model is trained before the mismatch is noticed.
    """
    if len(geolocation) < len(species):
        raise IndexError(
            "geolocation has fewer entries than species; "
            "every species needs a matching geolocation"
        )
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole table on every iteration.
    forecasts = [
        predict_future_best_model(data, one_species, one_geolocation, year_end)
        for one_species, one_geolocation in zip(species, geolocation)
    ]
    if not forecasts:
        # pd.concat refuses an empty list; keep the empty-table result.
        return pd.DataFrame()
    return pd.concat(forecasts)


In [6]:
#Plot the predictions
#The final_df parameter comes from the result of either predict_future or predict_future_best_model

def create_sub_df(sub_df,data):
    """Reshape one series' forecast table into long form for plotting.

    The original body was an empty stub that returned an undefined name; it
    now performs the reshaping plot_predictions() used to do inline.

    Parameters
    ----------
    sub_df : DataFrame
        Rows of a predict_future() result: the merge key columns, the
        suffixed calendar columns listed below, plus 'Label' (prediction)
        and 'value' (observation).
    data : str
        Name of the plotted measure ('value' or 'volume'); used as the name
        of the melted column and in the series labels.

    Returns
    -------
    DataFrame
        Long frame with a 'var' column holding 'Predicted <data>' /
        'Actual <data>' and a `<data>` column holding the numbers.
        `sub_df` itself is not modified.
    """
    # Rename before melting: melting with value_name='value' while a 'value'
    # column still exists is rejected by pandas 2.x. Renaming also produces
    # the final legend labels directly.
    relabelled = sub_df.rename(columns={
        'Label': 'Predicted ' + data,
        'value': 'Actual ' + data,
    })
    new_sub_df = relabelled.melt(
        id_vars=['date', 'month_x', 'year_x', 'day_of_week_x', 'day_of_year_x',
                 'time_series', 'year_y', 'month_y', 'day_of_week_y',
                 'day_of_year_y'],
        var_name='var', value_name=data)
    return new_sub_df

def plot_predictions(final_df,data):
  """Show one predicted-vs-actual line chart per series (first five series only).

  final_df is a predict_future() / predict_future_best_model() result and
  `data` is the measure name ('value' or 'volume').
  """
  for i in final_df['time_series'].unique()[:5]:
    sub_df = create_sub_df(final_df[final_df['time_series'] == i], data)
    fig = px.line(sub_df, x='date', y=data, color='var', title = i, template = 'plotly_dark', labels=dict(x="Date", y="Value"))
    fig.data[0].name = 'Predicted ' + data
    fig.data[1].name = 'Actual ' + data
    fig.show()

#final_df = Appended Dataframe
#predict = 1(Predicted) or 0(Actual)
#title = graph title
def multi_plot_predictions(final_df,predict,data,title):
  """Draw one line per series (first five series only) on a shared chart.

  Parameters
  ----------
  final_df : DataFrame
      Stacked predict_future() results; needs 'date', 'time_series',
      'Label' (prediction) and 'value' (observation).
  predict : int
      1 plots the predicted values ('Label'); anything else plots the
      actual values ('value').
  data : str
      Measure name ('value' or 'volume'). Kept for interface compatibility;
      the chart is built straight from the selected column.
  title : str
      Chart title; " (Predicted)" or " (Actual)" is appended.
  """
  #graph predicted values
  if predict == 1:
    y_type = 'Label'
    title = title + " (Predicted)"
  #graph actual values
  else:
    y_type = 'value'
    title = title + " (Actual)"
  # Select the rows of the first five series with one boolean mask instead
  # of growing a frame with DataFrame.append (removed in pandas 2.0) and
  # melting it (value_name='value' collides with the existing column on
  # pandas 2.x) only to filter the melted rows back down to one column.
  shown_series = final_df['time_series'].unique()[:5]
  selected = final_df[final_df['time_series'].isin(shown_series)]
  # Array arguments are labelled by argument name, which is what the
  # x / y keys of `labels` refer to.
  fig = px.line(x=selected['date'], y=selected[y_type], color=selected['time_series'], title=title, template = 'plotly_dark', labels=dict(x="Date", y="Value"))
  fig.show()

#Displays the bar graph top 10 species depending on the parameters
def top_10_species(data, geolocation, year, quarter):
  """Bar chart of the ten highest-valued species for one place and quarter.

  The "All species" aggregate row is excluded, and punctuation is removed
  from the species labels before plotting.
  """
  tidy = preprocess(data).copy()
  in_scope = (
      (tidy.year == year)
      & (tidy.quarter == quarter)
      & (tidy.Geolocation == geolocation)
      & (tidy.Species != "All species")
  )
  ranked = tidy.loc[in_scope].sort_values(by="value", ascending=False)
  drop_punctuation = str.maketrans('', '', string.punctuation)
  ranked['Species'] = ranked.Species.apply(lambda label: label.translate(drop_punctuation))
  fig = px.bar(ranked.head(10), x='Species', y='value', template = 'plotly_dark')
  fig.show()

#Displays the bar graph top 10 Locations depending on the parameters
def top_10_geolocation(data, species, year, quarter):
  """Bar chart of the ten highest-valued locations for one species and quarter.

  Only locations whose label contains "...." are kept and the "Philippines"
  row is excluded; punctuation is removed from the labels before plotting.
  """
  tidy = preprocess(data).copy()
  in_scope = (
      (tidy.year == year)
      & (tidy.quarter == quarter)
      & (tidy.Species == species)
      & (tidy.Geolocation != "Philippines")
      & tidy.Geolocation.str.contains("....", regex=False)
  )
  ranked = tidy.loc[in_scope].sort_values(by="value", ascending=False)
  drop_punctuation = str.maketrans('', '', string.punctuation)
  ranked['Geolocation'] = ranked.Geolocation.apply(lambda label: label.translate(drop_punctuation))
  fig = px.bar(ranked.head(10), x='Geolocation', y='value', template = 'plotly_dark')
  fig.show()


# Descriptive Analysis

## Top 10 Species in an area

In [7]:
import string

# Small factories so each control below is declared on one line; every
# widget receives exactly the same settings as a hand-written constructor
# call would give it.
def _field_radio(options, default):
    """Radio buttons labelled 'Field Type:'."""
    return widgets.RadioButtons(
        options=options,
        value=default,
        description='Field Type:',
        disabled=False,
    )


def _species_picker(default, label):
    """Dropdown listing every species found in the value table."""
    return widgets.Dropdown(
        options=value.Species.unique(),
        value=default,
        description=label,
        disabled=False,
    )


def _geo_picker(default, label):
    """Dropdown listing every geolocation found in the value table."""
    return widgets.Dropdown(
        options=value.Geolocation.unique(),
        value=default,
        description=label,
        disabled=False,
    )


def _int_slider(default, low, high, label):
    """Horizontal integer slider that only fires when the handle is released."""
    return widgets.IntSlider(
        value=default,
        min=low,
        max=high,
        step=1,
        description=label,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d'
    )


# Dataset and series-type selectors.
type_dropdown = _field_radio(['value', 'volume'], "value")
type2_dropdown = _field_radio(['predicted', 'actual'], "predicted")

# Single-series selectors.
species_dropdown = _species_picker("All species", 'Species:')
geo_dropdown = _geo_picker("Philippines", 'Geolocation:')

# Time selectors.
year_end_slider = _int_slider(2050, 2002, 2100, 'Year end:')
quarter_slider = _int_slider(1, 1, 4, 'Quarter:')
target_year = _int_slider(2021, 2002, 2021, 'Target Year:')

# Three (species, geolocation) pairs used by the comparison view.
species_a = _species_picker("..Milkfish", 'Species 1st Pair:')
geo_a = _geo_picker("....Pangasinan", 'Geolocation 1st Pair:')
species_b = _species_picker("..Tilapia", 'Species 2nd Pair:')
geo_b = _geo_picker("..Region III (Central Luzon)", 'Geolocation 2nd Pair:')
species_c = _species_picker("All species", 'Species 3rd Pair:')
geo_c = _geo_picker("..Cordillera Administrative Region (CAR)", 'Geolocation 3rd Pair:')

control = 'value'
control2 = 'actual'

#Displays the bar graph top 10 species depending on the parameters

def val_vol(dataset):
  """Wire the top-10-species chart to the table named by `dataset`.

  `dataset` is 'value' or 'volume'; any other choice draws nothing.
  """
  if dataset not in ('value', 'volume'):
    return

  def plot_top_10(geo, year, quart):
    source = value if dataset == 'value' else volume
    return top_10_species(source, geo, year, quart)

  widgets.interact(plot_top_10, geo=geo_dropdown, year = target_year, quart=quarter_slider)

interact(val_vol, dataset = type_dropdown)


interactive(children=(RadioButtons(description='Field Type:', options=('value', 'volume'), value='value'), Out…

<function __main__.val_vol>

## Top 10 Locations

In [8]:
#Displays the bar graph top 10 locations depending on the parameters
def top_10_geolocation(data, species, year, quarter):
  """Bar chart of the ten highest-valued locations for one species and quarter.

  Only locations whose label contains "...." are kept and the "Philippines"
  row is excluded; punctuation is removed from the labels before plotting.
  """
  tidy = preprocess(data).copy()
  in_scope = (
      (tidy.year == year)
      & (tidy.quarter == quarter)
      & (tidy.Species == species)
      & (tidy.Geolocation != "Philippines")
      & tidy.Geolocation.str.contains("....", regex=False)
  )
  ranked = tidy.loc[in_scope].sort_values(by="value", ascending=False)
  drop_punctuation = str.maketrans('', '', string.punctuation)
  ranked['Geolocation'] = ranked.Geolocation.apply(lambda label: label.translate(drop_punctuation))
  fig = px.bar(ranked.head(10), x='Geolocation', y='value', template = 'plotly_dark')
  fig.show()

def val_vol(dataset):
  """Wire the top-10-locations chart to the table named by `dataset`.

  `dataset` is 'value' or 'volume'; any other choice draws nothing.
  """
  if dataset not in ('value', 'volume'):
    return

  def plot_top_10(spec, year, quart):
    source = value if dataset == 'value' else volume
    return top_10_geolocation(source, spec, year, quart)

  widgets.interact(plot_top_10, spec=species_dropdown, year = target_year, quart=quarter_slider)

interact(val_vol, dataset = type_dropdown)


interactive(children=(RadioButtons(description='Field Type:', options=('value', 'volume'), value='value'), Out…

<function __main__.val_vol>

In [9]:

def val_vol(dataset):
  """Wire the top-10-locations chart to the table named by `dataset`.

  `dataset` is 'value' or 'volume'; any other choice draws nothing.
  """
  if dataset not in ('value', 'volume'):
    return

  def plot_top_10(spec, year, quart):
    source = value if dataset == 'value' else volume
    return top_10_geolocation(source, spec, year, quart)

  widgets.interact(plot_top_10, spec=species_dropdown, year = target_year, quart=quarter_slider)

interact(val_vol, dataset = type_dropdown)


interactive(children=(RadioButtons(description='Field Type:', options=('value', 'volume'), value='value'), Out…

<function __main__.val_vol>

# Predictions Analysis

In [10]:
# Controls dedicated to the forecast view.
g_dropdown = widgets.Dropdown(
    description='Geolocation:', options=value.Geolocation.unique(),
    value="Philippines", disabled=False)

s_dropdown = widgets.Dropdown(
    description='Species:', options=value.Species.unique(),
    value="All species", disabled=False)

t_dropdown = widgets.RadioButtons(
    description='Field Type:', options=['value', 'volume'],
    value="value", disabled=False)

control = 'value'

def vol_val (data):
  """Wire the single-series forecast chart to the table named by `data`.

  `data` is 'value' or 'volume'; any other choice draws nothing. Each time a
  control changes, the best model for the chosen series is trained, its
  name is printed, and the predicted-vs-actual chart is drawn.
  """
  if data not in ('value', 'volume'):
    return

  def on_dropdown_chosen( s_value, g_value, y_value):
    source = value if data == 'value' else volume
    a = predict_future_best_model(source, s_value, g_value, y_value)
    leaderboard = pull().sort_values(by="MAE", ascending=True)
    # .iloc[0] takes the first row *after* sorting. The previous
    # `.Model[0]` was a label lookup that only worked through pandas'
    # deprecated positional fallback, and on an integer index it would
    # return the row labelled 0 rather than the best-ranked one.
    print('Model Used: ' + leaderboard.Model.iloc[0])
    return plot_predictions(a, data)

  widgets.interact(on_dropdown_chosen, s_value = s_dropdown, g_value = g_dropdown, y_value = year_end_slider)

interact(vol_val, data = t_dropdown)


interactive(children=(RadioButtons(description='Field Type:', options=('value', 'volume'), value='value'), Out…

<function __main__.vol_val>

# Compare Species

In [11]:
# Controls dedicated to the comparison view.
ty_dropdown = widgets.RadioButtons(
    description='Field Type:', options=['value', 'volume'],
    value="value", disabled=False)

y_end_slider = widgets.IntSlider(
    description='Year end:',
    value=2050, min=2002, max=2100, step=1,
    disabled=False, continuous_update=False,
    orientation='horizontal',
    readout=True, readout_format='d')

def field_type (field):
  """Wire the three-pair comparison chart to the table named by `field`.

  `field` is 'value' or 'volume'; any other choice draws nothing. A nested
  selector then chooses between the 'actual' and 'predicted' curves, and a
  final set of controls chooses the three (species, geolocation) pairs and
  the forecast end year.
  """
  # One configuration per table. The 'volume' view now always forecasts from
  # the volume table; its 'predicted' branch used to read the value table.
  # Each view keeps the end-year slider it was wired to before.
  if field == 'value':
    dataset, title, year_widget = value, "Value", year_end_slider
  elif field == 'volume':
    dataset, title, year_widget = volume, "Volume", y_end_slider
  else:
    return

  def a_or_f (type_value):
    # multi_plot_predictions takes 0 for actual and 1 for predicted curves.
    if type_value == 'actual':
      predict_flag = 0
    elif type_value == 'predicted':
      predict_flag = 1
    else:
      return

    def pair_chosen(a, b, c, d, e, f, g):
      # a/c/e are the chosen species, b/d/f their geolocations, g the end year.
      spec = [a, c, e]
      geol = [b, d, f]
      forecasts = predict_multiple_pairs(dataset, geol, spec, g)
      multi_plot_predictions(forecasts, predict_flag, field, title)

    widgets.interact(pair_chosen, a = species_a, b = geo_a, c=species_b, d=geo_b, e=species_c, f=geo_c, g=year_widget)

  interact(a_or_f, type_value = type2_dropdown)

interact(field_type, field = ty_dropdown)

interactive(children=(RadioButtons(description='Field Type:', options=('value', 'volume'), value='value'), Out…

<function __main__.field_type>

In [12]:
# pull().sort_values(by="MAE", ascending=True)