In [1]:
import dask.dataframe as dd
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
import holidays
from pathlib import Path


to inform you about an upcoming change in our API versioning strategy that may affect your
project's dependencies. Starting from version 1.0 onwards, we will be following a loose form of
Semantic Versioning (SemVer, https://semver.org) to provide clearer communication regarding any
potential breaking changes.

This means that while we strive to maintain backward compatibility, there might be occasional
updates that introduce breaking changes to our API. To ensure the stability of your projects,
we highly recommend pinning the version of our API that you rely on. You can pin your current
holidays v0.x dependency (e.g., holidays==0.52) or limit it (e.g., holidays<1.0) in order to
avoid potentially unwanted upgrade to the version 1.0 when it's released (ETA 2024Q4 - 2025Q1).

If you have any questions or concerns regarding this change, please don't hesitate to reach out
to us via https://github.com/vacanza/python-holidays/discussions/1800.



In [2]:
# Load the Freguesia-granularity dataset used for the association analysis.
# The data lives under the user's home directory; `manipulated_data_directory`
# is reused by later cells when writing/reading derived CSV files.
base_dir = Path.home()
manipulated_data_directory = base_dir / "NaMinhaRua" / "Ficheiros de dados Manipulados"
df = pd.read_csv(manipulated_data_directory / 'gopidatasetmodel1_granularidade_freguesia.csv')

# NOTE: a bare `df.head(2)` used to sit here. In the middle of a cell its
# value is discarded (only the last expression of a cell is displayed), so it
# was dead code and has been removed.

# Number of rows
num_linhas = len(df)
print(f"Número de linhas: {num_linhas}")


Número de linhas: 774072


In [3]:
# One-hot encode "Freguesia": one boolean indicator column per distinct value.
# The explicit bool cast keeps the result True/False even when get_dummies
# returns 0/1 integers.
parish_flags = pd.get_dummies(df['Freguesia']).astype(bool)

# Swap the categorical column for its indicator columns: the remaining
# original columns come first, followed by the new parish flags.
df = pd.concat([df.drop(columns='Freguesia'), parish_flags], axis=1)

# Preview the encoded frame
print(df.head())

   dt_registo    Subseccao  BNFDRB   CA  CDPC2RD  CDRD  CDEEP  DDP  \
0  2018-01-01  11066202005     0.0  1.0      0.0   0.0    0.0  0.0   
1  2018-01-01  11061802202     0.0  0.0      0.0   0.0    0.0  0.0   
2  2018-01-01  11065602201     0.0  0.0      0.0   0.0    0.0  0.0   
3  2018-01-01  11066601201     0.0  0.0      0.0   0.0    0.0  0.0   
4  2018-01-01  11060200605     0.0  0.0      0.0   0.0    0.0  0.0   

   EOVRDJOPANVP   EA  ...  Marvila  Misericórdia  Olivais  Parque das Nações  \
0           0.0  1.0  ...    False         False    False               True   
1           0.0  0.0  ...    False         False    False              False   
2           2.0  1.0  ...    False         False    False              False   
3           1.0  0.0  ...    False         False    False              False   
4           0.0  0.0  ...    False         False    False              False   

   Penha de França  Santa Clara  Santa Maria Maior  Santo António  \
0            False        Fal

In [4]:
# Total number of rows in the dataset (first component of the frame's shape)
num_linhas = df.shape[0]
print(f"Número de linhas: {num_linhas}")

Número de linhas: 774072


In [5]:
# Persist the current frame as the data model used for the Freguesia
# association rules (comma-separated, without the index column).
association_dataset_path = manipulated_data_directory / 'gopidatasetmodel1_Regras_de_Associação_Freguesia.csv'
df.to_csv(association_dataset_path, sep=',', index=False)

In [6]:
# Reload the association-rules data model from disk
association_dataset_file = manipulated_data_directory / 'gopidatasetmodel1_Regras_de_Associação_Freguesia.csv'
df = pd.read_csv(association_dataset_file)

In [7]:
# Quick look at the first five rows of the reloaded frame
first_rows = df.head(5)
print(first_rows)

   dt_registo    Subseccao  BNFDRB   CA  CDPC2RD  CDRD  CDEEP  DDP  \
0  2018-01-01  11066202005     0.0  1.0      0.0   0.0    0.0  0.0   
1  2018-01-01  11061802202     0.0  0.0      0.0   0.0    0.0  0.0   
2  2018-01-01  11065602201     0.0  0.0      0.0   0.0    0.0  0.0   
3  2018-01-01  11066601201     0.0  0.0      0.0   0.0    0.0  0.0   
4  2018-01-01  11060200605     0.0  0.0      0.0   0.0    0.0  0.0   

   EOVRDJOPANVP   EA  ...  Marvila  Misericórdia  Olivais  Parque das Nações  \
0           0.0  1.0  ...    False         False    False               True   
1           0.0  0.0  ...    False         False    False              False   
2           2.0  1.0  ...    False         False    False              False   
3           1.0  0.0  ...    False         False    False              False   
4           0.0  0.0  ...    False         False    False              False   

   Penha de França  Santa Clara  Santa Maria Maior  Santo António  \
0            False        Fal

In [8]:
# Items (columns) that form each transaction of the basket: occurrence-type
# codes, calendar flags (holiday / weekend / season) and the one-hot parish
# columns.
relevant_columns = [
    'BNFDRB', 'CA', 'CDPC2RD', 'CDRD', 'CDEEP', 'DDP', 'EOVRDJOPANVP', 'EA',
    'FDDDCDPC2ROSM', 'G', 'LDVPDDPVELDVP', 'OIEVPER', 'PED', 'RNADRDDRSU',
    'RSRPDP', 'RJPDR', 'RMPDR', 'RRPDR', 'SOOLA', 'ÁAORM', 'Feriado', 
    'Fim_de_Semana', 'Inverno', 'Primavera', 'Verao', 'Outono',
    'Parque das Nações', 'Lumiar', 'Arroios', 'Santo António', 'Alcântara',
    'Olivais', 'São Domingos de Benfica', 'Avenidas Novas', 'Belém', 
    'Misericórdia', 'Santa Clara', 'Areeiro', 'São Vicente', 'Marvila',
    'Alvalade', 'Penha de França', 'Santa Maria Maior', 'Beato', 'Ajuda', 
    'Benfica', 'Campo de Ourique', 'Carnide', 'Estrela', 'Campolide'
]

# Build the boolean basket from the relevant columns only, without touching `df`:
# - Previously every numeric column of `df` was cast to bool in place, which
#   also overwrote identifier columns such as 'Subseccao' with True.
# - Missing values are mapped to 0 first so they count as "item absent";
#   a plain astype(bool) turns NaN into True.
# - Any non-zero count (e.g. 2.0) still becomes True, i.e. "item present".
basket = df[relevant_columns].fillna(0).astype(bool)

# The basket is deliberately kept as bool rather than cast to int8: both use
# one byte per value, and mlxtend's fpgrowth is designed for boolean one-hot
# frames (NOTE(review): recent mlxtend releases warn about non-bool input —
# confirm against the installed version).

# Apply FP-Growth algorithm with minimum support
frequent_itemsets = fpgrowth(basket, min_support=0.02, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print("Itemsets frequentes:\n", frequent_itemsets.head(25))




Itemsets frequentes:
        support                      itemsets
24    0.932356                       (RMPDR)
6     0.694142                (EOVRDJOPANVP)
832   0.661606         (EOVRDJOPANVP, RMPDR)
25    0.425990                       (RJPDR)
1969  0.421302                (RJPDR, RMPDR)
30    0.404539                         (PED)
2394  0.392476                  (PED, RMPDR)
0     0.362783                          (CA)
26    0.358279                  (RNADRDDRSU)
1972  0.352240           (RMPDR, RNADRDDRSU)
45    0.347463                   (RMPDR, CA)
1970  0.324520         (RJPDR, EOVRDJOPANVP)
1971  0.322101  (RJPDR, EOVRDJOPANVP, RMPDR)
41    0.315289               (LDVPDDPVELDVP)
2395  0.305743           (PED, EOVRDJOPANVP)
3880  0.300809        (RMPDR, LDVPDDPVELDVP)
2397  0.299202    (PED, EOVRDJOPANVP, RMPDR)
11    0.297326                        (CDRD)
1173  0.291251                 (RMPDR, CDRD)
3     0.291147                       (ÁAORM)
42    0.281323                   

In [9]:
# Derive association rules from the frequent itemsets, scored by lift, and
# order them strongest-first (lift, then confidence, both descending).
# NOTE(review): the itemsets were mined with min_support=0.02 and supports are
# at most 1, so lift = supp(A∪C) / (supp(A)·supp(C)) can never be below 0.02.
# A lift threshold of 0.01 therefore keeps every rule; use 1.0 if only
# positive associations are wanted.
rules = (
    association_rules(frequent_itemsets, metric='lift', min_threshold=0.01)
    .sort_values(by=['lift', 'confidence'], ascending=False)
)
print("Regras de associação:\n", rules.head(30))

Regras de associação:
                                      antecedents  \
57499                                   (Lumiar)   
57494                             (CDEEP, ÁAORM)   
49952                            (RMPDR, Lumiar)   
49949                     (CDEEP, LDVPDDPVELDVP)   
45702                     (CDEEP, LDVPDDPVELDVP)   
45707                                   (Lumiar)   
49957                                   (Lumiar)   
49944              (CDEEP, RMPDR, LDVPDDPVELDVP)   
61717                   (G, EOVRDJOPANVP, SOOLA)   
61736                           (RMPDR, Arroios)   
61712            (RMPDR, G, EOVRDJOPANVP, SOOLA)   
61741                                  (Arroios)   
59014                                  (Arroios)   
59003                   (G, EOVRDJOPANVP, SOOLA)   
61371         (LDVPDDPVELDVP, EOVRDJOPANVP, DDP)   
61386                           (RMPDR, Arroios)   
61366  (LDVPDDPVELDVP, RMPDR, EOVRDJOPANVP, DDP)   
61391                                  (A

In [10]:
# Rules that involve the weekend flag ('Fim_de_Semana') on either side.
# Antecedents/consequents are frozensets of item names, so membership is
# tested directly instead of substring-matching their string representation
# (which would also match any item whose name merely contains the text).
weekend_item = 'Fim_de_Semana'
involves_weekend = (
    rules['antecedents'].apply(lambda items: weekend_item in items)
    | rules['consequents'].apply(lambda items: weekend_item in items)
)
rules_weekend = rules[involves_weekend]

print("Regras que envolvem 'Fim_de_Semana':\n", rules_weekend.head(20))

Regras que envolvem 'Fim_de_Semana':
                          antecedents                    consequents  \
47058                (Fim_de_Semana)                      (Inverno)   
47059                      (Inverno)                (Fim_de_Semana)   
30074                (Fim_de_Semana)                       (Outono)   
30075                       (Outono)                (Fim_de_Semana)   
45700                (Fim_de_Semana)                    (Primavera)   
45701                    (Primavera)                (Fim_de_Semana)   
52258                (Fim_de_Semana)                            (G)   
52259                            (G)                (Fim_de_Semana)   
24874                 (EOVRDJOPANVP)         (Fim_de_Semana, RMPDR)   
24871         (Fim_de_Semana, RMPDR)                 (EOVRDJOPANVP)   
41252                (Fim_de_Semana)                (LDVPDDPVELDVP)   
41253                (LDVPDDPVELDVP)                (Fim_de_Semana)   
6031                  (EOVRDJOPANVP)   

In [11]:
# Rules that involve at least one season item on either side.
seasons = ['Inverno', 'Verao', 'Primavera', 'Outono']

# Antecedents/consequents are frozensets of item names: a rule qualifies when
# one of its sides shares an element with the season set. This is exact
# membership rather than substring matching on the set's string form.
season_items = frozenset(seasons)
involves_season = (
    rules['antecedents'].apply(lambda items: not season_items.isdisjoint(items))
    | rules['consequents'].apply(lambda items: not season_items.isdisjoint(items))
)
rules_seasons = rules[involves_season]

print("Regras que envolvem estações:\n", rules_seasons.head(30))

Regras que envolvem estações:
                                  antecedents  \
61082                        (Outono, SOOLA)   
61083          (FDDDCDPC2ROSM, EOVRDJOPANVP)   
61077          (Outono, EOVRDJOPANVP, SOOLA)   
61088                        (FDDDCDPC2ROSM)   
45545                 (Outono, RMPDR, SOOLA)   
45556                        (FDDDCDPC2ROSM)   
45552                        (Outono, SOOLA)   
45549                 (FDDDCDPC2ROSM, RMPDR)   
43482                        (FDDDCDPC2ROSM)   
43479                        (Outono, SOOLA)   
59722             (Primavera, LDVPDDPVELDVP)   
59723                                (CDEEP)   
45054     (RMPDR, Outono, PED, EOVRDJOPANVP)   
45091            (LDVPDDPVELDVP, RNADRDDRSU)   
61643          (RMPDR, Outono, EOVRDJOPANVP)   
61630        (LDVPDDPVELDVP, CA, RNADRDDRSU)   
45067            (Outono, PED, EOVRDJOPANVP)   
45078     (LDVPDDPVELDVP, RMPDR, RNADRDDRSU)   
44604            (Outono, PED, EOVRDJOPANVP)   
44617    

In [12]:
# Rules that involve the holiday flag ('Feriado') on either side.
# Antecedents/consequents are frozensets of item names, so membership is
# tested directly instead of substring-matching their string representation.
holiday_item = 'Feriado'
involves_holiday = (
    rules['antecedents'].apply(lambda items: holiday_item in items)
    | rules['consequents'].apply(lambda items: holiday_item in items)
)
rules_feriado = rules[involves_holiday]

# The full (unsliced) result is printed, as in the original cell.
print("Regras que envolvem 'Feriado':\n", rules_feriado)

Regras que envolvem 'Feriado':
 Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []


In [13]:
# Rules that involve at least one parish (Freguesia) item on either side.
freguesias = ['Parque das Nações', 'Lumiar', 'Arroios', 'Santo António', 'Alcântara',
    'Olivais', 'São Domingos de Benfica', 'Avenidas Novas', 'Belém', 
    'Misericórdia', 'Santa Clara', 'Areeiro', 'São Vicente', 'Marvila',
    'Alvalade', 'Penha de França', 'Santa Maria Maior', 'Beato', 'Ajuda', 
    'Benfica', 'Campo de Ourique', 'Carnide', 'Estrela', 'Campolide']

# Antecedents/consequents are frozensets of item names, so compare by exact
# membership. Substring matching on the set's string form is ambiguous here:
# 'Benfica' is contained in 'São Domingos de Benfica'.
parish_items = frozenset(freguesias)
involves_parish = (
    rules['antecedents'].apply(lambda items: not parish_items.isdisjoint(items))
    | rules['consequents'].apply(lambda items: not parish_items.isdisjoint(items))
)
rules_freguesias = rules[involves_parish]

print("Regras que envolvem Freguesias:\n", rules_freguesias.head(30))

Regras que envolvem Freguesias:
                                      antecedents  \
57499                                   (Lumiar)   
57494                             (CDEEP, ÁAORM)   
49952                            (RMPDR, Lumiar)   
49949                     (CDEEP, LDVPDDPVELDVP)   
45702                     (CDEEP, LDVPDDPVELDVP)   
45707                                   (Lumiar)   
49957                                   (Lumiar)   
49944              (CDEEP, RMPDR, LDVPDDPVELDVP)   
61717                   (G, EOVRDJOPANVP, SOOLA)   
61736                           (RMPDR, Arroios)   
61712            (RMPDR, G, EOVRDJOPANVP, SOOLA)   
61741                                  (Arroios)   
59014                                  (Arroios)   
59003                   (G, EOVRDJOPANVP, SOOLA)   
61371         (LDVPDDPVELDVP, EOVRDJOPANVP, DDP)   
61386                           (RMPDR, Arroios)   
61366  (LDVPDDPVELDVP, RMPDR, EOVRDJOPANVP, DDP)   
61391                          

In [14]:
# Presentation-only cell: a condensed view of the parish rules for chapter 7
# of the dissertation.

# Keep just the columns needed for the summary table
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_selected = rules_freguesias.loc[:, summary_columns]

# Print the heading followed by the first 20 rows of the table
print("Regras que envolvem Freguesia (Top 20):\n")
print(rules_selected.head(20))

Regras que envolvem Freguesia (Top 20):

                                     antecedents  \
57499                                   (Lumiar)   
57494                             (CDEEP, ÁAORM)   
49952                            (RMPDR, Lumiar)   
49949                     (CDEEP, LDVPDDPVELDVP)   
45702                     (CDEEP, LDVPDDPVELDVP)   
45707                                   (Lumiar)   
49957                                   (Lumiar)   
49944              (CDEEP, RMPDR, LDVPDDPVELDVP)   
61717                   (G, EOVRDJOPANVP, SOOLA)   
61736                           (RMPDR, Arroios)   
61712            (RMPDR, G, EOVRDJOPANVP, SOOLA)   
61741                                  (Arroios)   
59014                                  (Arroios)   
59003                   (G, EOVRDJOPANVP, SOOLA)   
61371         (LDVPDDPVELDVP, EOVRDJOPANVP, DDP)   
61386                           (RMPDR, Arroios)   
61366  (LDVPDDPVELDVP, RMPDR, EOVRDJOPANVP, DDP)   
61391                  