In [1]:
import dask.dataframe as dd
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
import holidays
from pathlib import Path


to inform you about an upcoming change in our API versioning strategy that may affect your
project's dependencies. Starting from version 1.0 onwards, we will be following a loose form of
Semantic Versioning (SemVer, https://semver.org) to provide clearer communication regarding any
potential breaking changes.

This means that while we strive to maintain backward compatibility, there might be occasional
updates that introduce breaking changes to our API. To ensure the stability of your projects,
we highly recommend pinning the version of our API that you rely on. You can pin your current
holidays v0.x dependency (e.g., holidays==0.52) or limit it (e.g., holidays<1.0) in order to
avoid potentially unwanted upgrade to the version 1.0 when it's released (ETA 2024Q4 - 2025Q1).

If you have any questions or concerns regarding this change, please don't hesitate to reach out
to us via https://github.com/vacanza/python-holidays/discussions/1800.



In [2]:
#Import original dataset
base_dir = Path.home()
manipulated_data_directory = base_dir / "NaMinhaRua" / "Ficheiros de dados Manipulados"
gopidataset = pd.read_csv(manipulated_data_directory/'combined_sorted_ym.gopi_data.csv')
gopidataset.head(2)

Unnamed: 0,dt_registo,area,tipo,Subseccao,Freguesia,Longitude_Subseccao,Latitude_Subseccao,month,year
0,2018-01-01,Iluminação Pública,Candeeiro apagado,11066202005,Parque das Nações,-9.092716,38.779538,1,2018
1,2018-01-01,Árvores e Espaços Verdes,"Árvores, arbustos ou relva - Manutenção",11061802202,Lumiar,-9.157347,38.772386,1,2018


In [3]:
#Create a new base model
gopidatasetmodel=gopidataset[['dt_registo', 'Subseccao', 'Freguesia', 'tipo']]
gopidatasetmodel.head(2)

gopidatasetmodel.to_csv(manipulated_data_directory/ 'gopidatasetmodel1.csv',sep=',', index=False)

In [4]:
#################################################Manipulação do novo modelo de dados#######################################
gopidatasetbasemodel1 = pd.read_csv(manipulated_data_directory/ 'gopidatasetmodel1.csv')
gopidatasetbasemodel1.head(2)

Unnamed: 0,dt_registo,Subseccao,Freguesia,tipo
0,2018-01-01,11066202005,Parque das Nações,Candeeiro apagado
1,2018-01-01,11061802202,Lumiar,"Árvores, arbustos ou relva - Manutenção"


In [5]:
# Group by 'tipo' and count occurences
tipo_frequencia = gopidatasetbasemodel1['tipo'].value_counts()

# Calculate total frequency
total_ocorrencias = tipo_frequencia.sum()

# Calculate the percentage for each type
tipo_percentagem = (tipo_frequencia / total_ocorrencias) * 100

# Show the top 20 'tipos'
top_20_percentagem = tipo_percentagem.head(20)

print(top_20_percentagem)

tipo
Remoção-Monstros-Pedido de recolha                                                         33.394310
Entulhos, objetos volumosos, resíduos de jardim ou perigosos abandonados na via pública    12.838728
Remoção-Jardins-Pedido de recolha                                                           2.758658
Candeeiro apagado                                                                           2.744318
Pragas e doenças                                                                            2.729849
Reclamações no âmbito da recolha diária de resíduos sólidos urbanos                         2.383758
Grafitis                                                                                    2.210260
Limpeza da via pública (Despejo de papeleira, varredura e lavagem da via pública)           2.065570
Árvores, arbustos ou relva - Manutenção                                                     1.805129
Remoção Seletivas - Remoção pontual de papel/cartão                                   

In [6]:
# Formate dt_registo as datatype
gopidatasetbasemodel1['dt_registo'] = pd.to_datetime(gopidatasetbasemodel1['dt_registo'], format='%Y-%m-%d')

# Create the tipo_frequencia_diaria DataFrame with the occurrence count
tipo_frequencia_diaria = gopidatasetbasemodel1.groupby(['dt_registo', 'Subseccao', 'Freguesia', 'tipo']).size().reset_index(name='counts')

# Select the 20 most frequent 'tipos'
top_20_tipos = tipo_frequencia_diaria.groupby('tipo')['counts'].sum().nlargest(20).index

# Filter the data to only include the top 20 'tipos'
tipo_frequencia_diaria = tipo_frequencia_diaria[tipo_frequencia_diaria['tipo'].isin(top_20_tipos)]

# Create a pivot DataFrame with the most frequent types as columns
pivot_df = tipo_frequencia_diaria.pivot_table(index=['dt_registo', 'Subseccao', 'Freguesia'], columns='tipo', values='counts', fill_value=0)

# Check column names before changing
print("Antes da alteração:", pivot_df.columns)

# Rename columns to initials
def rename_to_initials(col):
   # Substituir caracteres especiais por underscores
    col = col.replace(' ', '_').replace('-', '_').replace(',', '_').replace('(', '_').replace(')', '_').replace('ó', 'o').replace('õ', 'o').replace('ã', 'a').replace('á', 'a').replace('ç', 'c').replace('í', 'i').replace('â', 'a').replace('é', 'e').replace('ú', 'u')
    # Split the column name into words.
    words = col.split('_')
    # Get the first letter of each word and combine all the initials
    initials = ''.join([word[0].upper() for word in words if word])
    return initials

pivot_df.columns = [rename_to_initials(col) for col in pivot_df.columns]

# Check column names after change
print("Depois da alteração:", pivot_df.columns)

# Merges with the original DataFrame, filling unmatched cells with zeros
gopidataset_group = pd.merge(gopidatasetbasemodel1, pivot_df, on=['dt_registo', 'Freguesia', 'Subseccao'], how='left').fillna(0)

# Check the result
print(gopidataset_group.head(3))

# Remove column 'tipo'
gopidataset_group.drop(columns=['tipo'], inplace=True)

Antes da alteração: Index(['Buraco na faixa de rodagem - Betuminoso', 'Candeeiro apagado',
       'Contentor de pequena capacidade (2 rodas) desaparecido',
       'Contentores de resíduos danificados', 'Corte de ervas em passeios',
       'Descalcetamento do passeio',
       'Entulhos, objetos volumosos, resíduos de jardim ou perigosos abandonados na via pública',
       'Estacionamento abusivo',
       'Falta de despejo do contentor de pequena capacidade (2 rodas) ou sacos (municipais)',
       'Grafitis',
       'Limpeza da via pública (Despejo de papeleira, varredura e lavagem da via pública)',
       'Obras ilegais - Edificado, via pública e ruído', 'Pragas e doenças',
       'Reclamações no âmbito da recolha diária de resíduos sólidos urbanos',
       'Remoção Seletivas - Remoção pontual de papel/cartão',
       'Remoção-Jardins-Pedido de recolha',
       'Remoção-Monstros-Pedido de recolha', 'Remoção-RCD-Pedido de recolha',
       'Sacos ou outros lixos abandonados',
       'Árvo

In [7]:
# Function to check if the day is a weekend
def is_weekend(date):
    return date.weekday() >= 5  # 5 = Sábado, 6 = Domingo

gopidataset_group['Fim_de_Semana'] = gopidataset_group['dt_registo'].apply(is_weekend)

# Count how many times Fim_de_Semana is True and False
num_ocorrencias = gopidataset_group['Fim_de_Semana'].value_counts()

# Extract the counts
num_ocorrencias_fim_de_semana = num_ocorrencias[True] if True in num_ocorrencias else 0
num_ocorrencias_dia_util = num_ocorrencias[False] if False in num_ocorrencias else 0

print(f"Número de ocorrências em fins de semana: {num_ocorrencias_fim_de_semana}")
print(f"Número de ocorrências em dias úteis: {num_ocorrencias_dia_util}")



Número de ocorrências em fins de semana: 77031
Número de ocorrências em dias úteis: 697041


In [8]:
#Inser column with Portugal holidays    
portugal_holidays = holidays.Portugal(years=range(2018, 2024))  

# Function to check if the date is a holiday
def is_holiday(date):
    return date in portugal_holidays

# Apply the function to create the 'Feriado' column
gopidataset_group['Feriado'] = gopidataset_group['dt_registo'].apply(is_holiday)
gopidataset_group.head(1)

Unnamed: 0,dt_registo,Subseccao,Freguesia,BNFDRB,CA,CDPC2RD,CDRD,CDEEP,DDP,EOVRDJOPANVP,...,PED,RNADRDDRSU,RSRPDP,RJPDR,RMPDR,RRPDR,SOOLA,ÁAORM,Fim_de_Semana,Feriado
0,2018-01-01,11066202005,Parque das Nações,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True


In [9]:
# Season function:
def get_season(date):
    year = date.year
    seasons = {'primavera': pd.date_range(start=f'{year}-03-01', end=f'{year}-05-31'),
               'verao': pd.date_range(start=f'{year}-06-01', end=f'{year}-08-31'),
               'outono': pd.date_range(start=f'{year}-09-01', end=f'{year}-11-30')}
    if date in seasons['primavera']:
        return 'Primavera'
    elif date in seasons['verao']:
        return 'Verao'
    elif date in seasons['outono']:
        return 'Outono'
    else:
        return 'Inverno'

# Apply function to create 'Estação' column
gopidataset_group['Estacao'] = gopidataset_group['dt_registo'].apply(get_season)


# Transform the Season column into several "Primavera, Verao, Outono, Inverno"
gopidataset_group['Inverno'] = gopidataset_group['Estacao'] == 'Inverno'
gopidataset_group['Primavera'] = gopidataset_group['Estacao'] == 'Primavera'
gopidataset_group['Verao'] = gopidataset_group['Estacao'] == 'Verão'
gopidataset_group['Outono'] = gopidataset_group['Estacao'] == 'Outono'


gopidataset_group.head(1)
gopidataset_group=gopidataset_group.drop(columns=['Estacao'])

In [10]:
# Create new data model - With subsection level indexing granularity

gopidataset_group.to_csv(manipulated_data_directory / 'gopidatasetmodel1_granularidade_subseccao.csv',sep=',', index=False)

In [11]:
###################################################### FP-Growth ###########################################################
df = pd.read_csv(manipulated_data_directory / 'gopidatasetmodel1_granularidade_subseccao.csv')
df.head(2)


# Number of lines
num_linhas = len(df)
print(f"Número de linhas: {num_linhas}")


Número de linhas: 774072


In [12]:
df.head(2)

Unnamed: 0,dt_registo,Subseccao,Freguesia,BNFDRB,CA,CDPC2RD,CDRD,CDEEP,DDP,EOVRDJOPANVP,...,RMPDR,RRPDR,SOOLA,ÁAORM,Fim_de_Semana,Feriado,Inverno,Primavera,Verao,Outono
0,2018-01-01,11066202005,Parque das Nações,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,False,True,True,False,False,False
1,2018-01-01,11061802202,Lumiar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,False,True,True,False,False,False


In [13]:

for col in df.select_dtypes(include='number').columns:
           df[col] = df[col].astype(bool)
        
# Define columns for applying the Fp-Growth algorithm
relevant_columns =['BNFDRB', 'CA', 'CDPC2RD', 'CDRD', 'CDEEP', 'DDP', 'EOVRDJOPANVP', 'EA',
       'FDDDCDPC2ROSM', 'G', 'LDVPDDPVELDVP', 'OIEVPER', 'PED', 'RNADRDDRSU',
       'RSRPDP', 'RJPDR', 'RMPDR', 'RRPDR', 'SOOLA', 'ÁAORM', 'Feriado', 'Fim_de_Semana', 'Inverno', 'Primavera', 'Verao', 'Outono']

# Associate to a basket
basket = df[relevant_columns]

# Apply the FP-Growth algorithm with minimal support
frequent_itemsets = fpgrowth(basket, min_support=0.01, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print("Itemsets frequentes:\n", frequent_itemsets.head(25))


Itemsets frequentes:
      support                    itemsets
11  0.395760                     (RMPDR)
24  0.274379                    (Outono)
21  0.235547                 (Primavera)
0   0.224566                   (Inverno)
4   0.159906              (EOVRDJOPANVP)
42  0.108783             (Outono, RMPDR)
19  0.099514             (Fim_de_Semana)
25  0.091771            (Inverno, RMPDR)
41  0.090598          (Primavera, RMPDR)
31  0.042787      (EOVRDJOPANVP, Outono)
30  0.041177   (EOVRDJOPANVP, Primavera)
14  0.038769                     (RJPDR)
13  0.037119                       (PED)
1   0.036565                        (CA)
12  0.032691                (RNADRDDRSU)
22  0.031284             (LDVPDDPVELDVP)
28  0.030595     (EOVRDJOPANVP, Inverno)
10  0.029110                         (G)
40  0.027408     (Fim_de_Semana, Outono)
3   0.026424                     (ÁAORM)
29  0.025564       (EOVRDJOPANVP, RMPDR)
23  0.023458                    (RSRPDP)
39  0.022954  (Fim_de_Semana, Prima

In [14]:
print(df.dtypes)

dt_registo       object
Subseccao          bool
Freguesia        object
BNFDRB             bool
CA                 bool
CDPC2RD            bool
CDRD               bool
CDEEP              bool
DDP                bool
EOVRDJOPANVP       bool
EA                 bool
FDDDCDPC2ROSM      bool
G                  bool
LDVPDDPVELDVP      bool
OIEVPER            bool
PED                bool
RNADRDDRSU         bool
RSRPDP             bool
RJPDR              bool
RMPDR              bool
RRPDR              bool
SOOLA              bool
ÁAORM              bool
Fim_de_Semana      bool
Feriado            bool
Inverno            bool
Primavera          bool
Verao              bool
Outono             bool
dtype: object


In [15]:
# Generate association rules with minimum support of 1%
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.01)
rules = rules.sort_values(by=['lift', 'confidence'], ascending=[False, False])
print("Regras de associação:\n", rules)

Regras de associação:
         antecedents      consequents  antecedent support  consequent support  \
31             (CA)        (Inverno)            0.036565            0.224566   
30        (Inverno)             (CA)            0.224566            0.036565   
27          (RJPDR)        (Inverno)            0.038769            0.224566   
26        (Inverno)          (RJPDR)            0.224566            0.038769   
20  (Fim_de_Semana)   (EOVRDJOPANVP)            0.099514            0.159906   
21   (EOVRDJOPANVP)  (Fim_de_Semana)            0.159906            0.099514   
34         (Outono)     (RNADRDDRSU)            0.274379            0.032691   
35     (RNADRDDRSU)         (Outono)            0.032691            0.274379   
29             (CA)         (Outono)            0.036565            0.274379   
28         (Outono)             (CA)            0.274379            0.036565   
8    (EOVRDJOPANVP)      (Primavera)            0.159906            0.235547   
9       (Primaver

In [16]:
# This code only serves to present the data in a shorter form in chapter 7 of the dissertation

# Select main columns
rules_selected = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


# View the formatted table
print("Regras de associação (Top 25):\n")
print(rules_selected.head(25))

Regras de associação (Top 25):

        antecedents      consequents   support  confidence      lift
31             (CA)        (Inverno)  0.011261    0.307978  1.371437
30        (Inverno)             (CA)  0.011261    0.050147  1.371437
27          (RJPDR)        (Inverno)  0.011318    0.291936  1.300003
26        (Inverno)          (RJPDR)  0.011318    0.050400  1.300003
20  (Fim_de_Semana)   (EOVRDJOPANVP)  0.019311    0.194052  1.213534
21   (EOVRDJOPANVP)  (Fim_de_Semana)  0.019311    0.120764  1.213534
34         (Outono)     (RNADRDDRSU)  0.010192    0.037144  1.136227
35     (RNADRDDRSU)         (Outono)  0.010192    0.311757  1.136227
29             (CA)         (Outono)  0.011304    0.309144  1.126703
28         (Outono)             (CA)  0.011304    0.041198  1.126703
8    (EOVRDJOPANVP)      (Primavera)  0.041177    0.257507  1.093233
9       (Primavera)   (EOVRDJOPANVP)  0.041177    0.174815  1.093233
2         (Inverno)          (RMPDR)  0.091771    0.408658  1.032589
3 

In [17]:
# Rules, filter = 'Fim_de_Semana'
rules_weekend = rules[(rules['antecedents'].apply(lambda x: 'Fim_de_Semana' in str(x))) |
                      (rules['consequents'].apply(lambda x: 'Fim_de_Semana' in str(x)))]

print("Regras que envolvem 'Fim_de_Semana':\n", rules_weekend.head(20))

Regras que envolvem 'Fim_de_Semana':
         antecedents      consequents  antecedent support  consequent support  \
20  (Fim_de_Semana)   (EOVRDJOPANVP)            0.099514            0.159906   
21   (EOVRDJOPANVP)  (Fim_de_Semana)            0.159906            0.099514   
18  (Fim_de_Semana)        (Inverno)            0.099514            0.224566   
19        (Inverno)  (Fim_de_Semana)            0.224566            0.099514   
12  (Fim_de_Semana)         (Outono)            0.099514            0.274379   
13         (Outono)  (Fim_de_Semana)            0.274379            0.099514   
16  (Fim_de_Semana)      (Primavera)            0.099514            0.235547   
17      (Primavera)  (Fim_de_Semana)            0.235547            0.099514   
22  (Fim_de_Semana)          (RMPDR)            0.099514            0.395760   
23          (RMPDR)  (Fim_de_Semana)            0.395760            0.099514   

     support  confidence      lift  leverage  conviction  zhangs_metric  
20  0.0

In [18]:
# This code only serves to present the data in a shorter form in chapter 7 of the dissertation

# Select main columns
rules_selected = rules_weekend[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


# View the formatted table
print("Regras que envolvem 'Fim_de_Semana:\n")
print(rules_selected.head(25))

Regras que envolvem 'Fim_de_Semana:

        antecedents      consequents   support  confidence      lift
20  (Fim_de_Semana)   (EOVRDJOPANVP)  0.019311    0.194052  1.213534
21   (EOVRDJOPANVP)  (Fim_de_Semana)  0.019311    0.120764  1.213534
18  (Fim_de_Semana)        (Inverno)  0.022635    0.227454  1.012861
19        (Inverno)  (Fim_de_Semana)  0.022635    0.100794  1.012861
12  (Fim_de_Semana)         (Outono)  0.027408    0.275422  1.003800
13         (Outono)  (Fim_de_Semana)  0.027408    0.099892  1.003800
16  (Fim_de_Semana)      (Primavera)  0.022954    0.230660  0.979256
17      (Primavera)  (Fim_de_Semana)  0.022954    0.097450  0.979256
22  (Fim_de_Semana)          (RMPDR)  0.013552    0.136179  0.344094
23          (RMPDR)  (Fim_de_Semana)  0.013552    0.034242  0.344094


In [19]:
seasons = ['Inverno', 'Verao', 'Primavera', 'Outono']
rules_seasons = rules[rules['antecedents'].apply(lambda x: any(season in str(x) for season in seasons)) |
                      rules['consequents'].apply(lambda x: any(season in str(x) for season in seasons))]

print("Regras que envolvem as estações do ano:\n", rules_seasons.head(30))


Regras que envolvem as estações do ano:
         antecedents      consequents  antecedent support  consequent support  \
31             (CA)        (Inverno)            0.036565            0.224566   
30        (Inverno)             (CA)            0.224566            0.036565   
27          (RJPDR)        (Inverno)            0.038769            0.224566   
26        (Inverno)          (RJPDR)            0.224566            0.038769   
34         (Outono)     (RNADRDDRSU)            0.274379            0.032691   
35     (RNADRDDRSU)         (Outono)            0.032691            0.274379   
29             (CA)         (Outono)            0.036565            0.274379   
28         (Outono)             (CA)            0.274379            0.036565   
8    (EOVRDJOPANVP)      (Primavera)            0.159906            0.235547   
9       (Primavera)   (EOVRDJOPANVP)            0.235547            0.159906   
2         (Inverno)          (RMPDR)            0.224566            0.395760   

In [20]:
# This code only serves to present the data in a shorter form in chapter 7 of the dissertation

# Select the main column
rules_selected = rules_seasons[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


# View the formatted table
print("Regras que envolvem as estações do ano:\n")
print(rules_selected.head(25))

Regras que envolvem as estações do ano:

        antecedents      consequents   support  confidence      lift
31             (CA)        (Inverno)  0.011261    0.307978  1.371437
30        (Inverno)             (CA)  0.011261    0.050147  1.371437
27          (RJPDR)        (Inverno)  0.011318    0.291936  1.300003
26        (Inverno)          (RJPDR)  0.011318    0.050400  1.300003
34         (Outono)     (RNADRDDRSU)  0.010192    0.037144  1.136227
35     (RNADRDDRSU)         (Outono)  0.010192    0.311757  1.136227
29             (CA)         (Outono)  0.011304    0.309144  1.126703
28         (Outono)             (CA)  0.011304    0.041198  1.126703
8    (EOVRDJOPANVP)      (Primavera)  0.041177    0.257507  1.093233
9       (Primavera)   (EOVRDJOPANVP)  0.041177    0.174815  1.093233
2         (Inverno)          (RMPDR)  0.091771    0.408658  1.032589
3           (RMPDR)        (Inverno)  0.091771    0.231884  1.032589
32            (PED)         (Outono)  0.010500    0.282880  1.

In [18]:
# Rules, filter = 'Feriado'
rules_feriado = rules[(rules['antecedents'].apply(lambda x: 'Feriado' in str(x))) |
                      (rules['consequents'].apply(lambda x: 'Feriado' in str(x)))]

print("Regras que envolvem 'Feriado':\n", rules_feriado.head(30))

Regras que envolvem 'Feriado':
 Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []
