Este notebook tem o objetivo de analisar o dataset "Acompanhamento PCD", extrair insights a partir dos dados e, assim, contribuir para o nosso projeto.

Importação de bibliotecas necessárias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

A função desta seção do código é corrigir valores nulos e inconsistências no dataset, como falta de títulos nas colunas e linhas em branco.

In [None]:
# Class for loading and processing the Excel data, setting the correct headers
class DataProcessor:
    """Load the spreadsheet and promote its embedded header row to column labels."""

    def __init__(self, file_path):
        """Remember the workbook location; nothing is read until load_data()."""
        self.data_frame = None
        self.file_path = file_path

    def load_data(self):
        """Read the workbook and store the resulting table on the instance.

        The sheet is read without a header. The row at position 2 becomes
        the column labels, and rows 0, 1 and 2 are removed from the data
        before the index is renumbered from zero.
        """
        raw_sheet = pd.read_excel(self.file_path, header=None)
        raw_sheet.columns = raw_sheet.iloc[2]
        body_rows = raw_sheet.drop(index=[0, 1, 2])
        self.data_frame = body_rows.reset_index(drop=True)

    def get_data(self):
        """Return the loaded DataFrame, or None if load_data() has not run."""
        return self.data_frame


# Load the spreadsheet into the notebook-wide `df` used by the following cells.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere without editing it.
file_path = r'C:\Users\Pedro Faria\OneDrive\Documentos\GitHub\2024-2B-T10-SI08-G03\src\dados\Acompanhamento PCD (1).xlsx'
data_processor = DataProcessor(file_path)
data_processor.load_data()
df = data_processor.get_data()


In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:

# Class for checking if data cleaning or preprocessing is needed
class DataQualityChecker:
    """Read-only diagnostics used to decide whether the data needs cleaning."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def check_null_values(self):
        """Return the number of null cells in each column."""
        null_mask = self.data_frame.isnull()
        return null_mask.sum()

    def check_duplicates(self):
        """Return how many rows repeat an earlier row."""
        repeated_rows = self.data_frame.duplicated()
        return repeated_rows.sum()

    def check_column_types(self):
        """Return the dtype of each column."""
        return self.data_frame.dtypes

    def check_inconsistent_values(self):
        """Return ``{column: value_counts}`` for every object-dtype column.

        The value frequencies are meant to be inspected by eye to spot
        spelling variants of the same category.
        """
        frame = self.data_frame
        return {
            name: frame[name].value_counts()
            for name in frame.columns
            if frame[name].dtype == 'object'
        }


# Run every quality check on the loaded frame and print each report.
data_quality_checker = DataQualityChecker(df)

null_values = data_quality_checker.check_null_values()
print("Null values per column:\n", null_values)

duplicates = data_quality_checker.check_duplicates()
print("\nNumber of duplicate rows:", duplicates)

column_types = data_quality_checker.check_column_types()
print("\nColumn data types:\n", column_types)

# One value_counts() report per object-dtype column, for manual inspection.
inconsistent_values = data_quality_checker.check_inconsistent_values()
print("\nInconsistent values in categorical columns:\n", inconsistent_values)

In [None]:

# Class for cleaning and preserving important data
class DataCleaner:
    """Derive cleaned, snake_case columns while keeping the original ones.

    The original columns are never overwritten; each step writes its
    result into a new lower-case column. Every step returns the current
    working frame, and the steps can be called in any order.
    """

    # (source column, derived column, placeholder used for missing values)
    _NULL_PLACEHOLDERS = (
        ('Dt Destino', 'dt_destino', 'Não informado'),
        ('Dt Origem', 'dt_origem', 'Não informado'),
        ('Tx Porta', 'tx_porta', 'Desconhecido'),
        ('Tx Obs', 'tx_obs', 'Sem Observação'),
    )

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def handle_null_values(self):
        """Fill missing values into derived columns and drop rows without a PCD group.

        The derived columns are added to the frame passed to the
        constructor (as before). The row filtering then produces a new,
        independent frame that becomes the working frame.

        Returns:
            The working frame after rows with a null 'Grupos PCD' are removed.
        """
        for source, target, placeholder in self._NULL_PLACEHOLDERS:
            self.data_frame[target] = self.data_frame[source].fillna(placeholder)
        # Take an explicit copy: the later steps assign new columns to this
        # frame, and assigning into an un-copied filter result is the
        # chained-assignment pattern pandas warns about.
        self.data_frame = self.data_frame.dropna(subset=['Grupos PCD']).copy()
        return self.data_frame

    def convert_column_types(self):
        """Add 'fl_alerta' as the numeric form of 'Fl Alerta'.

        Values that cannot be parsed as numbers become NaN.
        """
        self.data_frame['fl_alerta'] = pd.to_numeric(self.data_frame['Fl Alerta'], errors='coerce')
        return self.data_frame

    def clean_inconsistent_values(self):
        """Collapse the door spellings '1q' and '01' into '1' in 'tx_porta'."""
        if 'tx_porta' not in self.data_frame.columns:
            # handle_null_values() has not run yet: derive the column here
            # instead of failing with a KeyError.
            self.data_frame['tx_porta'] = self.data_frame['Tx Porta'].fillna('Desconhecido')
        self.data_frame['tx_porta'] = self.data_frame['tx_porta'].replace(['1q', '01'], '1')
        return self.data_frame

    def get_cleaned_data(self):
        """Return the working frame wrapped in a new DataFrame object."""
        return pd.DataFrame(self.data_frame)
    
# Apply the cleaning steps in sequence; each call returns the cleaner's
# current working frame, so `df_cleaned` ends up holding the final result.
data_cleaner = DataCleaner(df)
df_cleaned = data_cleaner.handle_null_values()
df_cleaned = data_cleaner.convert_column_types()
df_cleaned = data_cleaner.clean_inconsistent_values()

Aqui identificaremos as colunas mais importantes e analisaremos seus valores para futuras comparações ou gerações de gráficos.

In [None]:
# Class for analyzing the unique stations in the dataset
class DataAnalyzer:
    """List the distinct destination stations present in the data."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def get_unique_stations(self):
        """Return the distinct 'Tx Estacao Destino' values in order of first appearance."""
        destination_column = self.data_frame['Tx Estacao Destino']
        return destination_column.unique()


# Print the distinct destination stations found in the loaded frame.
data_analyzer = DataAnalyzer(df)
unique_stations = data_analyzer.get_unique_stations()
print("Unique stations in the dataset:", unique_stations)

In [None]:
 # This class applies basic data analysis on the loaded DataFrame
class DataAnalyzer:
    """Frequency summaries over the main columns of the loaded frame."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def _frequencies(self, column):
        """Return how often each value of *column* occurs."""
        return self.data_frame[column].value_counts()

    def analyze_pcd_group(self):
        """Occurrences per PCD group."""
        return self._frequencies('Grupos PCD')

    def analyze_station_dest(self):
        """Occurrences per destination station."""
        return self._frequencies('Tx Estacao Destino')

    def analyze_trains(self):
        """Occurrences per train."""
        return self._frequencies('Tx Trem')

    def analyze_no_alerts(self):
        """Number of rows whose 'Fl Alerta' equals 0."""
        without_alert = self.data_frame['Fl Alerta'] == 0
        return len(self.data_frame[without_alert])

    def analyze_dest_dates(self):
        """Occurrences per 'Dt Destino' value."""
        return self._frequencies('Dt Destino')


# Compute each frequency summary, then print them together.
# NOTE(review): this runs on `df`, not on `df_cleaned` - confirm that is intended.
data_analyzer = DataAnalyzer(df)

pcd_group_analysis = data_analyzer.analyze_pcd_group()
station_dest_analysis = data_analyzer.analyze_station_dest()
train_analysis = data_analyzer.analyze_trains()
no_alerts_count = data_analyzer.analyze_no_alerts()
dest_dates_analysis = data_analyzer.analyze_dest_dates()

print("Occurrences by PCD Group:\n", pcd_group_analysis)
print("\nOccurrences by Destination Station:\n", station_dest_analysis)
print("\nOccurrences by Train:\n", train_analysis)
print("\nCount of No-Alert Incidents:", no_alerts_count)
print("\nDistribution by Destination Date:\n", dest_dates_analysis)


Neste conjunto de código iremos analisar as ocorrências de PCD, tentando achar correlações entre as ocorrências e períodos do ano e da semana, além das estações.

In [None]:
# Class for analyzing the PCD occurrences by month
class DataAnalyzer:
    """Count occurrences per calendar month of the destination date and plot them."""

    _MONTH_LABELS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def analyze_pcd_by_month(self):
        """Return the number of rows per month number (1-12) of 'Dt Destino'.

        Side effect: stores the parsed dates in a 'dt_destino' column of the
        frame. Values that cannot be parsed become NaT and are not counted.
        Months with no rows are absent from the result.
        """
        self.data_frame['dt_destino'] = pd.to_datetime(self.data_frame['Dt Destino'], errors='coerce')
        monthly_pcd = self.data_frame.groupby(self.data_frame['dt_destino'].dt.month).size()
        return monthly_pcd

    @staticmethod
    def _fill_missing_months(monthly_pcd):
        """Return the counts indexed by all twelve month numbers, 0 where absent."""
        return monthly_pcd.reindex(list(range(1, 13)), fill_value=0)

    def plot_pcd_by_month(self, monthly_pcd):
        """Draw a bar chart with one bar per calendar month.

        Args:
            monthly_pcd: Series of counts indexed by month number, as
                returned by analyze_pcd_by_month(). It is not modified.
        """
        # The tick labels below are positional (bar 0 = Jan ... bar 11 = Dec),
        # so the series must contain exactly one entry per month; otherwise
        # the bars of a partial year would sit under the wrong month names.
        full_year = self._fill_missing_months(monthly_pcd)
        plt.figure(figsize=(10, 6))
        full_year.plot(kind='bar', color='blue')
        plt.title('PCD Occurrences by Month')
        plt.xlabel('Month')
        plt.ylabel('Number of PCD Occurrences')
        plt.xticks(ticks=range(12), labels=self._MONTH_LABELS, rotation=45)
        plt.grid(True)
        plt.show()


# Count occurrences per month of the destination date and draw the bar chart.
data_analyzer = DataAnalyzer(df)
monthly_pcd_analysis = data_analyzer.analyze_pcd_by_month()
data_analyzer.plot_pcd_by_month(monthly_pcd_analysis)


In [None]:
# Class for analyzing the volume of PCD by station
class DataAnalyzer:
    """Count and plot occurrences per origin station."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def analyze_pcd_by_station(self):
        """Return the number of rows per 'Id Estacao Origem', most frequent first."""
        origin_ids = self.data_frame['Id Estacao Origem']
        return origin_ids.value_counts()

    def plot_pcd_by_station(self, station_pcd):
        """Draw the per-station counts as a green bar chart."""
        plt.figure(figsize=(12, 6))
        station_pcd.plot(kind='bar', color='green')
        # Text first, then tick and grid styling.
        plt.xlabel('Station')
        plt.ylabel('Number of PCD Occurrences')
        plt.title('PCD Volume by Station')
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.show()


# Count occurrences per origin station and draw the bar chart.
data_analyzer = DataAnalyzer(df)
station_pcd_analysis = data_analyzer.analyze_pcd_by_station()
data_analyzer.plot_pcd_by_station(station_pcd_analysis)

In [None]:
# Class for analyzing PCD occurrences by day of the week
class DataAnalyzer:
    """Count occurrences per day of the week of the destination date and plot them."""

    # Keyed by the pandas `dayofweek` code (Monday=0 ... Sunday=6) and listed
    # in display order, which starts on Sunday.
    _DAY_LABELS = {
        6: 'Domingo',
        0: 'Segunda-feira',
        1: 'Terça-feira',
        2: 'Quarta-feira',
        3: 'Quinta-feira',
        4: 'Sexta-feira',
        5: 'Sábado',
    }

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def analyze_pcd_by_day_of_week(self):
        """Return the number of rows per `dayofweek` code of 'Dt Destino'.

        Side effect: stores the parsed dates in 'dt_destino' and the weekday
        code in 'day_of_week'. Values that cannot be parsed become NaT and
        are not counted. Weekdays with no rows are absent from the result.
        """
        self.data_frame['dt_destino'] = pd.to_datetime(self.data_frame['Dt Destino'], errors='coerce')
        self.data_frame['day_of_week'] = self.data_frame['dt_destino'].dt.dayofweek
        day_of_week_pcd = self.data_frame.groupby('day_of_week').size()
        return day_of_week_pcd

    @classmethod
    def _label_days(cls, day_of_week_pcd):
        """Return a new Series with all seven days, Sunday first, indexed by name.

        Labels are matched by weekday code rather than by position, so each
        count keeps its own day, and days missing from the input get 0.
        """
        ordered = day_of_week_pcd.reindex(list(cls._DAY_LABELS), fill_value=0)
        ordered.index = list(cls._DAY_LABELS.values())
        return ordered

    def plot_pcd_by_day_of_week(self, day_of_week_pcd):
        """Draw a bar chart with one bar per weekday.

        Args:
            day_of_week_pcd: Series of counts indexed by `dayofweek` code, as
                returned by analyze_pcd_by_day_of_week(). It is not modified.
        """
        labelled = self._label_days(day_of_week_pcd)

        plt.figure(figsize=(10, 6))
        labelled.plot(kind='bar', color='orange')
        plt.title('Ocorrências de PCD por Dia da Semana')
        plt.xlabel('Dia da Semana')
        plt.ylabel('Número de Ocorrências de PCD')
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.show()

# Count occurrences per weekday of the destination date and draw the bar chart.
data_analyzer = DataAnalyzer(df)
pcd_day_of_week_analysis = data_analyzer.analyze_pcd_by_day_of_week()
data_analyzer.plot_pcd_by_day_of_week(pcd_day_of_week_analysis)

Aqui será gerado um gráfico para descobrir os grupos PCD que mais aparecem com as ocorrências.

In [None]:
# Class for analyzing PCD occurrences by group
class DataAnalyzer:
    """Count and plot occurrences per PCD group."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def analyze_pcd_by_group(self):
        """Return the number of rows per 'Grupos PCD' value, most frequent first."""
        groups = self.data_frame['Grupos PCD']
        return groups.value_counts()

    def plot_pcd_by_group(self, group_pcd):
        """Draw the per-group counts as a purple bar chart."""
        plt.figure(figsize=(10, 6))
        group_pcd.plot(kind='bar', color='purple')
        # Text first, then tick and grid styling.
        plt.xlabel('PCD Group')
        plt.ylabel('Number of PCD Occurrences')
        plt.title('PCD Occurrences by Group')
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.show()

# Count occurrences per PCD group and draw the bar chart.
data_analyzer = DataAnalyzer(df)
pcd_group_analysis = data_analyzer.analyze_pcd_by_group()
data_analyzer.plot_pcd_by_group(pcd_group_analysis)


Aqui irei começar a tentar achar relações entre os valores do alerta (1 ou 0), concluindo que alertas que têm o valor 1 são mais graves.

In [None]:
# Class for analyzing PCD occurrences with and without alerts
class DataAnalyzer:
    """Compare the number of rows flagged with an alert against those without."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def analyze_pcd_with_and_without_alerts(self):
        """Return the row counts for 'Fl Alerta' == 1 and 'Fl Alerta' == 0.

        Rows holding any other value are counted in neither bucket.
        """
        frame = self.data_frame
        flag = frame['Fl Alerta']
        return {
            'With Alerts': len(frame[flag == 1]),
            'Without Alerts': len(frame[flag == 0]),
        }

    def plot_pcd_with_and_without_alerts(self, alert_data):
        """Draw the two counts as a pie chart with percentage labels."""
        slice_names = list(alert_data)
        slice_sizes = [alert_data[name] for name in slice_names]

        plt.figure(figsize=(6, 6))
        plt.pie(
            slice_sizes,
            labels=slice_names,
            autopct='%1.1f%%',
            startangle=90,
            colors=['lightcoral', 'skyblue'],
        )
        plt.title('PCD Occurrences With and Without Alerts')
        plt.show()

# Count rows with and without an alert flag and draw the pie chart.
data_analyzer = DataAnalyzer(df)
alert_data = data_analyzer.analyze_pcd_with_and_without_alerts()
data_analyzer.plot_pcd_with_and_without_alerts(alert_data)


In [None]:

# Analyzes the 'fl_alerta' column and related data
class AlertAnalyzer:
    """Analyzes the 'fl_alerta' column and related data.

    The constructor strips surrounding whitespace from the column labels of
    the frame it receives (in place). Methods that take `alert_df` expect a
    subset of that frame, typically the result of filter_alerts().
    """

    def __init__(self, data_frame):
        self.data_frame = data_frame
        self.data_frame.columns = self.data_frame.columns.str.strip()

    def get_alert_counts(self):
        """Return the number of rows per 'fl_alerta' value."""
        return self.data_frame['fl_alerta'].value_counts()

    def get_alert_percentages(self):
        """Return the share of rows per 'fl_alerta' value, in percent."""
        return self.data_frame['fl_alerta'].value_counts(normalize=True) * 100

    def filter_alerts(self, alert_value=1):
        """Return the rows whose 'fl_alerta' equals *alert_value*."""
        return self.data_frame[self.data_frame['fl_alerta'] == alert_value]

    def analyze_pcd_groups(self, alert_df):
        """Return the number of rows per PCD group in *alert_df*."""
        return alert_df['Grupos PCD'].value_counts()

    def analyze_origin_stations(self, alert_df):
        """Return the number of rows per origin station id in *alert_df*."""
        return alert_df['Id Estacao Origem'].value_counts()

    def analyze_alerts_by_hour(self, alert_df):
        """Return the number of rows per hour of day of 'dt_origem', sorted by hour.

        Values of 'dt_origem' that cannot be parsed as dates are ignored.
        *alert_df* is left untouched. The previous implementation stored its
        intermediate results in the caller's frame, which wrote into a
        filtered slice and replaced the original 'Dt Origem' column.
        """
        origin_times = pd.to_datetime(alert_df['dt_origem'], errors='coerce')
        hour_of_origin = origin_times.dt.hour.rename('hour_origin')
        return hour_of_origin.value_counts().sort_index()

    def plot_alert_distribution(self):
        """Draw a count plot of the 'fl_alerta' values."""
        sns.countplot(x='fl_alerta', data=self.data_frame)
        plt.title('Distribution of fl_alerta values')
        plt.xlabel('fl_alerta')
        plt.ylabel('Count')
        plt.show()

    def plot_pcd_groups(self, alert_df):
        """Draw a horizontal count plot of PCD groups, most frequent first."""
        plt.figure(figsize=(10,6))
        sns.countplot(y='Grupos PCD', data=alert_df, order=alert_df['Grupos PCD'].value_counts().index)
        plt.title('PCD Groups with fl_alerta == 1')
        plt.xlabel('Count')
        plt.ylabel('Grupos PCD')
        plt.show()

    def plot_alerts_by_hour(self, alert_counts_by_hour):
        """Draw the hourly counts as a line chart with ticks for hours 0-23."""
        plt.figure(figsize=(10,6))
        alert_counts_by_hour.plot(kind='line', marker='o')
        plt.title('Alerts with fl_alerta == 1 by Hour')
        plt.xlabel('Hour of Day')
        plt.ylabel('Number of Alerts')
        plt.xticks(range(0,24))
        plt.grid(True)
        plt.show()

    def get_alert_observations(self, alert_df):
        """Return the distinct 'tx_obs' values found in *alert_df*."""
        return alert_df['tx_obs'].unique()

    def get_numeric_correlations(self):
        """Return the correlation of each float64/int64 column with 'fl_alerta'."""
        numeric_cols = self.data_frame.select_dtypes(include=['float64', 'int64']).columns
        return self.data_frame[numeric_cols].corr()['fl_alerta']

# Run the alert analysis on the cleaned frame: compute the summaries, show
# the three plots, then print every summary.
alert_analyzer = AlertAnalyzer(df_cleaned)

alert_counts = alert_analyzer.get_alert_counts()
alert_percentages = alert_analyzer.get_alert_percentages()
# Subset with fl_alerta == 1; the per-group, per-station and per-hour
# summaries below are computed on it.
df_alerts_1 = alert_analyzer.filter_alerts(alert_value=1)
pcd_group_counts = alert_analyzer.analyze_pcd_groups(df_alerts_1)
origin_station_counts = alert_analyzer.analyze_origin_stations(df_alerts_1)
alert_counts_by_hour = alert_analyzer.analyze_alerts_by_hour(df_alerts_1)
alert_analyzer.plot_alert_distribution()
alert_analyzer.plot_pcd_groups(df_alerts_1)
alert_analyzer.plot_alerts_by_hour(alert_counts_by_hour)
alert_observations = alert_analyzer.get_alert_observations(df_alerts_1)
alert_correlations = alert_analyzer.get_numeric_correlations()

print("Alert counts:\n", alert_counts)
print("\nAlert percentages:\n", alert_percentages)
print("\nPCD group counts in alerts:\n", pcd_group_counts)
print("\nOrigin station counts in alerts:\n", origin_station_counts)
print("\nAlert counts by hour:\n", alert_counts_by_hour)
print("\nUnique observations in alerts:\n", alert_observations)
print("\nCorrelations with 'fl_alerta':\n", alert_correlations)


In [None]:
# Class for analyzing alerts in the dataset
class AlertAnalyzer:
    """Rank origin stations by the number of alert rows."""

    def __init__(self, data_frame):
        # Column labels are stripped of surrounding whitespace in place.
        data_frame.columns = data_frame.columns.str.strip()
        self.data_frame = data_frame

    def filter_alerts(self, alert_value=1):
        """Return the rows whose 'fl_alerta' equals *alert_value*."""
        matches = self.data_frame['fl_alerta'] == alert_value
        return self.data_frame[matches]

    def get_stations_with_most_alerts(self, alert_df):
        """Return the number of rows per origin station id, most frequent first."""
        origin_ids = alert_df['Id Estacao Origem']
        return origin_ids.value_counts()

    def plot_stations_with_most_alerts(self, station_alert_counts, top_n=10):
        """Draw a horizontal bar chart of the first *top_n* entries."""
        leaders = station_alert_counts.head(top_n)
        plt.figure(figsize=(10,6))
        sns.barplot(x=leaders.values, y=leaders.index, orient='h')
        plt.xlabel('Number of Alerts')
        plt.ylabel('Station ID')
        plt.title(f'Top {top_n} Stations with Most Alerts (fl_alerta == 1)')
        plt.show()

# Rank origin stations by the number of rows with fl_alerta == 1, print the
# full ranking and plot the top entries.
alert_analyzer = AlertAnalyzer(df_cleaned)
df_alerts_1 = alert_analyzer.filter_alerts(alert_value=1)
station_alert_counts = alert_analyzer.get_stations_with_most_alerts(df_alerts_1)
print("Stations with most alerts (fl_alerta == 1):\n", station_alert_counts)
alert_analyzer.plot_stations_with_most_alerts(station_alert_counts)

In [None]:
# Class for analyzing alerts by destination stations using 'Tx Estacao Destino'
class DestinationAlertAnalyzer:
    """Rank destination stations ('Tx Estacao Destino') by the number of alert rows."""

    def __init__(self, data_frame):
        # Column labels are stripped of surrounding whitespace in place.
        data_frame.columns = data_frame.columns.str.strip()
        self.data_frame = data_frame

    def filter_alerts(self, alert_value=1):
        """Return the rows whose 'fl_alerta' equals *alert_value*."""
        matches = self.data_frame['fl_alerta'] == alert_value
        return self.data_frame[matches]

    def get_destination_stations_with_most_alerts(self, alert_df):
        """Return the number of rows per destination station, most frequent first."""
        destinations = alert_df['Tx Estacao Destino']
        return destinations.value_counts()

    def plot_destination_stations_with_most_alerts(self, station_alert_counts, top_n=10):
        """Draw a horizontal bar chart of the first *top_n* entries."""
        leaders = station_alert_counts.head(top_n)
        plt.figure(figsize=(10,6))
        sns.barplot(x=leaders.values, y=leaders.index, orient='h')
        plt.xlabel('Number of Alerts')
        plt.ylabel('Destination Station')
        plt.title(f'Top {top_n} Destination Stations with Most Alerts (fl_alerta == 1)')
        plt.show()

# Rank destination stations by the number of rows with fl_alerta == 1, print
# the full ranking and plot the top entries.
# Note: this rebinds `alert_analyzer`, `df_alerts_1` and `station_alert_counts`.
alert_analyzer = DestinationAlertAnalyzer(df_cleaned)
df_alerts_1 = alert_analyzer.filter_alerts(alert_value=1)
station_alert_counts = alert_analyzer.get_destination_stations_with_most_alerts(df_alerts_1)
print("Destination stations with most alerts (fl_alerta == 1):\n", station_alert_counts)
alert_analyzer.plot_destination_stations_with_most_alerts(station_alert_counts)


Nesta parte iremos avaliar o volume (agora de forma visual) para entender melhor a quantidade de ocorrências de cada grupo nas estações. Estão sendo consideradas separadamente as estações de origem e de destino.

In [None]:
# Analyzes the volume of each PCD type per origin station
class PCDVolumeAnalyzer:
    """Volume of each PCD group per origin station."""

    def __init__(self, data_frame):
        self.data_frame = data_frame

    def compute_pcd_volume_per_station(self):
        """Return one row per (PCD group, origin station) pair with its row count.

        The result has the columns 'Grupos PCD', 'Id Estacao Origem' and 'count'.
        """
        grouped = self.data_frame.groupby(['Grupos PCD', 'Id Estacao Origem'])
        pair_sizes = grouped.size()
        return pair_sizes.reset_index(name='count')

    def plot_pcd_volume_per_station(self, pcd_station_counts, pcd_type):
        """Draw the per-station counts of a single PCD group as a bar chart."""
        is_selected = pcd_station_counts['Grupos PCD'] == pcd_type
        selected_rows = pcd_station_counts[is_selected]
        plt.figure(figsize=(12,6))
        sns.barplot(x='Id Estacao Origem', y='count', data=selected_rows)
        plt.xlabel('Station')
        plt.ylabel('Volume')
        plt.title(f'Volume of {pcd_type} per Station')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

# Compute the group/station volumes once, then draw one chart per PCD group.
pcd_analyzer = PCDVolumeAnalyzer(df_cleaned)
pcd_station_counts = pcd_analyzer.compute_pcd_volume_per_station()
# Group names to plot; a name with no rows in the data yields an empty chart.
pcd_types = ['Visual', 'Cadeira Rodas', 'Mob Reduzida', '60+', 'Autista', 'Gestante', 'Oncológico', 'Obeso']

for pcd_type in pcd_types:
    pcd_analyzer.plot_pcd_volume_per_station(pcd_station_counts, pcd_type)

In [None]:
# Analyzes the volume of each PCD type per destination station
class PCDVolumeAnalyzer:
    """Volume of each PCD group per destination station.

    This cell previously grouped by 'Id Estacao Origem', which made it an
    exact duplicate of the origin-station analysis. It now groups by the
    destination column.

    Args:
        data_frame: Table with a 'Grupos PCD' column and the station column.
        station_column: Column identifying the station. Defaults to
            'Tx Estacao Destino', the destination column used elsewhere in
            this notebook.
    """

    def __init__(self, data_frame, station_column='Tx Estacao Destino'):
        self.data_frame = data_frame
        self.station_column = station_column

    def compute_pcd_volume_per_station(self):
        """Return one row per (PCD group, station) pair with its row count.

        The result has the columns 'Grupos PCD', the station column and 'count'.
        """
        pcd_station_counts = (
            self.data_frame
            .groupby(['Grupos PCD', self.station_column])
            .size()
            .reset_index(name='count')
        )
        return pcd_station_counts

    def plot_pcd_volume_per_station(self, pcd_station_counts, pcd_type):
        """Draw the per-station counts of a single PCD group as a bar chart."""
        data = pcd_station_counts[pcd_station_counts['Grupos PCD'] == pcd_type]
        plt.figure(figsize=(12,6))
        sns.barplot(x=self.station_column, y='count', data=data)
        plt.title(f'Volume of {pcd_type} per Destination Station')
        plt.xlabel('Destination Station')
        plt.ylabel('Volume')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

# Compute the group/station volumes with the PCDVolumeAnalyzer defined in this
# cell, then draw one chart per PCD group.
pcd_analyzer = PCDVolumeAnalyzer(df_cleaned)
pcd_station_counts = pcd_analyzer.compute_pcd_volume_per_station()
# Group names to plot; a name with no rows in the data yields an empty chart.
pcd_types = ['Visual', 'Cadeira Rodas', 'Mob Reduzida', '60+', 'Autista', 'Gestante', 'Oncológico', 'Obeso']

for pcd_type in pcd_types:
    pcd_analyzer.plot_pcd_volume_per_station(pcd_station_counts, pcd_type)

Com base na análise realizada, os seguintes insights foram identificados:

1. **Estações com Maior Volume de Ocorrências**: As estações com os IDs 64, 9, 13, 94, 16 e 41 apresentam o maior número de ocorrências envolvendo PCD. Isso indica a necessidade de atenção especial nessas localidades para melhorar a acessibilidade e o suporte oferecido.

2. **Meses com Mais Ocorrências**: Os meses de **junho**, **julho** e **agosto** registram um aumento significativo no número de ocorrências.

3. **Horário de Pico das Ocorrências**: O maior volume de ocorrências ocorre entre as **16h e 19h**, correspondendo ao horário de pico. Isso destaca a importância de reforçar o suporte e a assistência às PCD durante esse intervalo de tempo.

## Conversão para parquet

In [None]:
from services.parquet import Conversor

conversor = Conversor()

# Write the cleaned frame to a local parquet file. The file name is lower-case
# so that it matches the "acompanhamento_pcd.parquet" name the upload step
# uses; with the previous mixed-case name the upload could not find the file
# on a case-sensitive filesystem.
conversor.df_to_parquet(data_cleaner.get_cleaned_data(), "./acompanhamento_pcd.parquet")

## Upload para o S3

In [None]:
from services.aws_conn import AwsConn

aws_conn = AwsConn()

# Upload the parquet file to the "big-data-PCD" bucket.
# NOTE(review): send_to_s3 is defined outside this notebook; which of the two
# file-name arguments is the local path and which is the object key should be
# confirmed against its signature.
# NOTE(review): S3 bucket naming rules require lower-case names - confirm that
# "big-data-PCD" is the real bucket name.
aws_conn.send_to_s3("acompanhamento_pcd.parquet", "big-data-PCD", "./acompanhamento_pcd.parquet")