<a href="https://colab.research.google.com/github/GiliardGodoi/tj-datasets/blob/main/notebooks/preprocessamento/2024_04_18_Segmentado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive, userdata
# Mount Google Drive at /content/drive; the dataset paths configured later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install "tjdatasets @ git+https://github.com/GiliardGodoi/tj-datasets@april-24"

Collecting tjdatasets@ git+https://github.com/GiliardGodoi/tj-datasets@april-24
  Cloning https://github.com/GiliardGodoi/tj-datasets (to revision april-24) to /tmp/pip-install-amk0vl8y/tjdatasets_daa6041328c74901b38eeefe64a1dcb8
  Running command git clone --filter=blob:none --quiet https://github.com/GiliardGodoi/tj-datasets /tmp/pip-install-amk0vl8y/tjdatasets_daa6041328c74901b38eeefe64a1dcb8
  Running command git checkout -b april-24 --track origin/april-24
  Switched to a new branch 'april-24'
  Branch 'april-24' set up to track remote branch 'april-24' from 'origin'.
  Resolved https://github.com/GiliardGodoi/tj-datasets to commit eede31ef99c60a46d1ff7a586f9ef436fefe4320
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: tjdatasets
  Building wheel for tjdatasets (pypr

In [None]:
import pandas as pd
import re

from pathlib import Path

In [None]:
from tjdatasets.segmentador import sentencizer, SEGMENT_EXPRESSIONS

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from ipywidgets import Box, Layout, Textarea, IntText, BoundedIntText

def display(df, index, left_column='conteudo', right_column='formatado'):
    """Build a side-by-side widget showing two text columns of a single row.

    NOTE(review): inside IPython/Colab this definition shadows the built-in
    ``display`` helper for the rest of the session.

    Args:
        df: DataFrame holding the texts.
        index: Row label, looked up with ``df.loc``.
        left_column: Column shown in the left text area ("Original:").
        right_column: Column shown in the right text area ("Processado:").

    Returns:
        An ipywidgets ``Box`` containing the two ``Textarea`` widgets.
    """
    # Both panes share the same fixed size so the texts line up visually.
    pane_layout = Layout(width='550px', height='600px')

    panes = [
        Textarea(value=df.loc[index, column], layout=pane_layout, description=label)
        for column, label in ((left_column, "Original:"), (right_column, 'Processado:'))
    ]

    return Box(panes)

In [None]:
# Input and output currently point at the same shared-drive folder; they are kept as
# separate constants so either one can be redirected independently.
BASE_FOLDER = Path('/content/drive/Shareddrives/Projeto_TJSP_Datasets/Datasets/base_abril_2024/PREPROCESSED')
OUTPUT_FOLDER = Path('/content/drive/Shareddrives/Projeto_TJSP_Datasets/Datasets/base_abril_2024/PREPROCESSED')

base_33k_filepath = BASE_FOLDER / "acordaos_principais_33k_Processado.parquet.gzip"
base_40k_filepath = BASE_FOLDER / "acordaos_principais_40k_Processado.parquet.gzip"

# Fail fast, naming the missing file, before any expensive processing starts.
assert base_33k_filepath.exists(), f"Input file not found: {base_33k_filepath}"
assert base_40k_filepath.exists(), f"Input file not found: {base_40k_filepath}"


In [None]:
def __segmentar(frame: pd.DataFrame, segment_name: str, column_senteces: str):
    '''Collect, per process, the sentences that match a segment's expressions.

    A sentence is selected when it matches at least one of the expressions in
    ``SEGMENT_EXPRESSIONS[segment_name]`` (regex, case-insensitive). The
    selected sentences of each ``numero_processo`` are joined with a single
    space.

    Args:
        frame: One row per sentence; must contain ``numero_processo`` and the
            column named by ``column_senteces``. The frame is not modified.
        segment_name: Key looked up in ``SEGMENT_EXPRESSIONS``; also used to
            name the output column.
        column_senteces: Name of the column holding the sentence text.

    Returns:
        DataFrame indexed by ``numero_processo`` with a single column named
        ``segmento_<segment_name>``.
    '''
    column_segment = f'segmento_{segment_name}'
    sentences = frame[column_senteces]

    # Keep the running match in a local mask instead of writing a scratch
    # column into the caller's frame.
    matches = pd.Series(False, index=frame.index, dtype=bool)

    for expression in SEGMENT_EXPRESSIONS[segment_name]:
        # Element-wise OR over the whole column (this is not short-circuit
        # evaluation: every expression is tested against every sentence).
        # na=False makes missing sentences count as "no match" explicitly.
        matches = matches | sentences.str.contains(expression, regex=True, flags=re.I, na=False)

    # Index with a plain boolean array so no index alignment is involved
    # (the index may contain repeated labels).
    return (frame.loc[matches.to_numpy()]
                .groupby(['numero_processo'])
                .agg({column_senteces: lambda values: ' '.join(values)})
                .rename(columns={column_senteces: column_segment})
            )

def obter_todos_segmentos(df : pd.DataFrame, text_column):
    '''Build the ``lei``, ``fato``, ``decisao`` and ``pedido`` segment columns.

    The text in ``text_column`` is passed through ``sentencizer`` (presumably
    returning a list of sentences per text), exploded to one row per sentence,
    and each segment is extracted with ``__segmentar``. Segments are grouped by
    ``numero_processo`` only, so rows that share a process number receive the
    same segment text.

    Args:
        df: Source documents. Must contain ``numero_processo``,
            ``id_documento``, ``data_hora_documento``,
            ``codigos_movimentos_temas`` and ``text_column``. The caller's
            frame is not modified.
        text_column: Name of the column whose text is segmented.

    Returns:
        DataFrame with the four identifying columns above plus
        ``segmento_lei``, ``segmento_fato``, ``segmento_decisao`` and
        ``segmento_pedido`` (missing where no sentence matched), one row per
        row of ``df``.
    '''
    print('...')

    # Normalise dtypes, but only for columns that are actually present:
    # astype raises KeyError for unknown keys, and columns such as 'conteudo'
    # are not needed when text_column points elsewhere.
    wanted_dtypes = dict(numero_processo='category',
                         id_documento='category',
                         conteudo='string',
                         formatado_limpo='string',
                         codigos_movimentos_temas='string')
    df = df.astype({column: dtype
                    for column, dtype in wanted_dtypes.items()
                    if column in df.columns})

    # This is the slowest part of the function: sentence splitting runs once
    # per document.
    frame = df[['numero_processo', 'id_documento']].copy()
    frame['sentences'] = df[text_column].apply(sentencizer)
    frame = frame.explode('sentences')

    columns = [
        'numero_processo',
        'id_documento',
        'data_hora_documento',
        'codigos_movimentos_temas',
    ]

    # Left-merge each segment so every input row is kept even when a process
    # has no matching sentence for that segment.
    result = df[columns]
    for segment_name in ('lei', 'fato', 'decisao', 'pedido'):
        segment = __segmentar(frame, segment_name=segment_name, column_senteces='sentences')
        result = pd.merge(result, segment, how='left', left_on='numero_processo', right_index=True)

    return result

In [None]:
# Segment each preprocessed dataset and write the result next to it, swapping
# "Processado" for "Segmentado" in the file name.
for filepath in [base_33k_filepath, base_40k_filepath]:
    print('Processando...', filepath.name)

    # Work out the destination before the slow segmentation: if the name does
    # not contain 'Processado' the output would land on top of the input file.
    output_name = filepath.name.replace('Processado', 'Segmentado')
    dest = OUTPUT_FOLDER / output_name
    if dest == filepath:
        raise ValueError(f'Refusing to overwrite the input file: {filepath}')

    df = pd.read_parquet(filepath)
    print('Lido...', df.shape)
    print(df.columns)

    frame = obter_todos_segmentos(df, text_column='formatado_limpo')

    print('Salvando em...', output_name)
    print('Segmentos:...', frame.shape)
    print(frame.columns)
    frame.to_parquet(dest, compression='gzip')

    print('Salvo...', dest)

Processando... acordaos_principais_33k_Processado.parquet.gzip
Lido... (33016, 7)
Index(['numero_processo', 'id_documento', 'conteudo', 'data_hora_documento',
       'codigos_movimentos_temas', 'formatado_limpo', 'formatado_normalizado'],
      dtype='object')
...
Salvando em... acordaos_principais_33k_Segmentado.parquet.gzip
Segmentos:... (33016, 8)
Index(['numero_processo', 'id_documento', 'data_hora_documento',
       'codigos_movimentos_temas', 'segmento_lei', 'segmento_fato',
       'segmento_decisao', 'segmento_pedido'],
      dtype='object')
Salvo... /content/drive/Shareddrives/Projeto_TJSP_Datasets/Datasets/base_abril_2024/PREPROCESSED/acordaos_principais_33k_Segmentado.parquet.gzip
Processando... acordaos_principais_40k_Processado.parquet.gzip
Lido... (41374, 7)
Index(['numero_processo', 'id_documento', 'data_hora_documento', 'conteudo',
       'codigos_movimentos_temas', 'formatado_limpo', 'formatado_normalizado'],
      dtype='object')
...
Salvando em... acordaos_principais_