In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [None]:
def obter_cidade_por_cep(cep, timeout=10):
    """Look up the city name for a postal code (CEP) through the ViaCEP API.

    Args:
        cep: Postal code, interpolated into the ViaCEP URL as given.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value of the 'localidade' field of the JSON response, or None
        when the request fails or times out, the status code is not 200,
        or the field is absent from the response.
    """
    # Build the ViaCEP URL for the given CEP
    url = f'https://viacep.com.br/ws/{cep}/json/'

    try:
        # A timeout keeps one stalled connection from hanging the whole
        # notebook; a timeout error is reported by the handler below.
        response = requests.get(url, timeout=timeout)
        # Only a 200 response is expected to carry the address payload
        if response.status_code == 200:
            # Decode the JSON body and read the city name from it
            data = response.json()
            return data.get('localidade')
        print(f"Erro na solicitação. Código de status: {response.status_code}")
    except Exception as e:
        # Best-effort lookup: report the problem and fall through to None
        print(f"Erro ao processar a solicitação: {e}")

    return None


def remove_special_characters(input_string):
    """Return ``input_string`` with every '.' and '-' character deleted."""
    # Mapping a code point to None tells str.translate() to drop that character
    deletions = {ord(symbol): None for symbol in ('.', '-')}
    return input_string.translate(deletions)


def get_text_after_slash(input_string):
    """Return everything that follows the first '/' in ``input_string``.

    An empty string is returned when the input contains no '/'.
    """
    _head, separator, tail = input_string.partition('/')
    # partition() yields an empty separator when '/' is absent
    return tail if separator else ""



def get_text_between_secretario_and_newline(input_string):
    """Return the text between the first 'Secretário' and the next newline.

    The result is stripped of surrounding whitespace. An empty string is
    returned when 'Secretário' does not occur, or when no newline follows it.
    """
    marker = 'Secretário'

    start = input_string.find(marker)
    if start == -1:
        # Marker not present at all
        return ""

    line_end = input_string.find('\n', start)
    if line_end == -1:
        # Marker sits on an unterminated last line
        return ""

    return input_string[start + len(marker):line_end].strip()


def get_words_between_secretarias_and_secretario(input_string):
    """Return the words between the first 'Secretarias' and the first 'Secretário'.

    The text between the two markers is split on whitespace. An empty list
    is returned when either marker is missing.
    """
    opening, closing = 'Secretarias', 'Secretário'

    start = input_string.find(opening)
    stop = input_string.find(closing)
    if start == -1 or stop == -1:
        return []

    # split() with no argument already ignores leading/trailing whitespace
    return input_string[start + len(opening):stop].split()

def extract_words_between_secretarias_and_endereco(input_string):
    """Return the words found between each 'HOME' and the next 'Endereço'.

    Matching is case-insensitive and spans newlines. Every captured span is
    split on whitespace and the words are returned as one flat list, in
    order of appearance. An empty list is returned when nothing matches.
    """
    # Note: despite the function name, the opening anchor is the text 'HOME'
    span_pattern = re.compile(r'HOME(.*?)(?=Endereço)', re.DOTALL | re.IGNORECASE)

    return [
        word
        for match in span_pattern.finditer(input_string)
        for word in match.group(1).split()
    ]

def split_list_at_colon(input_list):
    """Split ``input_list`` right after its first ':' element.

    Returns:
        A ``(before, after)`` tuple: ``before`` ends with the ':' element and
        ``after`` holds everything that follows it.

    Raises:
        ValueError: if ``input_list`` contains no ':' element.
    """
    # Cut point sits just past the colon so the colon stays in the first half
    cut = input_list.index(':') + 1
    return input_list[:cut], input_list[cut:]

In [None]:
# Download the CONSED 'secretarios' page and parse its HTML.
url = 'https://www.consed.org.br/secretarios'
# Without a timeout a stalled server would block the kernel indefinitely
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:


# Each <div class="content"> is read as one entry: the text of its first
# <strong> is stored as the name and the text of its first <p> as the
# address/city string.
div_class = 'content'
namelist = []
citylist = []
ccitlist = []

# find_all is the current Beautiful Soup method name; findAll is a legacy alias
div_element = soup.find_all('div', class_=div_class)
for htmlfile in div_element:
    name = htmlfile.find('strong').text
    namelist.append(name)
    city = htmlfile.find('p').text
    citylist.append(city)

# The last whitespace-separated token of each address string is cleaned of
# '.' and '-' and sent to the CEP lookup to resolve the city name.
for address in citylist:
    cleancep = remove_special_characters(address.split()[-1])
    ccit = obter_cidade_por_cep(cleancep)
    ccitlist.append(ccit)

# Manual overrides by list position.
# NOTE(review): presumably these are entries where the CEP lookup gave no
# usable city; they depend on the page keeping its current order - confirm.
ccitlist[1] = 'Maceió'
ccitlist[2] = 'Manaus'
ccitlist[3] = 'Macapá'
ccitlist[4] = 'Salvador'
ccitlist[7] = 'Vitoria'

print(ccitlist)



In [None]:
# For each address string, take the third token from the end, drop '.' and
# '-', and keep what follows the '/' as the UF value.
uflist = [
    get_text_after_slash(remove_special_characters(entry.split()[-3]))
    for entry in citylist
]
uflist

In [None]:
# Every entry in this frame gets the same role label
funclist = ['Secretário(a) de Educação' for _ in namelist]

column_names = ['Nome', 'Funcao', 'Cidade', 'UF']

# Pair each column name with its list of values
data = list(zip(column_names, (namelist, funclist, ccitlist, uflist)))

# Build the frame from the (column, values) pairs
df = pd.DataFrame({column: values for column, values in data})

# Show the resulting frame
print(df)

In [None]:
finalnames = []
finalfun = []
# Numeric ids of the Ouro Preto 'secretaria' pages to scrape
desired_numbers = [136, 149, 455, 140, 456, 141, 142, 143, 133, 144, 145, 146, 148, 139]

# NOTE: this loop rebinds the notebook-level url / response / soup names.
for number in desired_numbers:
    # The id is zero-padded to four digits in the URL
    url = f'https://ouropreto.mg.gov.br/secretaria/{number:04d}'
    # Without a timeout a single stalled page would block the whole loop
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    txt = soup.get_text()

    # Name: the text after 'Secretário' up to the end of that line, minus
    # its first whitespace-separated token
    wordvec = get_text_between_secretario_and_newline(txt).split()
    wordvec = wordvec[1:]
    result_string = ' '.join(wordvec)

    # Role: only the first half of the words between 'Secretarias' and
    # 'Secretário' is kept.
    # NOTE(review): presumably the title appears twice in the page text - confirm.
    vec = get_words_between_secretarias_and_secretario(txt)
    size = len(vec)
    sci = size // 2
    vec = vec[0:sci]
    strr = ' '.join(vec)

    finalnames.append(result_string)
    finalfun.append(strr)

# Constant state / city columns, one value per scraped page
fuflist = ['MG'] * len(finalfun)
oplist = ['Ouro Preto'] * len(finalfun)



In [None]:
column_names = ['Nome', 'Funcao', 'Cidade', 'UF']
# Pair each column name with its list of values
data = list(zip(column_names, (finalnames, finalfun, oplist, fuflist)))
# Build the Ouro Preto frame from the (column, values) pairs
df2 = pd.DataFrame({column: values for column, values in data})
# Stack the first frame and the Ouro Preto frame, then show the result
finaldf = pd.concat(objs=[df, df2])
print(finaldf)

In [None]:
def substitute_secretaria(input_list):
    """Return a new list with 'SECRETARIAS' replaced by 'SECRETARIO(A) DE' in each item.

    The replacement is a case-sensitive substring replacement applied to
    every string of ``input_list``; the input list itself is not modified.
    """
    result_list = []
    for word in input_list:
        result_list.append(word.replace('SECRETARIAS', 'SECRETARIO(A) DE'))
    return result_list



In [None]:
# Single-page run of the Jaguariúna extraction: fetch one 'secretarias' page
# and derive the role and name from its text.
url = 'https://municipio.jaguariuna.sp.gov.br/secretarias/7/juventude-esportes-e-lazer.html'
# Without a timeout a stalled server would block the kernel indefinitely
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
# Words between 'HOME' and 'Endereço' in the page text
ll = extract_words_between_secretarias_and_endereco(soup.get_text())
# l1: words up to and including the first ':'; l2: words after it.
# split_list_at_colon raises ValueError when the page text has no ':' token.
l1, l2 = split_list_at_colon(ll)
# Drop the last two tokens of the first half (the final one is the ':' itself)
l1 = l1[:-2]
l1 = substitute_secretaria(l1)
name = ' '.join(l2)
func = ' '.join(l1)
func = func.title()


'Rafael da Silva Blanco'

In [None]:
# Path suffixes of the Jaguariúna 'secretarias' pages to scrape
reptition = [
    '4/educacao.html',
    '2/administracao-e-financas.html',
    '3/assistencia-social.html',
    '1/desenvolvimento-economico-e-social.html',
    '6/governo.html',
    '7/juventude-esportes-e-lazer.html',
    '8/meio-ambiente.html',
    '9/mobilidade-urbana.html',
    '10/negocios-juridicos.html',
    '11/obras-e-servicos.html',
    '12/planejamento-urbano.html',
    '13/saude.html',
    '15/seguranca-publica.html',
    '14/turismo-e-cultura.html',
]
nm = []
ff = []
for rep in reptition:
    url = f'https://municipio.jaguariuna.sp.gov.br/secretarias/{rep}'
    # Without a timeout a single stalled page would block the whole loop
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Words between 'HOME' and 'Endereço' in the page text
    ll = extract_words_between_secretarias_and_endereco(soup.get_text())
    # l1: words up to and including the first ':'; l2: words after it.
    # split_list_at_colon raises ValueError when the page text has no ':' token.
    l1, l2 = split_list_at_colon(ll)
    # Drop the last two tokens of the first half (the final one is the ':' itself)
    l1 = l1[:-2]
    l1 = substitute_secretaria(l1)
    name = ' '.join(l2)
    func = ' '.join(l1)
    func = func.title()
    nm.append(name)
    ff.append(func)

# Manual fix: keep only the first 34 characters of entry 11.
# NOTE(review): presumably trims extra page text captured after the name - confirm.
nm[11] = nm[11][:34]


In [None]:
# Constant state / city columns, one value per scraped Jaguariuna name
fuflist = ['SP' for _ in nm]
oplist = ['Jaguariuna' for _ in nm]

In [None]:
column_names = ['Nome', 'Funcao', 'Cidade', 'UF']
# Use zip to pair column names with data
data = list(zip(column_names, [nm, ff, oplist, fuflist]))
# Create the Jaguariuna DataFrame
df3 = pd.DataFrame(dict(data))
# Rebuild the combined frame from its source frames rather than appending to
# the previous finaldf, so re-running this cell does not duplicate rows.
finaldf = pd.concat([df, df2, df3])
finaldf.shape

(69, 4)

In [None]:
# Download the Juazeiro do Norte 'secretaria' page and collect its <h6> elements.
url = 'https://www.juazeirodonorte.ce.gov.br/secretaria.php'
# Without a timeout a stalled server would block the kernel indefinitely
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
# find_all is the current Beautiful Soup method name; findAll is a legacy alias
x = soup.find_all('h6')


In [None]:
# The first 16 <h6> elements are skipped. From index 16 on, even positions
# supply the role (their 'title' attribute) and odd positions supply the
# name (their text).
funcs = [heading.get('title') for heading in x[16::2]]
names = [heading.text for heading in x[17::2]]

# Normalize each (name, role) pair: trim the surrounding whitespace that the
# page text carries, then title-case.
for i in range(len(names)):
    names[i] = names[i].strip().title()
    funcs[i] = funcs[i].strip().title()



[' Francisco Helio Alves Da Silva ', ' Marcelo De Sousa Pinheiro ', ' Vanderlucio Lopes Pereira ', ' Wilson Soares Silva ', ' Josineide Pereira De Sousa Lima ', ' Pergentina Parente Jardim Catunda ', ' Jose Bendimar De Lima Junior ', ' Leandro Saraiva Dantas De Oliveira ', ' Jose Maria Ferreira Pontes Neto ', ' Genilda Ribeiro Oliveira ', ' Francimones Rolim De Albuquerque ', ' Claudio Sergei Luz E Silva ', ' Renato Wilamis De Lima Silva ']


In [None]:
# Constant city / state columns, one value per scraped Juazeiro do Norte name
oplist = ['Juazeiro do Norte' for _ in names]
fuflist = ['CE' for _ in names]

In [None]:
column_names = ['Nome', 'Funcao', 'Cidade', 'UF']
# Use zip to pair column names with data
data = list(zip(column_names, [names, funcs, oplist, fuflist]))
# Create the Juazeiro do Norte DataFrame
df4 = pd.DataFrame(dict(data))
# Rebuild the combined frame from its source frames rather than appending to
# the previous finaldf, so re-running this cell does not duplicate rows.
finaldf = pd.concat([df, df2, df3, df4])
finaldf.shape

(82, 4)

In [None]:
# Write the combined frame to output.csv, leaving out the index column
finaldf.to_csv(path_or_buf='output.csv', index=False)