# Comandos para instalar bibliotecas e baixar arquivos



In [None]:
# Install the gender-guesser package and download the brasil.io name/gender dataset.
# NOTE(review): the recorded wget log shows the file being saved as 'nomes.csv.gz.2',
# i.e. a copy already existed; load_data() opens "nomes.csv.gz", not the numbered copy.
!pip install gender-guesser
!wget https://data.brasil.io/dataset/genero-nomes/nomes.csv.gz

--2022-01-24 20:10:12--  https://data.brasil.io/dataset/genero-nomes/nomes.csv.gz
Resolving data.brasil.io (data.brasil.io)... 104.26.9.175, 104.26.8.175, 172.67.71.45, ...
Connecting to data.brasil.io (data.brasil.io)|104.26.9.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1895785 (1.8M) [application/gzip]
Saving to: ‘nomes.csv.gz.2’


2022-01-24 20:10:12 (51.3 MB/s) - ‘nomes.csv.gz.2’ saved [1895785/1895785]



# Imports de Bibliotecas

In [None]:
import json
import sys
import requests
import csv
import gzip
import io
from bs4 import BeautifulSoup
import re
import tensorflow as tf
from urllib.request import urlopen  
import gender_guesser.detector as gender
from unicodedata import normalize

# Classificador de Genero

In [None]:
# Module-level gender_guesser detector used by guess(); created once with case_sensitive=False.
g = gender.Detector(case_sensitive=False)

def encode(name):
    """Return ``name`` upper-cased with every non-ASCII character removed.

    The text is NFKD-normalized first, so an accented letter is split into a
    base letter plus combining marks; the ASCII encode with
    ``errors="ignore"`` then drops whatever is not plain ASCII.
    """
    decomposed = normalize("NFKD", name)
    ascii_bytes = decomposed.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii").upper()

def load_data():
    """Load the first-name -> classification table from ``nomes.csv.gz``.

    Reads the gzip-compressed CSV named ``nomes.csv.gz`` (relative to the
    current working directory) and maps each row's ``first_name`` column to
    its ``classification`` column. If a first name occurs in several rows,
    the last row wins.

    Returns:
        dict: ``{first_name: classification}`` for every row in the file.

    Raises:
        OSError: if the file cannot be opened or is not valid gzip data.
        KeyError: if a row lacks the ``first_name`` or ``classification`` column.
    """
    # Text-mode gzip inside a context manager: the handle is closed even when
    # parsing raises. newline="" is the form the csv module asks for.
    with gzip.open("nomes.csv.gz", "rt", encoding="utf-8", newline="") as fobj:
        return {
            row["first_name"]: row["classification"]
            for row in csv.DictReader(fobj)
        }

# Load the name table once when this cell runs; classify_download() reads this module-level dict.
name_data = load_data()

In [None]:
def classify_download(name):
    """Look up ``name`` in the downloaded name table.

    The name is passed through ``encode`` and the result is used as the key
    into ``name_data``.

    Returns:
        The classification stored for the encoded name, or ``'A'`` when the
        encoded name is not a key of the table.
    """
    return name_data.get(encode(name), 'A')

In [None]:
def get_name(name):
    """Return the first whitespace-separated token of ``name``.

    Args:
        name: A full name such as ``'Mariana Alencar'``.

    Returns:
        The first token (``'Mariana'``), or ``''`` when ``name`` is empty or
        contains only whitespace. The previous ``name.split()[0]`` raised
        IndexError in that case.
    """
    tokens = name.split()
    return tokens[0] if tokens else ''

def guess(name_to_gender):
    """Classify a first name, asking the detector first and the name table second.

    The module-level detector ``g`` is queried first: an answer of ``'male'``
    yields ``'M'`` and ``'female'`` yields ``'F'``. For any other detector
    answer the name is looked up with ``classify_download``; its result is
    returned unless it is ``'A'``, in which case ``'?'`` is returned.
    """
    detected = g.get_gender(name=name_to_gender)
    if detected == 'male':
        return 'M'
    if detected == 'female':
        return 'F'

    # The detector gave no plain male/female answer: fall back to the name table.
    from_table = classify_download(name_to_gender)
    return '?' if from_table == 'A' else from_table

def guess_name(name):
    """Classify a full name by its first token.

    The first token is extracted with ``get_name`` and handed to ``guess``,
    whose result is returned unchanged.
    """
    return guess(get_name(name))

# Testes e exemplos de uso das funções

In [None]:
# Usage examples for the helpers above; the values printed are recorded in the cell output.

# get_name keeps only the first token of a full name.
y = get_name('Mariana Alencar')
print(y)

# guess takes a first name directly.
resp = guess('Mariana')
print(resp)

# Same two steps chained by hand: extract the first name, then classify it.
y = get_name('Kenzo Shiraishi')
print(y)
resp = guess(y)
print(resp)

# guess_name performs both steps in one call.
print(guess_name('Lucas Kawabata'))

Mariana
F
Kenzo
M
M


# Criação dos comandos cypher

## UFAM

In [90]:
class ResearcherUFAM(object):
    """A UFAM faculty member that can be rendered as a Cypher ``Author`` node.

    Attributes are plain strings: ``name``, ``gender`` and ``email``.
    """

    def __init__(self, name='',gender='',email=''):
        self.name = name
        self.gender = gender
        self.email = email

    @staticmethod
    def _cypher_quote(value):
        """Return ``value`` as a single-quoted Cypher string literal.

        Backslashes and single quotes are backslash-escaped so that a value
        such as ``D'Avila`` cannot terminate the literal early.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def toCypher(self):
        """Return the ``CREATE`` statement for this author's node.

        For values without quotes or backslashes the text is identical to
        plain concatenation of the raw values.
        """
        quote = self._cypher_quote
        return ("CREATE(a:Author {name: " + quote(self.name)
                + ", gender: " + quote(self.gender)
                + ", email: " + quote(self.email) + "});")


In [91]:
print("Starting scrap for UFAM")

# Download and parse the UFAM faculty page.
# The "*_ufma" spelling is kept because the parsing cell reads `soup_ufma`.
url_ufma = 'https://icomp.ufam.edu.br/corpo-docente.html'
# requests has no default timeout, so without one a stalled server blocks the cell forever.
page_ufma = requests.get(url_ufma, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
page_ufma.raise_for_status()
soup_ufma = BeautifulSoup(page_ufma.text, "html5lib")
ufam_professors = list()

Starting scrap for UFAM


In [92]:
# Parse every professor entry of the UFAM page and export them as Cypher statements.
lista_ufam = soup_ufma.find('div', attrs={"class":"item-page"})
if lista_ufam is None:
    raise ValueError("UFAM page has no <div class='item-page'>; the layout may have changed")

for campo in lista_ufam.find_all('nav'):
    # The professor's name is taken from the even-positioned <h2> headings of
    # the entry; when there are several, the last one wins (as before).
    name_headings = campo.find_all('h2')[::2]
    name = name_headings[-1].text.strip() if name_headings else ''
    if not name:
        # Previously this either crashed or silently reused the name of the
        # preceding entry; skip the entry instead.
        print("Skipping UFAM entry without a name heading")
        continue

    # Text between the first "Email:" label and the next one (if any);
    # an entry without the label gets an empty email instead of an IndexError.
    email_parts = campo.text.split("Email:")
    email = email_parts[1].strip() if len(email_parts) > 1 else ''

    # Do not call this variable `gender`: that name is the gender_guesser
    # module alias, and rebinding it breaks re-running the detector cell.
    professor_gender = guess_name(name)
    ufam_professors.append(ResearcherUFAM(name, professor_gender, email))

with open('docentes-ufam.cypher','w', encoding='ISO-8859-1') as f:
    f.write("CREATE(n:Institution {name: 'UFAM', color: '#00FFFF'});\n")
    for r in ufam_professors:
        f.write(r.toCypher()+ "\n")
        # Escape backslashes and quotes so a name like D'Avila keeps the MATCH valid.
        escaped_name = r.name.replace("\\", "\\\\").replace("'", "\\'")
        f.write("MATCH(i:Institution {name: 'UFAM'}),(a:Author {name: '"+escaped_name+"'}) MERGE (a)-[r:ASSOCIATED_TO]->(i);\n")

    

## UFMG

In [93]:
class ResearcherUFMG(object):
    """A UFMG faculty member that can be rendered as a Cypher ``Author`` node.

    Attributes are plain strings: ``name``, ``gender`` and ``email``.
    """

    def __init__(self, name='', gender='',email=''):
        self.name = name
        self.gender = gender
        self.email = email

    @staticmethod
    def _cypher_quote(value):
        """Return ``value`` as a single-quoted Cypher string literal.

        Backslashes and single quotes are backslash-escaped so the value
        cannot terminate the literal early.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def toCypher(self):
        """Return the ``CREATE`` statement for this author's node."""
        quote = self._cypher_quote
        return ("CREATE(a:Author {name: " + quote(self.name)
                + ", gender: " + quote(self.gender)
                + ", email: " + quote(self.email) + "});")


In [94]:
print("Starting scrap for UFMG")
# Download the UFMG faculty page; it is parsed in the next cell.
url_ufmg = 'https://ppgcc.dcc.ufmg.br/docentes/'
# requests has no default timeout, so without one a stalled server blocks the cell forever.
page_ufmg = requests.get(url_ufmg, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
page_ufmg.raise_for_status()

Starting scrap for UFMG


In [102]:
# Parse the UFMG page and locate the <ul class="professor"> list.
# NOTE(review): ufmg_professors is created here but nothing is appended to it in this cell.
ufmg_professors = list()
soup_ufmg = BeautifulSoup(page_ufmg.text, 'html.parser')
table_ufmg = soup_ufmg.find('ul', attrs={"class":"professor"})

# Debug listing: prints the raw markup of every <li> found in the list.
for campo in table_ufmg.find_all('li'):
    print(campo)

<li><article class="professor"><p class="professor-nome">Adriano Alonso Veloso</p><div class="mail"><a href="mailto:adrianov@dcc.ufmg.br">eMail</a>    <a href="http://homepages.dcc.ufmg.br/~adrianov/"> www </a></div><img src="https://ppgcc.dcc.ufmg.br/wp-content/themes/wp-cedecom/capg/files/professores/696.jpg"><p class="professor-texto">(Doutor, UFMG, Brasil, 2009) Mineração de Dados, Aprendizado de Máquina, Bancos de Dados).</p></img></article></li>
<li><article class="professor"><p class="professor-nome">Adriano César Machado Pereira</p><div class="mail"><a href="mailto:adrianoc@dcc.ufmg.br">eMail</a>    <a href="http://homepages.dcc.ufmg.br/~adrianoc/"> www </a></div><img src="https://ppgcc.dcc.ufmg.br/wp-content/themes/wp-cedecom/capg/files/professores/679.jpg"><p class="professor-texto">(Doutor, UFMG, 2007)
Análise de Desempenho, Aplicações Web, Comércio Eletrônico, Sistemas Paralelos e Distribuídos. </p></img></article></li>
<li><article class="professor"><p class="professor-no

## UFRN


In [None]:
class ResearcherUFRN(object):
    """A UFRN faculty member that can be rendered as a Cypher ``Author`` node.

    ``name`` is stored with every whitespace-separated word passed through
    ``str.capitalize``; ``gender``, ``email`` and ``lattes`` are stored as given.
    """

    def __init__(self, name='', gender='', email='', lattes=''):
        self.name = " ".join([s.capitalize() for s in name.split()])
        self.email = email
        self.lattes = lattes
        self.gender = gender

    @staticmethod
    def _cypher_quote(value):
        """Return ``value`` as a single-quoted Cypher string literal.

        Backslashes and single quotes are backslash-escaped so the value
        cannot terminate the literal early.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def toCypher(self):
        """Return the ``CREATE`` statement for this author's node.

        The ``lattesurl`` property is emitted only when ``lattes`` is not
        the empty string.
        """
        quote = self._cypher_quote
        properties = ("name: " + quote(self.name)
                      + ", gender: " + quote(self.gender)
                      + ", email: " + quote(self.email))
        if self.lattes != '':
            properties += ", lattesurl: " + quote(self.lattes)
        return "CREATE(a:Author {" + properties + "});"


In [None]:
# Progress message only; this cell contains no scraping code for UFRN.
print("Starting scrap for UFRN")



## UNB

In [None]:
# Progress message printed before the UnB class definition and scraping cells.
print("Starting scraping of UNB professors")

class ResearcherUnB(object):
    """A UnB faculty member that can be rendered as a Cypher ``Author`` node.

    Constructor arguments are stored as strings on the instance:
    ``nome`` -> ``name``, ``lattes`` -> ``lattesurl``, and ``gender``,
    ``email``, ``titulo``, ``titulo_ano_local``, ``areas`` under their own names.
    """

    def __init__(self, nome='', gender='', email='', titulo='', areas='', titulo_ano_local='', lattes=''):
        self.name = nome
        self.email = email
        self.lattesurl = lattes
        self.titulo = titulo
        self.titulo_ano_local = titulo_ano_local
        self.areas = areas
        self.gender = gender

    @staticmethod
    def _cypher_quote(value):
        """Return ``value`` as a single-quoted Cypher string literal.

        Backslashes and single quotes are backslash-escaped so the value
        cannot terminate the literal early.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def toCypher(self):
        """Return the ``CREATE`` statement for this author's node.

        Properties are emitted in a fixed order: name, gender, lattesurl,
        email, title, title_when_where, areas.
        """
        properties = (
            ("name", self.name),
            ("gender", self.gender),
            ("lattesurl", self.lattesurl),
            ("email", self.email),
            ("title", self.titulo),
            ("title_when_where", self.titulo_ano_local),
            ("areas", self.areas),
        )
        rendered = ", ".join(key + ": " + self._cypher_quote(value) for key, value in properties)
        return "CREATE(a:Author {" + rendered + "});"



Starting scraping of UNB professors


In [None]:
# Download the UnB faculty page; on any network/HTTP failure report it and stop.
try:
    # requests has no default timeout, so without one a stalled server blocks the cell forever.
    pagina_unb = requests.get("http://ppgi.unb.br/index.php?option=com_content&view=article&id=78&Itemid=471&lang=pt", timeout=30)
    # Treat HTTP error statuses as failures too, instead of parsing an error page later.
    pagina_unb.raise_for_status()
except requests.exceptions.RequestException as e:
    # print() does not interpolate logging-style arguments; format explicitly
    # so the message shows the error instead of a literal "%s".
    print("Erro ao ler a pagina %s" % e)
    sys.exit(1)

In [None]:
# Parse the UnB faculty table and export every professor as Cypher statements.
pagina_unb.encoding="ISO-8859-1"
soup_unb = BeautifulSoup(pagina_unb.text, 'html.parser')
docentes_unb = list()
tabela_docentes_unb = soup_unb.find('table', attrs={"class":"docentes"})
if tabela_docentes_unb is None:
    raise ValueError("UnB page has no <table class='docentes'>; the layout may have changed")


def _after_label(text):
    """Return the stripped text after the first ':' in ``text`` (all of it when there is no ':')."""
    # maxsplit=1 keeps values that themselves contain ':' intact, and [-1]
    # avoids an IndexError when the label separator is missing.
    return text.split(':', 1)[-1].strip()


nome = ''
email = ''
areas = ''
genero = ''
titulo = ''
titulo_ano_local = ''
lattes = ''

# The <td> cells repeat in groups of eight per professor; the position inside
# the group (desloc) selects which field the cell carries.
for contador, campo in enumerate(tabela_docentes_unb.find_all('td')):
    desloc = contador % 8
    if desloc == 1:
        nome = campo.text.split('(')[0].strip()
        # An empty name cannot be classified; mark it unknown.
        genero = guess_name(nome) if nome else '?'
        # The last link of the cell is used as the Lattes URL; tolerate cells without links.
        links = campo.find_all('a')
        lattes = links[-1].get('href', '') if links else ''
    elif desloc == 2:
        titulo = _after_label(campo.text)
    elif desloc == 3:
        titulo_ano_local = campo.text
    elif desloc == 4:
        email = _after_label(campo.text).replace(" [at] ", "@")
    elif desloc == 5:
        # Last field used for a professor: build the record here.
        areas = _after_label(campo.text)
        docentes_unb.append(ResearcherUnB(nome, genero, email, titulo, areas, titulo_ano_local, lattes))


with open("docentes-unb.cypher",'w', encoding='ISO-8859-1') as f:
    f.write("CREATE(n:Institution {name: 'UnB', color:'#E466CB'});\n")
    for d in docentes_unb:
        f.write(d.toCypher()+'\n')
        # Escape backslashes and quotes so a name like D'Avila keeps the MATCH valid.
        escaped_name = d.name.replace("\\", "\\\\").replace("'", "\\'")
        f.write("MATCH(i:Institution {name: 'UnB'}),(a:Author {name: '"+escaped_name+"'}) MERGE (a)-[r:ASSOCIATED_TO]->(i);\n")

print("Finished!!")


Finished!!


## USP

In [None]:
class ResearcherUSP(object):
    """A USP advisor that can be rendered as a Cypher ``Author`` node.

    ``name``, ``gender`` and ``email`` are stored as given; only ``name``
    and ``gender`` are written by ``toCypher``.
    """

    def __init__(self, name='', gender = '', email=''):
        self.name = name
        self.email = email
        self.gender = gender

    @staticmethod
    def _cypher_quote(value):
        """Return ``value`` as a single-quoted Cypher string literal.

        Backslashes and single quotes are backslash-escaped so the value
        cannot terminate the literal early.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    def toCypher(self):
        """Return the ``CREATE`` statement for this author's node (name and gender only)."""
        quote = self._cypher_quote
        return "CREATE(a:Author {name: " + quote(self.name) + ", gender: " + quote(self.gender) + "});"


In [None]:
# Source page for the USP advisors and the list that the parsing cell appends to.
# NOTE(review): the list is created here rather than in the parsing cell, so
# re-running only the parsing cell appends the same advisors again.
usp_url = 'https://www.ime.usp.br/pos-computacao/orientadores/'
uspResearchers = list()

In [None]:
# requests has no default timeout, so without one a stalled server blocks the cell forever.
page_usp = requests.get(usp_url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
page_usp.raise_for_status()

In [None]:
# Parse the USP advisors table and export every advisor as Cypher statements.
page_usp.encoding="UTF-8"

soup_usp = BeautifulSoup(page_usp.text, 'html.parser')
maincolumn = soup_usp.find('figure', attrs={"class":"wp-block-table"})
if maincolumn is None:
    raise ValueError("USP page has no <figure class='wp-block-table'>; the layout may have changed")

# Only the even-positioned <td> cells are read; each is expected to hold a
# link whose text is the advisor's name.
for info_orientador in maincolumn.find_all('td')[::2]:
    name_link = info_orientador.find('a')
    if name_link is None or not name_link.text.strip():
        # A cell without a (non-empty) link used to raise; skip it instead.
        print("Skipping USP table cell without an advisor link")
        continue
    name = name_link.text
    # Do not call this variable `gender`: that name is the gender_guesser
    # module alias, and rebinding it breaks re-running the detector cell.
    advisor_gender = guess_name(name)
    uspResearchers.append(ResearcherUSP(name, advisor_gender))

# The with-statement closes the file; no explicit close() is needed.
with open('docentes-usp.cypher','w', encoding='UTF-8') as f:
    f.write("CREATE(n:Institution {name: 'USP', color: '#800000'});\n")
    for r in uspResearchers:
        f.write(r.toCypher()+ "\n")
        # Escape backslashes and quotes so a name like D'Avila keeps the MATCH valid.
        escaped_name = r.name.replace("\\", "\\\\").replace("'", "\\'")
        f.write("MATCH(i:Institution {name: 'USP'}),(a:Author {name: '"+escaped_name+"'}) MERGE (a)-[r:ASSOCIATED_TO]->(i);\n")
print("Finished !!")


Finished !!
