In [17]:
import json
import pdfplumber
import spacy
import pandas as pd
import numpy as np
import os
import unidecode
import PyPDF2
from tika import parser
from spacy.matcher import PhraseMatcher
from spacy import displacy
from progress_bar.progress_bar import printProgressBar
import torch
import gc
import multiprocessing
import sys

class School:
    def __init__(self, file_source):
        file = open(file_source, "r")
        file = json.load(file)
        temp_list = []
        for facultad in file:
            temp_list.append(facultad['escuela'])
        #print(facultad['escuela'])
        self.escuelas = [item for sublist in temp_list for item in sublist] # make the list flat
        #have the escuelas with accents in the correct form here
        self.escuelas_accent = self.escuelas
        
        print(self.escuelas)
        self.i = 0
        self.j = 0
        self.k = 0
        self.p = 0
        self.first = True
        
    def create_dictionary(self, schools):
        myDict = dict((e,i) for i,e in enumerate(schools))
        return myDict

    def unaccent_list(self, accent_list):
        unaccented_schools = []
        for sch in accent_list:
            unaccented_schools.append(unidecode.unidecode(sch).lower())
        return unaccented_schools
    
    def set_school_to_unaccent(self):
        self.escuelas = self.unaccent_list(self.escuelas)
        
    def create_dicts(self):
        #create the dicts only when schools are unaccented
        self.escuelas_unaccent_dict = self.create_dictionary(self.escuelas)
        self.escuelas_accent_dict = self.create_dictionary(self.escuelas_accent)
        
    def set_schools_accents(self, row, l):
        self.k+= 1
        printProgressBar(self.k, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        
        index = self.escuelas_unaccent_dict.get(row.lower())
        key_list = list(self.escuelas_accent_dict.keys())
        val_list = list(self.escuelas_accent_dict.values())
        try:
            position = val_list.index(index)
            return key_list[position]
        except:
            return None
        #return the value of the position, example in dict '{..., Escuela de enfermería: 37, ...}' it will return 
        #'Escuela de enfermería'
    
    def clean_spaces_text(self, text):
        new_text = " ".join(text.split())
        return(new_text)
    
    def set_nlp(self, model):
        self.nlp_model = spacy.load(model)
        
    def get_nlp(self):
        retrun 
        
    def set_matcher(self):
        self.matcher = PhraseMatcher(self.nlp_model.vocab, attr="LOWER")
        patterns = [self.nlp_model(name) for name in self.escuelas]
        #print(patterns)
        self.matcher.add("ESC", patterns)
        
    def check_file_tika(self, file_source, l):
        try:
            gc.collect()
            torch.cuda.empty_cache()
            self.p+=1
            if self.first:
                self.first = False
            if self.p >= l:
                self.first = True
                self.p = 0
            printProgressBar(self.p, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            pages_10 = []
            pages_10_l = []
            school_name_of_file = ""
            print(file_source)
            parsed_pdf = parser.from_file(file_source)
            data = parsed_pdf['content']
            if data == None:
                return 'None'
            new_data = self.clean_spaces_text(data)
            #print(new_data)
            #print(new_data[0:90000])
            new_data = unidecode.unidecode(new_data).lower()
            #print("len of data", len(new_data))
            #first 10 pages
            first_data = new_data[:6000]
            #print(new_data)
            doc = self.nlp_model(first_data)
            #print(self.matcher(doc))
            for match_id, start, end in self.matcher(doc):
                print(doc[start:end])
                return(doc[start:end])
            #last 10 pages
            last_data = new_data[-6000:]
            #print(new_data)
            doc = self.nlp_model(last_data)
            #print(self.matcher(doc))
            for match_id, start, end in self.matcher(doc):
                print(doc[start:end])
                return(doc[start:end])
            return 'No School'
        except:
            return 'No School'
        
    def check_file(self, file_source, l):
        try:
            self.i+=1
            if self.first:
                self.first = False
            if self.i >= l:
                self.first = True
                self.i = 0
            printProgressBar(self.i, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            pages_10 = []
            pages_10_l = []
            school_name_of_file = ""
            with pdfplumber.open(file_source) as pdf:
                for i in range(0,10):
                    print(pdf.pages[i].extract_text())
                    pages_10.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
                for i in reversed(range(len(pdf.pages)-10,len(pdf.pages))):
                    pages_10_l.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
                #first 10 pages
                pages_10_u = self.unaccent_list(pages_10)
                for page in pages_10_u:
                    doc = self.nlp_model(page)
                    if len(self.matcher(doc)) >=1:
                        for match_id, start, end in self.matcher(doc):
                            return(doc[start:end]) #returns at the first instance
                #last 10 pages
                pages_10_l = self.unaccent_list(pages_10_l)
                for page in pages_10_l:
                    doc = self.nlp_model(page)
                    if len(self.matcher(doc)) >=1:
                        for match_id, start, end in self.matcher(doc):
                            print(doc[start:end])
                            return(doc[start:end]) #returns at the first instance
            return "No school"
        except:
            return "No school"
    def create_training_set(self, file_source, l):
        self.j+=1
        printProgressBar(self.j, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        pages_10 = []
        pages_10_l = []
        school_name_of_file = ""
        with pdfplumber.open(file_source) as pdf:
            for i in range(0,10):
                pages_10.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
            for i in reversed(range(len(pdf.pages)-10,len(pdf.pages))):
                pages_10_l.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
        
        #first 10 pages
        pages_10_u = self.unaccent_list(pages_10)
        for page in pages_10_u:
            doc = self.nlp_model(page)
            if len(self.matcher(doc)) >=1:
                for match_id, start, end in self.matcher(doc):
                    aux_training = [page, {"entities": (start, end, 'ESC')}]
                    return aux_training #returns at the first instance
        #last 10 pages
        pages_10_l = self.unaccent_list(pages_10_l)
        for page in pages_10_l:
            doc = self.nlp_model(page)
            if len(self.matcher(doc)) >=1:
                for match_id, start, end in self.matcher(doc):
                    aux_training = [page, {"entities": (start, end, 'ESC')}]
                    return aux_training #returns at the first instance
        return "No school"
    def print_training_set(self):
        print(self.training_set)
        

In [18]:
spacy.prefer_gpu()

True

In [19]:
schools = School("data/escuelas.json")

['Escuela de Ingeniería Civil', 'Escuela de Ingeniería Eléctrica', 'Escuela de Geología, Minas y Geofísica', 'Escuela de Ingeniería Química', 'Escuela de Ingeniería de Petroleo', 'Escuela de Ingeniería Mecanica', 'Escuela de Ingeniería Metalúrgica y Ciencia de los Materiales', 'Escuela de Agronomía', 'Escuela de Arquitectura', 'Escuela de Biología', 'Escuela de Computación', 'Escuela de Física', 'Escuela de Geoquímica', 'Escuela de Matemática', 'Escuela de Química', 'Escuela de Administración y Contaduría', 'Escuela de Antropología', 'Escuela de Estadística y Ciencias Actuariales', 'Escuela de Economía', 'Escuela de Estudios Internacionales', 'Escuela de Sociología', 'Escuela de Trabajo Social', 'Escuela de Derecho', 'Escuela de Estudios Políticos y Administrativo', 'Escuela de Ciencias Veterinarias', 'Escuela de Farmacia', 'Escuela de Artes', 'Escuela de Bibliotecología y Archivología', 'Escuela de Comunicación Social', 'Escuela de Educación', 'Escuela de Filosofía', 'Escuela de Geogr

In [20]:
#delete accents
schools.set_school_to_unaccent()

In [9]:
import pandas as pd
import numpy as np
import os

## get dataframe with text only thesis
csv_source = "data/url_thesis_8211_with_pdf_scan_image.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
df = df.sort_values('isScan', ascending=False)

In [10]:
#load model set matcher for schools
schools.set_nlp('es_core_news_sm')
#schools.set_nlp('es_dep_news_trf')
schools.set_matcher()

In [11]:
#make this fucntion a vectorize so it can run in a data frame
check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [12]:
# EJECUTAR HASTA AQUII!!! BAJA AL SIGUIENTE EN LA LISTA
df_splited = np.array_split(df, 20)

In [10]:
#df_splited = np.array_split(df, 20)

In [11]:
#schools = School("data/escuelas.json")

In [12]:
#delete accents
#schools.set_school_to_unaccent()

In [13]:
# #make this fucntion a vectorize so it can run in a data frame
# check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [14]:
# df_splited = np.array_split(df, 20)

In [15]:
#size of data frame
#

In [16]:
#'escuela de ingenieria metalurgica y ciencia de los materiales' == 'escuela de ingenieria metalurgica y ciencias de los materiales'

In [None]:
schools.check_file_tika('../'+'thesis_pdf_all/7055.pdf',1)

In [None]:
# parsed_pdf = parser.from_file('../'+'thesis_pdf_all/200.pdf')
  
# # saving content of pdf
# # you can also bring text only, by parsed_pdf['text'] 
# # parsed_pdf['content'] returns string 
# data = parsed_pdf['content'] 
# #   
# # Printing of content
# print(type(data))
# #print(data)
# new_data = clean_spaces_text(data)
# #print(new_data[0:90000])
# new_data = unidecode.unidecode(new_data).lower()

In [None]:
# doc = self.nlp_model(new_data)

In [None]:
# with open('../'+'thesis_pdf_all/200.pdf', mode='rb') as f:
#     reader = PyPDF2.PdfFileReader(f)
#     page = reader.getPage(1)
#     print(clean_spaces_text(page.extractText()))

In [None]:
schools.check_file('../'+'thesis_pdf_all/7055.pdf',1)

In [17]:
#make this fucntion a vectorize so it can run in a data frame
check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [None]:
#Ignore this
#schools.create_training_set("../thesis_pdf/1.pdf", 1)

In [18]:
df_splited = np.array_split(df, 20)

In [15]:
df_splited[0]

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False
5258,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False
5286,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False
5285,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False
5284,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5147,5428,27-Mar-2017,Desarrollo de un sistema de ensayos de bombeo ...,"González F., Benjamín J. M.",3.1 MB,http://saber.ucv.ve/handle/10872/15416,http://saber.ucv.ve/bitstream/10872/15416/1/TE...,,thesis_pdf_all/5428.pdf,True,True,False
5146,5427,27-Mar-2017,Evaluación de un sistema acelerado de evaporac...,"Pernía Muro, Leidy Jurisbeth",8.07 MB,http://saber.ucv.ve/handle/10872/15415,http://saber.ucv.ve/bitstream/10872/15415/1/Tr...,,thesis_pdf_all/5427.pdf,True,True,False
5145,5426,27-Mar-2017,Reformación combinada de metano sobre cataliza...,"García L., Adriana L.",2.4 MB,http://saber.ucv.ve/handle/10872/15414,http://saber.ucv.ve/bitstream/10872/15414/1/Tr...,,thesis_pdf_all/5426.pdf,True,True,False
5144,5425,27-Mar-2017,Caracterización geoquímica y geológica del bas...,"Rodríguez Barrios, María Fernanda",6.57 MB,http://saber.ucv.ve/handle/10872/15413,http://saber.ucv.ve/bitstream/10872/15413/3/TO...,,thesis_pdf_all/5425.pdf,True,True,False


In [16]:
# b = pd.concat([df_splited [0],df_splited [1]],axis=0)
# b

In [17]:
# def process_docs(docs, n_processes=None):
#     # Load the model inside the subprocess, 
#     # as that seems to be the main culprit of the memory issues
#     nlp = schools.set_nlp('es_core_news_sm')

#     if not n_processes:
#         n_processes = multiprocessing.cpu_count()

#     processed_docs = [doc for doc in nlp.pipe(docs, disable=['ner', 'parser'], n_process=n_processes)]


#     # Then do what you wish beyond this point. I end up writing results out to s3.
#     pass

In [18]:
# for x in range(10):
#     # This will spin up a subprocess, 
#     # and everytime it finishes it will release all resources back to the machine.
#     with multiprocessing.Manager() as manager:
#         p = multiprocessing.Process(target=process_docs, args=(docs))
#         p.start()
#         p.join()


In [19]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[0])
values_c_0 = check_vec("../"+df_splited[0]["path"],l)
df_splited[0]['school_simple'] = values_c_0.tolist() #done

../thesis_pdf_all/1.pdf--------------------------------------| 0.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/5539.pdf-----------------------------------| 0.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5573.pdf-----------------------------------| 0.8% Complete
escuela de quimica
../thesis_pdf_all/5572.pdf-----------------------------------| 1.0% Complete
escuela de quimica
../thesis_pdf_all/5571.pdf-----------------------------------| 1.3% Complete
escuela de quimica
../thesis_pdf_all/5570.pdf-----------------------------------| 1.5% Complete
escuela de biologia
../thesis_pdf_all/5569.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5568.pdf-----------------------------------| 2.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5567.pdf-----------------------------------| 2.3% Complete
escuela de computacion
../thesis_pdf_all/5566.pdf-----------------------------------| 2.6% Complete

escuela de geologia, minas y geofisica
../thesis_pdf_all/5488.pdf-----------------------------------| 21.0% Complete
../thesis_pdf_all/5487.pdf-----------------------------------| 21.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5486.pdf-----------------------------------| 21.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5485.pdf-----------------------------------| 21.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5484.pdf-----------------------------------| 22.1% Complete
../thesis_pdf_all/5483.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5482.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5481.pdf-----------------------------------| 22.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5480.pdf-----------------------------------| 23.1% Complete
escuela de geologia, minas y geofisica
../t

../thesis_pdf_all/5727.pdf█████------------------------------| 40.3% Complete
escuela de computacion
../thesis_pdf_all/5726.pdf█████------------------------------| 40.5% Complete
escuela de computacion
../thesis_pdf_all/5725.pdf█████------------------------------| 40.8% Complete
escuela de computacion
../thesis_pdf_all/5724.pdf█████------------------------------| 41.0% Complete
escuela de biologia
../thesis_pdf_all/5723.pdf█████------------------------------| 41.3% Complete
escuela de biologia
../thesis_pdf_all/5722.pdf█████------------------------------| 41.5% Complete
escuela de biologia
../thesis_pdf_all/5721.pdf█████------------------------------| 41.8% Complete
escuela de fisica
../thesis_pdf_all/5720.pdf██████-----------------------------| 42.1% Complete
escuela de fisica
../thesis_pdf_all/5719.pdf██████-----------------------------| 42.3% Complete
escuela de biologia
../thesis_pdf_all/5718.pdf██████-----------------------------| 42.6% Complete
escuela de biologia
../thesis_pdf_a

../thesis_pdf_all/5642.pdf███████████████--------------------| 61.3% Complete
escuela de quimica
../thesis_pdf_all/5641.pdf███████████████--------------------| 61.5% Complete
escuela de quimica
../thesis_pdf_all/5640.pdf███████████████--------------------| 61.8% Complete
escuela de quimica
../thesis_pdf_all/5639.pdf████████████████-------------------| 62.1% Complete
escuela de matematica
../thesis_pdf_all/5479.pdf████████████████-------------------| 62.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5477.pdf████████████████-------------------| 62.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5730.pdf████████████████-------------------| 62.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/5295.pdf████████████████-------------------| 63.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5323.pdf████████████████-------------------| 63.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5322.pdf████████████

escuela de quimica
../thesis_pdf_all/5249.pdf██████████████████████████---------| 82.3% Complete
escuela de quimica
../thesis_pdf_all/5248.pdf██████████████████████████---------| 82.6% Complete
escuela de quimica
../thesis_pdf_all/5247.pdf██████████████████████████---------| 82.8% Complete
escuela de quimica
../thesis_pdf_all/5246.pdf██████████████████████████---------| 83.1% Complete
escuela de quimica
../thesis_pdf_all/5245.pdf██████████████████████████---------| 83.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5244.pdf██████████████████████████---------| 83.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5243.pdf██████████████████████████---------| 83.8% Complete
../thesis_pdf_all/5242.pdf███████████████████████████--------| 84.1% Complete
escuela de matematica
../thesis_pdf_all/5241.pdf███████████████████████████--------| 84.4% Complete
escuela de matematica
../thesis_pdf_all/5240.pdf███████████████████████████--------| 84.6% Complete
esc

In [25]:
# type(values_c_0) 

In [26]:
df_splited[0].to_csv("./data/splitted/thesis_df_splited0_with_school.csv", index=False) #done

In [27]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[1])
values_c_1 = check_vec("../"+df_splited[1]["path"],l)
df_splited[1]['school_simple'] = values_c_1.tolist() #done

../thesis_pdf_all/5423.pdf-----------------------------------| 0.3% Complete
escuela de quimica
../thesis_pdf_all/5422.pdf-----------------------------------| 0.5% Complete
../thesis_pdf_all/5421.pdf-----------------------------------| 0.8% Complete
escuela de quimica
../thesis_pdf_all/5420.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5419.pdf-----------------------------------| 1.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5446.pdf-----------------------------------| 1.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5447.pdf-----------------------------------| 1.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5448.pdf-----------------------------------| 2.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5463.pdf-----------------------------------| 2.3% Complete
escuela de computacion
../thesis_pdf_all/5475.pdf----------------------------------

escuela de quimica
../thesis_pdf_all/5405.pdf-----------------------------------| 20.5% Complete
escuela de quimica
../thesis_pdf_all/5404.pdf-----------------------------------| 20.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5403.pdf-----------------------------------| 21.0% Complete
escuela de computacion
../thesis_pdf_all/5401.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5388.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5400.pdf-----------------------------------| 21.8% Complete
escuela de computacion
../thesis_pdf_all/5399.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/5398.pdf-----------------------------------| 22.3% Complete
escuela de computacion
../thesis_pdf_all/5397.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria quimica
../thesis

../thesis_pdf_all/6044.pdf█████------------------------------| 40.5% Complete
escuela de biologia
../thesis_pdf_all/6042.pdf█████------------------------------| 40.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/6105.pdf█████------------------------------| 41.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6041.pdf█████------------------------------| 41.3% Complete
escuela de quimica
../thesis_pdf_all/6010.pdf█████------------------------------| 41.5% Complete
escuela de computacion
../thesis_pdf_all/6009.pdf█████------------------------------| 41.8% Complete
escuela de computacion
../thesis_pdf_all/6008.pdf██████-----------------------------| 42.1% Complete
escuela de computacion
../thesis_pdf_all/6007.pdf██████-----------------------------| 42.3% Complete
escuela de computacion
../thesis_pdf_all/6006.pdf██████-----------------------------| 42.6% Complete
escuela de computacion
../thesis_pdf_all/6005.pdf██████-----------------------------| 

escuela de bibliotecologia y archivologia
../thesis_pdf_all/6178.pdf███████████████--------------------| 61.5% Complete
escuela de artes
../thesis_pdf_all/6177.pdf███████████████--------------------| 61.8% Complete
escuela de comunicacion social
../thesis_pdf_all/6176.pdf████████████████-------------------| 62.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6175.pdf████████████████-------------------| 62.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6174.pdf████████████████-------------------| 62.6% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6173.pdf████████████████-------------------| 62.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6172.pdf████████████████-------------------| 63.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6171.pdf████████████████-------------------| 63.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6170.pdf████████

escuela de bibliotecologia y archivologia
../thesis_pdf_all/6160.pdf█████████████████████████----------| 81.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6159.pdf██████████████████████████---------| 82.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6158.pdf██████████████████████████---------| 82.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6157.pdf██████████████████████████---------| 82.6% Complete
escuela de artes
../thesis_pdf_all/6156.pdf██████████████████████████---------| 82.8% Complete
escuela de artes
../thesis_pdf_all/6155.pdf██████████████████████████---------| 83.1% Complete
escuela de artes
../thesis_pdf_all/6154.pdf██████████████████████████---------| 83.3% Complete
escuela de artes
../thesis_pdf_all/6153.pdf██████████████████████████---------| 83.6% Complete
escuela de artes
../thesis_pdf_all/6151.pdf██████████████████████████---------| 83.8% Complete
escuela de artes
../thesis_pdf_all/6138.pdf██

In [28]:
# df_splited[0]['school_simple'] = values_new.tolist()

In [29]:
# type(df_splited[1])

In [30]:
# df_splited[1]['school_simple'] = values_c_1.tolist()

In [31]:
df_splited[1].to_csv("./data/splitted/thesis_df_splited1_with_school.csv", index=False) #done

In [32]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[2])
values_c_2 = check_vec("../"+df_splited[2]["path"],l)
df_splited[2]['school_simple'] = values_c_2.tolist() #done

../thesis_pdf_all/5839.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/5838.pdf-----------------------------------| 0.5% Complete
../thesis_pdf_all/5837.pdf-----------------------------------| 0.8% Complete
escuela de fisica
../thesis_pdf_all/5836.pdf-----------------------------------| 1.0% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5835.pdf-----------------------------------| 1.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5834.pdf-----------------------------------| 1.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5833.pdf-----------------------------------| 1.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5832.pdf-----------------------------------| 2.1% Complete
../thesis_pdf_all/5831.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/5830.pdf-----------------------------------| 2.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all

../thesis_pdf_all/5941.pdf-----------------------------------| 21.0% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5940.pdf-----------------------------------| 21.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5939.pdf-----------------------------------| 21.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5938.pdf-----------------------------------| 21.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5937.pdf-----------------------------------| 22.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5936.pdf-----------------------------------| 22.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5935.pdf-----------------------------------| 22.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5934.pdf-----------------------------------| 22.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5933.pdf-----------------------------------| 2

escuela de geologia, minas y geofisica
../thesis_pdf_all/5861.pdf█████------------------------------| 41.5% Complete
escuela de fisica
../thesis_pdf_all/5860.pdf█████------------------------------| 41.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5859.pdf██████-----------------------------| 42.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5886.pdf██████-----------------------------| 42.3% Complete
escuela de comunicacion social
../thesis_pdf_all/5887.pdf██████-----------------------------| 42.6% Complete
escuela de comunicacion social
../thesis_pdf_all/5888.pdf██████-----------------------------| 42.8% Complete
escuela de comunicacion social
../thesis_pdf_all/5903.pdf██████-----------------------------| 43.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5915.pdf██████-----------------------------| 43.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/5914.pdf██████-----------------------------| 43.6% Complete
escuela de

../thesis_pdf_all/4582.pdf████████████████-------------------| 62.3% Complete
escuela de enfermeria
../thesis_pdf_all/4594.pdf████████████████-------------------| 62.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4593.pdf████████████████-------------------| 62.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4592.pdf████████████████-------------------| 63.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4591.pdf████████████████-------------------| 63.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4590.pdf████████████████-------------------| 63.6% Complete
../thesis_pdf_all/4589.pdf████████████████-------------------| 63.8% Complete
escuela de enfermeria
../thesis_pdf_all/4588.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4587.pdf█████████████████------------------| 64.4% Complete
escuela de enfermeria
../thesis_pdf_all/4586.pdf█████████████████------------

escuela de ingenieria electrica
../thesis_pdf_all/4694.pdf███████████████████████████--------| 84.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4693.pdf███████████████████████████--------| 84.4% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4692.pdf███████████████████████████--------| 84.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4691.pdf███████████████████████████--------| 84.9% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4690.pdf███████████████████████████--------| 85.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4688.pdf███████████████████████████--------| 85.4% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4687.pdf███████████████████████████--------| 85.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4686.pdf███████████████████████████--------| 85.9% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4685.pdf████████████████████████████-------| 86.2% Complete
escuela de

In [33]:
#type(values_c_2)

In [34]:
df_splited[2].to_csv("./data/splitted/thesis_df_splited2_with_school.csv", index=False) #done

In [13]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[3])
values_c_3 = check_vec("../"+df_splited[3]["path"],l)
df_splited[3]['school_simple'] = values_c_3.tolist()

../thesis_pdf_all/4630.pdf-----------------------------------| 0.3% Complete
escuela de educacion
../thesis_pdf_all/4629.pdf-----------------------------------| 0.5% Complete
escuela de educacion
../thesis_pdf_all/4628.pdf-----------------------------------| 0.8% Complete
escuela de educacion
../thesis_pdf_all/4627.pdf-----------------------------------| 1.0% Complete
escuela de letras
../thesis_pdf_all/4626.pdf-----------------------------------| 1.3% Complete
escuela de educacion
../thesis_pdf_all/4625.pdf-----------------------------------| 1.5% Complete
escuela de educacion
../thesis_pdf_all/4624.pdf-----------------------------------| 1.8% Complete
escuela de educacion
../thesis_pdf_all/4623.pdf-----------------------------------| 2.1% Complete
escuela de educacion
../thesis_pdf_all/4622.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/4621.pdf-----------------------------------| 2.6% Complete
escuela de artes
../thesis_pdf_all/4620.pdf----------------------

escuela de ingenieria quimica
../thesis_pdf_all/4361.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4360.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/4359.pdf-----------------------------------| 22.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4358.pdf-----------------------------------| 23.1% Complete
../thesis_pdf_all/4357.pdf-----------------------------------| 23.3% Complete
escuela de quimica
../thesis_pdf_all/4356.pdf-----------------------------------| 23.6% Complete
../thesis_pdf_all/4355.pdf-----------------------------------| 23.8% Complete
escuela de quimica
../thesis_pdf_all/4354.pdf-----------------------------------| 24.1% Complete
escuela de quimica
../thesis_pdf_all/4353.pdf-----------------------------------| 24.4% Complete
escuela de quimica
../thesis_pdf_all/4351.pdf-----------------------------------| 24.6% Complete
escuela de fisica

escuela de computacion
../thesis_pdf_all/4277.pdf██████-----------------------------| 43.6% Complete
escuela de computacion
../thesis_pdf_all/4365.pdf██████-----------------------------| 43.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4367.pdf███████----------------------------| 44.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4487.pdf███████----------------------------| 44.4% Complete
escuela de computacion
../thesis_pdf_all/4428.pdf███████----------------------------| 44.6% Complete
escuela de quimica
../thesis_pdf_all/4456.pdf███████----------------------------| 44.9% Complete
escuela de ingenieria civil
../thesis_pdf_all/4455.pdf███████----------------------------| 45.1% Complete
escuela de quimica
../thesis_pdf_all/4454.pdf███████----------------------------| 45.4% Complete
escuela de enfermeria
../thesis_pdf_all/4453.pdf███████----------------------------| 45.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/4452.pdf███████--------------------

escuela de ingenieria civil
../thesis_pdf_all/4379.pdf█████████████████------------------| 64.6% Complete
escuela de idiomas modernos
../thesis_pdf_all/4378.pdf█████████████████------------------| 64.9% Complete
escuela de ingenieria civil
../thesis_pdf_all/4377.pdf█████████████████------------------| 65.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/4376.pdf█████████████████------------------| 65.4% Complete
escuela de ingenieria civil
../thesis_pdf_all/4375.pdf█████████████████------------------| 65.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/4374.pdf█████████████████------------------| 65.9% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4373.pdf██████████████████-----------------| 66.2% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4372.pdf██████████████████-----------------| 66.4% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4371.pdf██████████████████-----------------| 66.7% Complete
escuela de ingenieria quimica
../thesis_

../thesis_pdf_all/5106.pdf███████████████████████████--------| 84.6% Complete
escuela de biologia
../thesis_pdf_all/5105.pdf███████████████████████████--------| 84.9% Complete
escuela de biologia
../thesis_pdf_all/5104.pdf███████████████████████████--------| 85.1% Complete
escuela de biologia
../thesis_pdf_all/5103.pdf███████████████████████████--------| 85.4% Complete
escuela de biologia
../thesis_pdf_all/5102.pdf███████████████████████████--------| 85.6% Complete
../thesis_pdf_all/5101.pdf███████████████████████████--------| 85.9% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5100.pdf████████████████████████████-------| 86.2% Complete
../thesis_pdf_all/5099.pdf████████████████████████████-------| 86.4% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5098.pdf████████████████████████████-------| 86.7% Complete
../thesis_pdf_all/5097.pdf████████████████████████████-------| 86.9% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5095.

In [14]:
df_splited[3].to_csv("./data/splitted/thesis_df_splited3_with_school.csv", index=False)

In [37]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[4])
values_c_4 = check_vec("../"+df_splited[4]["path"],l)
df_splited[4]['school_simple'] = values_c_4.tolist() #done

../thesis_pdf_all/5045.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/5044.pdf-----------------------------------| 0.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5043.pdf-----------------------------------| 0.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5042.pdf-----------------------------------| 1.0% Complete
../thesis_pdf_all/5041.pdf-----------------------------------| 1.3% Complete
escuela de quimica
../thesis_pdf_all/5040.pdf-----------------------------------| 1.5% Complete
escuela de quimica
../thesis_pdf_all/5039.pdf-----------------------------------| 1.8% Complete
escuela de quimica
../thesis_pdf_all/5038.pdf-----------------------------------| 2.1% Complete
escuela de quimica
../thesis_pdf_all/5037.pdf-----------------------------------| 2.3% Complete
escuela de quimica
../thesis_pdf_all/5036.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/5034.pdf------------

escuela de quimica
../thesis_pdf_all/5174.pdf-----------------------------------| 22.1% Complete
escuela de computacion
../thesis_pdf_all/5172.pdf-----------------------------------| 22.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5112.pdf-----------------------------------| 22.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5171.pdf-----------------------------------| 22.8% Complete
escuela de fisica
../thesis_pdf_all/5140.pdf-----------------------------------| 23.1% Complete
escuela de fisica
../thesis_pdf_all/5139.pdf-----------------------------------| 23.3% Complete
../thesis_pdf_all/5138.pdf-----------------------------------| 23.6% Complete
escuela de biologia
../thesis_pdf_all/5137.pdf-----------------------------------| 23.8% Complete
escuela de biologia
../thesis_pdf_all/5136.pdf-----------------------------------| 24.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/5135.pdf-----------------------------------| 

escuela de geologia, minas y geofisica
../thesis_pdf_all/4816.pdf██████-----------------------------| 42.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4815.pdf██████-----------------------------| 42.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4814.pdf██████-----------------------------| 42.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4813.pdf██████-----------------------------| 42.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4812.pdf██████-----------------------------| 43.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4811.pdf██████-----------------------------| 43.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4810.pdf██████-----------------------------| 43.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4809.pdf██████-----------------------------| 43.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4808.pd

escuela de geologia, minas y geofisica
../thesis_pdf_all/4798.pdf████████████████-------------------| 62.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4797.pdf████████████████-------------------| 62.6% Complete
../thesis_pdf_all/4796.pdf████████████████-------------------| 62.8% Complete
../thesis_pdf_all/4795.pdf████████████████-------------------| 63.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4794.pdf████████████████-------------------| 63.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4793.pdf████████████████-------------------| 63.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4791.pdf████████████████-------------------| 63.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4790.pdf█████████████████------------------| 64.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4789.pdf█████████████████------------------| 64.4% Complete
escuela de geologia, mina

../thesis_pdf_all/4965.pdf██████████████████████████---------| 82.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4964.pdf██████████████████████████---------| 82.6% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4963.pdf██████████████████████████---------| 82.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4962.pdf██████████████████████████---------| 83.1% Complete
escuela de quimica
../thesis_pdf_all/4961.pdf██████████████████████████---------| 83.3% Complete
escuela de quimica
../thesis_pdf_all/4960.pdf██████████████████████████---------| 83.6% Complete
escuela de fisica
../thesis_pdf_all/4959.pdf██████████████████████████---------| 83.8% Complete
escuela de fisica
../thesis_pdf_all/4958.pdf███████████████████████████--------| 84.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4957.pdf███████████████████████████--------| 84.4% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/4924.pdf███

In [38]:
df_splited[4].to_csv("./data/splitted/thesis_df_splited4_with_school.csv", index=False) #done

In [39]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[5])
values_c_5 = check_vec("../"+df_splited[5]["path"],l)
df_splited[5]['school_simple'] = values_c_5.tolist()

../thesis_pdf_all/6228.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/6229.pdf-----------------------------------| 0.5% Complete
../thesis_pdf_all/7521.pdf-----------------------------------| 0.8% Complete
escuela de biologia
../thesis_pdf_all/7551.pdf-----------------------------------| 1.0% Complete
escuela de idiomas modernos
../thesis_pdf_all/7550.pdf-----------------------------------| 1.3% Complete
escuela de idiomas modernos
../thesis_pdf_all/7549.pdf-----------------------------------| 1.5% Complete
escuela de idiomas modernos
../thesis_pdf_all/7548.pdf-----------------------------------| 1.8% Complete
escuela de idiomas modernos
../thesis_pdf_all/7547.pdf-----------------------------------| 2.1% Complete
escuela de quimica
../thesis_pdf_all/7546.pdf-----------------------------------| 2.3% Complete
escuela de quimica
../thesis_pdf_all/7545.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/7544.pdf---------------

escuela de biologia
../thesis_pdf_all/7465.pdf-----------------------------------| 22.1% Complete
../thesis_pdf_all/7464.pdf-----------------------------------| 22.3% Complete
escuela de biologia
../thesis_pdf_all/7463.pdf-----------------------------------| 22.6% Complete
escuela de biologia
../thesis_pdf_all/7462.pdf-----------------------------------| 22.8% Complete
escuela de biologia
../thesis_pdf_all/7461.pdf-----------------------------------| 23.1% Complete
escuela de quimica
../thesis_pdf_all/7460.pdf-----------------------------------| 23.3% Complete
escuela de biologia
../thesis_pdf_all/7489.pdf-----------------------------------| 23.6% Complete
escuela de quimica
../thesis_pdf_all/7490.pdf-----------------------------------| 23.8% Complete
escuela de quimica
../thesis_pdf_all/7491.pdf-----------------------------------| 24.1% Complete
escuela de quimica
../thesis_pdf_all/7506.pdf-----------------------------------| 24.4% Complete
escuela de computacion
../thesis_pdf_all/751

escuela de geografia
../thesis_pdf_all/7702.pdf██████-----------------------------| 42.6% Complete
escuela de geografia
../thesis_pdf_all/7701.pdf██████-----------------------------| 42.8% Complete
escuela de geografia
../thesis_pdf_all/7700.pdf██████-----------------------------| 43.1% Complete
escuela de geografia
../thesis_pdf_all/7698.pdf██████-----------------------------| 43.3% Complete
escuela de geografia
../thesis_pdf_all/7685.pdf██████-----------------------------| 43.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/7697.pdf██████-----------------------------| 43.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7696.pdf███████----------------------------| 44.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7695.pdf███████----------------------------| 44.4% Complete
../thesis_pdf_all/7694.pdf███████----------------------------| 44.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/7693.pdf███████----------------------------

../thesis_pdf_all/7271.pdf████████████████-------------------| 63.3% Complete
escuela de psicologia
../thesis_pdf_all/7299.pdf████████████████-------------------| 63.6% Complete
escuela de psicologia
../thesis_pdf_all/7298.pdf████████████████-------------------| 63.8% Complete
escuela de psicologia
../thesis_pdf_all/7297.pdf█████████████████------------------| 64.1% Complete
escuela de psicologia
../thesis_pdf_all/7296.pdf█████████████████------------------| 64.4% Complete
escuela de psicologia
../thesis_pdf_all/7295.pdf█████████████████------------------| 64.6% Complete
escuela de psicologia
../thesis_pdf_all/7294.pdf█████████████████------------------| 64.9% Complete
escuela de educacion
../thesis_pdf_all/7293.pdf█████████████████------------------| 65.1% Complete
escuela de artes
../thesis_pdf_all/7292.pdf█████████████████------------------| 65.4% Complete
escuela de artes
../thesis_pdf_all/7291.pdf█████████████████------------------| 65.6% Complete
escuela de artes
../thesis_pdf_al

escuela de psicologia
../thesis_pdf_all/7218.pdf███████████████████████████--------| 84.4% Complete
escuela de psicologia
../thesis_pdf_all/7217.pdf███████████████████████████--------| 84.6% Complete
escuela de psicologia
../thesis_pdf_all/7216.pdf███████████████████████████--------| 84.9% Complete
escuela de psicologia
../thesis_pdf_all/7215.pdf███████████████████████████--------| 85.1% Complete
escuela de psicologia
../thesis_pdf_all/7214.pdf███████████████████████████--------| 85.4% Complete
escuela de psicologia
../thesis_pdf_all/7213.pdf███████████████████████████--------| 85.6% Complete
escuela de psicologia
../thesis_pdf_all/7212.pdf███████████████████████████--------| 85.9% Complete
escuela de psicologia
../thesis_pdf_all/7239.pdf████████████████████████████-------| 86.2% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/7240.pdf████████████████████████████-------| 86.4% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/7241.pdf██████████████████

In [40]:
#guarda las escuelas clasificadas en un csv
df_splited[5].to_csv("./data/splitted/thesis_df_splited5_with_school.csv", index=False)

In [41]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[6])
values_c_6 = check_vec("../"+df_splited[6]["path"],l)
df_splited[6]['school_simple'] = values_c_6.tolist()

../thesis_pdf_all/7402.pdf-----------------------------------| 0.3% Complete
escuela de psicologia
../thesis_pdf_all/7401.pdf-----------------------------------| 0.5% Complete
escuela de psicologia
../thesis_pdf_all/7400.pdf-----------------------------------| 0.8% Complete
escuela de psicologia
../thesis_pdf_all/7399.pdf-----------------------------------| 1.0% Complete
escuela de psicologia
../thesis_pdf_all/7398.pdf-----------------------------------| 1.3% Complete
escuela de psicologia
../thesis_pdf_all/7397.pdf-----------------------------------| 1.5% Complete
escuela de psicologia
../thesis_pdf_all/7424.pdf-----------------------------------| 1.8% Complete
escuela de computacion
../thesis_pdf_all/7425.pdf-----------------------------------| 2.1% Complete
escuela de computacion
../thesis_pdf_all/7426.pdf-----------------------------------| 2.3% Complete
escuela de computacion
../thesis_pdf_all/7443.pdf-----------------------------------| 2.6% Complete
escuela de biologia
../thesis

../thesis_pdf_all/7366.pdf-----------------------------------| 21.8% Complete
escuela de psicologia
../thesis_pdf_all/7378.pdf-----------------------------------| 22.1% Complete
../thesis_pdf_all/7377.pdf-----------------------------------| 22.3% Complete
escuela de psicologia
../thesis_pdf_all/7376.pdf-----------------------------------| 22.6% Complete
escuela de psicologia
../thesis_pdf_all/7375.pdf-----------------------------------| 22.8% Complete
escuela de psicologia
../thesis_pdf_all/7374.pdf-----------------------------------| 23.1% Complete
escuela de psicologia
../thesis_pdf_all/7373.pdf-----------------------------------| 23.3% Complete
escuela de psicologia
../thesis_pdf_all/7372.pdf-----------------------------------| 23.6% Complete
escuela de psicologia
../thesis_pdf_all/7371.pdf-----------------------------------| 23.8% Complete
escuela de psicologia
../thesis_pdf_all/7370.pdf-----------------------------------| 24.1% Complete
escuela de psicologia
../thesis_pdf_all/7369

escuela de trabajo social
../thesis_pdf_all/7991.pdf██████-----------------------------| 43.1% Complete
escuela de trabajo social
../thesis_pdf_all/7990.pdf██████-----------------------------| 43.3% Complete
escuela de trabajo social
../thesis_pdf_all/7989.pdf██████-----------------------------| 43.6% Complete
escuela de trabajo social
../thesis_pdf_all/7988.pdf██████-----------------------------| 43.8% Complete
escuela de trabajo social
../thesis_pdf_all/7987.pdf███████----------------------------| 44.1% Complete
escuela de trabajo social
../thesis_pdf_all/7986.pdf███████----------------------------| 44.4% Complete
escuela de biologia
../thesis_pdf_all/7985.pdf███████----------------------------| 44.6% Complete
escuela de biologia
../thesis_pdf_all/7984.pdf███████----------------------------| 44.9% Complete
escuela de biologia
../thesis_pdf_all/7983.pdf███████----------------------------| 45.1% Complete
escuela de biologia
../thesis_pdf_all/7982.pdf███████----------------------------|

../thesis_pdf_all/8182.pdf█████████████████------------------| 64.6% Complete
../thesis_pdf_all/8183.pdf█████████████████------------------| 64.9% Complete
../thesis_pdf_all/8198.pdf█████████████████------------------| 65.1% Complete
escuela de artes
../thesis_pdf_all/8210.pdf█████████████████------------------| 65.4% Complete
escuela de sociologia
../thesis_pdf_all/8209.pdf█████████████████------------------| 65.6% Complete
escuela de sociologia
../thesis_pdf_all/8208.pdf█████████████████------------------| 65.9% Complete
escuela de sociologia
../thesis_pdf_all/8207.pdf██████████████████-----------------| 66.2% Complete
../thesis_pdf_all/8206.pdf██████████████████-----------------| 66.4% Complete
../thesis_pdf_all/8205.pdf██████████████████-----------------| 66.7% Complete
escuela de agronomia
../thesis_pdf_all/8204.pdf██████████████████-----------------| 66.9% Complete
../thesis_pdf_all/8203.pdf██████████████████-----------------| 67.2% Complete
../thesis_pdf_all/8202.pdf████████████

../thesis_pdf_all/8125.pdf████████████████████████████-------| 87.2% Complete
escuela de ingenieria electrica
../thesis_pdf_all/8124.pdf████████████████████████████-------| 87.4% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7969.pdf████████████████████████████-------| 87.7% Complete
escuela de trabajo social
../thesis_pdf_all/7968.pdf████████████████████████████-------| 87.9% Complete
escuela de biologia
../thesis_pdf_all/7967.pdf█████████████████████████████------| 88.2% Complete
escuela de biologia
../thesis_pdf_all/7776.pdf█████████████████████████████------| 88.5% Complete
escuela de psicologia
../thesis_pdf_all/7811.pdf█████████████████████████████------| 88.7% Complete
escuela de psicologia
../thesis_pdf_all/7810.pdf█████████████████████████████------| 89.0% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7809.pdf█████████████████████████████------| 89.2% Complete
escuela de psicologia
../thesis_pdf_all/7808.pdf█████████████████████████████------| 89.5% C

In [42]:
df_splited[6].to_csv("./data/splitted/thesis_df_splited6_with_school.csv", index=False)

In [43]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[7])
values_c_7 = check_vec("../"+df_splited[7]["path"],l)
df_splited[7]['school_simple'] = values_c_7.tolist()

../thesis_pdf_all/7827.pdf-----------------------------------| 0.3% Complete
escuela de psicologia
../thesis_pdf_all/7826.pdf-----------------------------------| 0.5% Complete
escuela de psicologia
../thesis_pdf_all/7825.pdf-----------------------------------| 0.8% Complete
escuela de psicologia
../thesis_pdf_all/7824.pdf-----------------------------------| 1.0% Complete
escuela de psicologia
../thesis_pdf_all/7823.pdf-----------------------------------| 1.3% Complete
escuela de psicologia
../thesis_pdf_all/7822.pdf-----------------------------------| 1.5% Complete
escuela de psicologia
../thesis_pdf_all/7821.pdf-----------------------------------| 1.8% Complete
escuela de psicologia
../thesis_pdf_all/7820.pdf-----------------------------------| 2.1% Complete
escuela de psicologia
../thesis_pdf_all/7819.pdf-----------------------------------| 2.3% Complete
escuela de psicologia
../thesis_pdf_all/7818.pdf-----------------------------------| 2.6% Complete
escuela de psicologia
../thesis_

escuela de trabajo social
../thesis_pdf_all/7928.pdf-----------------------------------| 21.3% Complete
escuela de trabajo social
../thesis_pdf_all/7927.pdf-----------------------------------| 21.5% Complete
escuela de trabajo social
../thesis_pdf_all/7926.pdf-----------------------------------| 21.8% Complete
escuela de trabajo social
../thesis_pdf_all/7925.pdf-----------------------------------| 22.1% Complete
escuela de trabajo social
../thesis_pdf_all/7924.pdf-----------------------------------| 22.3% Complete
escuela de trabajo social
../thesis_pdf_all/7923.pdf-----------------------------------| 22.6% Complete
escuela de trabajo social
../thesis_pdf_all/7922.pdf-----------------------------------| 22.8% Complete
escuela de trabajo social
../thesis_pdf_all/7921.pdf-----------------------------------| 23.1% Complete
escuela de trabajo social
../thesis_pdf_all/7920.pdf-----------------------------------| 23.3% Complete
escuela de trabajo social
../thesis_pdf_all/7919.pdf------------

escuela de trabajo social
../thesis_pdf_all/7846.pdf██████-----------------------------| 42.1% Complete
escuela de comunicacion social
../thesis_pdf_all/7872.pdf██████-----------------------------| 42.3% Complete
escuela de trabajo social
../thesis_pdf_all/7874.pdf██████-----------------------------| 42.6% Complete
escuela de trabajo social
../thesis_pdf_all/7903.pdf██████-----------------------------| 42.8% Complete
escuela de trabajo social
../thesis_pdf_all/7889.pdf██████-----------------------------| 43.1% Complete
escuela de trabajo social
../thesis_pdf_all/7902.pdf██████-----------------------------| 43.3% Complete
escuela de trabajo social
../thesis_pdf_all/7901.pdf██████-----------------------------| 43.6% Complete
escuela de trabajo social
../thesis_pdf_all/7900.pdf██████-----------------------------| 43.8% Complete
escuela de trabajo social
../thesis_pdf_all/7899.pdf███████----------------------------| 44.1% Complete
escuela de trabajo social
../thesis_pdf_all/7898.pdf███████

../thesis_pdf_all/6580.pdf████████████████-------------------| 62.8% Complete
escuela de computacion
../thesis_pdf_all/6579.pdf████████████████-------------------| 63.1% Complete
escuela de computacion
../thesis_pdf_all/6578.pdf████████████████-------------------| 63.3% Complete
escuela de computacion
../thesis_pdf_all/6577.pdf████████████████-------------------| 63.6% Complete
escuela de computacion
../thesis_pdf_all/6576.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6575.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6574.pdf█████████████████------------------| 64.4% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6573.pdf█████████████████------------------| 64.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6572.pdf█████████████████------------------| 64.9% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6571.pdf█████████████████---

../thesis_pdf_all/6687.pdf██████████████████████████---------| 82.3% Complete
escuela de geografia
../thesis_pdf_all/6686.pdf██████████████████████████---------| 82.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/6685.pdf██████████████████████████---------| 82.8% Complete
escuela de educacion
../thesis_pdf_all/6684.pdf██████████████████████████---------| 83.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6683.pdf██████████████████████████---------| 83.3% Complete
escuela de educacion
../thesis_pdf_all/6682.pdf██████████████████████████---------| 83.6% Complete
escuela de educacion
../thesis_pdf_all/6681.pdf██████████████████████████---------| 83.8% Complete
escuela de educacion
../thesis_pdf_all/6680.pdf███████████████████████████--------| 84.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6679.pdf███████████████████████████--------| 84.4% Complete
escuela de educacion
../thesis_pdf_all/6678.pdf███████████████████████████--------| 84.6% Compl

In [44]:
df_splited[7].to_csv("./data/splitted/thesis_df_splited7_with_school.csv", index=False)

In [45]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[8])
values_c_8 = check_vec("../"+df_splited[8]["path"],l)
df_splited[8]['school_simple'] = values_c_8.tolist()

../thesis_pdf_all/6617.pdf-----------------------------------| 0.3% Complete
escuela de enfermeria
../thesis_pdf_all/6616.pdf-----------------------------------| 0.5% Complete
../thesis_pdf_all/6615.pdf-----------------------------------| 0.8% Complete
escuela de enfermeria
../thesis_pdf_all/6614.pdf-----------------------------------| 1.0% Complete
escuela de enfermeria
../thesis_pdf_all/6613.pdf-----------------------------------| 1.3% Complete
escuela de enfermeria
../thesis_pdf_all/6612.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6611.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6610.pdf-----------------------------------| 2.1% Complete
escuela de enfermeria
../thesis_pdf_all/6609.pdf-----------------------------------| 2.3% Complete
escuela de enfermeria
../thesis_pdf_all/6608.pdf-----------------------------------| 2.6% Complete
escuela de enfermeria
../thesis_pdf_

../thesis_pdf_all/6319.pdf-----------------------------------| 20.5% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6320.pdf-----------------------------------| 20.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/6321.pdf-----------------------------------| 21.0% Complete
../thesis_pdf_all/6337.pdf-----------------------------------| 21.3% Complete
escuela de educacion
../thesis_pdf_all/6350.pdf-----------------------------------| 21.5% Complete
../thesis_pdf_all/6349.pdf-----------------------------------| 21.8% Complete
escuela de comunicacion social
../thesis_pdf_all/6347.pdf-----------------------------------| 22.1% Complete
escuela de educacion
../thesis_pdf_all/6346.pdf-----------------------------------| 22.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6345.pdf-----------------------------------| 22.6% Complete
../thesis_pdf_all/6344.pdf-----------------------------------| 22.8% Complete
../thesis_pdf_all/6343.pdf----------

escuela de educacion
../thesis_pdf_all/6273.pdf█████------------------------------| 40.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6272.pdf█████------------------------------| 41.0% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6271.pdf█████------------------------------| 41.3% Complete
escuela de educacion
../thesis_pdf_all/6270.pdf█████------------------------------| 41.5% Complete
escuela de educacion
../thesis_pdf_all/6269.pdf█████------------------------------| 41.8% Complete
escuela de educacion
../thesis_pdf_all/6268.pdf██████-----------------------------| 42.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6267.pdf██████-----------------------------| 42.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/6266.pdf██████-----------------------------| 42.6% Complete
escuela de educacion
../thesis_pdf_all/6265.pdf██████-----------------------------| 42.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6264.pdf██████--

../thesis_pdf_all/6412.pdf███████████████--------------------| 60.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6381.pdf███████████████--------------------| 60.5% Complete
escuela de educacion
../thesis_pdf_all/6380.pdf███████████████--------------------| 60.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6379.pdf███████████████--------------------| 61.0% Complete
escuela de educacion
../thesis_pdf_all/6378.pdf███████████████--------------------| 61.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6377.pdf███████████████--------------------| 61.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6376.pdf███████████████--------------------| 61.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6375.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6374.pdf████████████████-------------------| 62.3% Complete
escuela de educacion
../thes

escuela de bibliotecologia y archivologia
../thesis_pdf_all/7040.pdf████████████████████████-----------| 79.7% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7039.pdf█████████████████████████----------| 80.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7038.pdf█████████████████████████----------| 80.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7037.pdf█████████████████████████----------| 80.5% Complete
escuela de educacion
../thesis_pdf_all/7036.pdf█████████████████████████----------| 80.8% Complete
escuela de educacion
../thesis_pdf_all/7035.pdf█████████████████████████----------| 81.0% Complete
escuela de artes
../thesis_pdf_all/7034.pdf█████████████████████████----------| 81.3% Complete
escuela de artes
../thesis_pdf_all/7033.pdf█████████████████████████----------| 81.5% Complete
escuela de artes
../thesis_pdf_all/7032.pdf█████████████████████████----------| 81.8% Complete
escuela de artes
../thesis_pdf_all/70

escuela de ingenieria electrica
../thesis_pdf_all/7023.pdf██████████████████████████████████-| 99.7% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7022.pdf-----------------------------------| 0.0% Complete
escuela de bibliotecologia y archivologia


In [46]:
df_splited[8].to_csv("./data/splitted/thesis_df_splited8_with_school.csv", index=False)

In [47]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[9])
values_c_9 = check_vec("../"+df_splited[9]["path"],l)
df_splited[9]['school_simple'] = values_c_9.tolist()

../thesis_pdf_all/7021.pdf-----------------------------------| 0.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7020.pdf-----------------------------------| 0.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7019.pdf-----------------------------------| 0.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7018.pdf-----------------------------------| 1.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7017.pdf-----------------------------------| 1.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7016.pdf-----------------------------------| 1.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7015.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7014.pdf-----------------------------------| 2.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7013.pdf--------------------------------

../thesis_pdf_all/7187.pdf-----------------------------------| 20.3% Complete
escuela de psicologia
../thesis_pdf_all/7186.pdf-----------------------------------| 20.5% Complete
escuela de psicologia
../thesis_pdf_all/7185.pdf-----------------------------------| 20.8% Complete
escuela de psicologia
../thesis_pdf_all/7184.pdf-----------------------------------| 21.0% Complete
escuela de psicologia
../thesis_pdf_all/7183.pdf-----------------------------------| 21.3% Complete
escuela de trabajo social
../thesis_pdf_all/7182.pdf-----------------------------------| 21.5% Complete
escuela de psicologia
../thesis_pdf_all/7181.pdf-----------------------------------| 21.8% Complete
escuela de psicologia
../thesis_pdf_all/7149.pdf-----------------------------------| 22.1% Complete
escuela de psicologia
../thesis_pdf_all/7147.pdf-----------------------------------| 22.3% Complete
escuela de psicologia
../thesis_pdf_all/7088.pdf-----------------------------------| 22.6% Complete
escuela de psicolo

escuela de educacion
../thesis_pdf_all/6801.pdf█████------------------------------| 41.0% Complete
escuela de educacion
../thesis_pdf_all/6800.pdf█████------------------------------| 41.3% Complete
escuela de educacion
../thesis_pdf_all/6799.pdf█████------------------------------| 41.5% Complete
escuela de educacion
../thesis_pdf_all/6798.pdf█████------------------------------| 41.8% Complete
escuela de educacion
../thesis_pdf_all/6797.pdf██████-----------------------------| 42.1% Complete
escuela de educacion
../thesis_pdf_all/6796.pdf██████-----------------------------| 42.3% Complete
escuela de educacion
../thesis_pdf_all/6795.pdf██████-----------------------------| 42.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6794.pdf██████-----------------------------| 42.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6793.pdf██████-----------------------------| 43.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6792.pdf██████--------------------------

escuela de educacion
../thesis_pdf_all/6767.pdf████████████████-------------------| 62.1% Complete
escuela de educacion
../thesis_pdf_all/6779.pdf████████████████-------------------| 62.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6778.pdf████████████████-------------------| 62.6% Complete
escuela de educacion
../thesis_pdf_all/6777.pdf████████████████-------------------| 62.8% Complete
escuela de educacion
../thesis_pdf_all/6776.pdf████████████████-------------------| 63.1% Complete
escuela de educacion
../thesis_pdf_all/6775.pdf████████████████-------------------| 63.3% Complete
escuela de educacion
../thesis_pdf_all/6774.pdf████████████████-------------------| 63.6% Complete
escuela de educacion
../thesis_pdf_all/6773.pdf████████████████-------------------| 63.8% Complete
escuela de educacion
../thesis_pdf_all/6772.pdf█████████████████------------------| 64.1% Complete
escuela de educacion
../thesis_pdf_all/6771.pdf█████████████████------------------| 64.4% Complete


../thesis_pdf_all/6944.pdf██████████████████████████---------| 82.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6943.pdf██████████████████████████---------| 82.6% Complete
../thesis_pdf_all/6942.pdf██████████████████████████---------| 82.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6941.pdf██████████████████████████---------| 83.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6940.pdf██████████████████████████---------| 83.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6939.pdf██████████████████████████---------| 83.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6938.pdf██████████████████████████---------| 83.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6937.pdf███████████████████████████--------| 84.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6936.pdf███████████████████████████--------| 84.4% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6904.pdf████████████████████████

In [48]:
df_splited[9].to_csv("./data/splitted/thesis_df_splited9_with_school.csv", index=False)

In [49]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[10])
values_c_10 = check_vec("../"+df_splited[10]["path"],l)
df_splited[10]['school_simple'] = values_c_10.tolist()

../thesis_pdf_all/4244.pdf-----------------------------------| 0.3% Complete
escuela de computacion
../thesis_pdf_all/4243.pdf-----------------------------------| 0.5% Complete
escuela de computacion
../thesis_pdf_all/1598.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1627.pdf-----------------------------------| 1.0% Complete
escuela de comunicacion social
../thesis_pdf_all/1626.pdf-----------------------------------| 1.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1625.pdf-----------------------------------| 1.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1624.pdf-----------------------------------| 1.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1623.pdf-----------------------------------| 2.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1622.pdf-----------------------------------| 2.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1621.pdf---------------

escuela de enfermeria
../thesis_pdf_all/1551.pdf-----------------------------------| 20.3% Complete
escuela de enfermeria
../thesis_pdf_all/1550.pdf-----------------------------------| 20.5% Complete
../thesis_pdf_all/1549.pdf-----------------------------------| 20.8% Complete
escuela de enfermeria
../thesis_pdf_all/1548.pdf-----------------------------------| 21.0% Complete
../thesis_pdf_all/1547.pdf-----------------------------------| 21.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1546.pdf-----------------------------------| 21.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1544.pdf-----------------------------------| 21.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1543.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1542.pdf-----------------------------------| 22.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1541.pdf-----------------------------------| 22.6% Complete


escuela de ingenieria mecanica
../thesis_pdf_all/1779.pdf█████------------------------------| 40.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1778.pdf█████------------------------------| 40.8% Complete
escuela de enfermeria
../thesis_pdf_all/1777.pdf█████------------------------------| 41.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1776.pdf█████------------------------------| 41.3% Complete
escuela de enfermeria
../thesis_pdf_all/1775.pdf█████------------------------------| 41.5% Complete
escuela de enfermeria
../thesis_pdf_all/1774.pdf█████------------------------------| 41.8% Complete
escuela de enfermeria
../thesis_pdf_all/1773.pdf██████-----------------------------| 42.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1772.pdf██████-----------------------------| 42.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1771.pdf██████-----------------------------| 42.6% Complete
escuela de enfermeria
../thesis_pdf_all/1770.p

escuela de educacion
../thesis_pdf_all/1698.pdf███████████████--------------------| 61.0% Complete
escuela de educacion
../thesis_pdf_all/1697.pdf███████████████--------------------| 61.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1696.pdf███████████████--------------------| 61.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1695.pdf███████████████--------------------| 61.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1694.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1693.pdf████████████████-------------------| 62.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1537.pdf████████████████-------------------| 62.6% Complete
escuela de comunicacion social
../thesis_pdf_all/1535.pdf████████████████-------------------| 62.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1291.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria m

../thesis_pdf_all/1313.pdf█████████████████████████----------| 80.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1312.pdf█████████████████████████----------| 81.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1311.pdf█████████████████████████----------| 81.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/1310.pdf█████████████████████████----------| 81.5% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1309.pdf█████████████████████████----------| 81.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1308.pdf██████████████████████████---------| 82.1% Complete
../thesis_pdf_all/1307.pdf██████████████████████████---------| 82.3% Complete
../thesis_pdf_all/1306.pdf██████████████████████████---------| 82.6% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1305.pdf██████████████████████████---------| 82.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1304.pdf██████████████████████████---------| 83.1

In [50]:
df_splited[10].to_csv("./data/splitted/thesis_df_splited10_with_school.csv", index=False)

In [20]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[11])
values_c_11 = check_vec("../"+df_splited[11]["path"],l)
df_splited[11]['school_simple'] = values_c_11.tolist()

../thesis_pdf_all/1482.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/1481.pdf-----------------------------------| 0.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1480.pdf-----------------------------------| 0.8% Complete
../thesis_pdf_all/1479.pdf-----------------------------------| 1.0% Complete
../thesis_pdf_all/1478.pdf-----------------------------------| 1.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/1477.pdf-----------------------------------| 1.5% Complete
../thesis_pdf_all/1504.pdf-----------------------------------| 1.8% Complete
escuela de enfermeria
../thesis_pdf_all/1505.pdf-----------------------------------| 2.1% Complete
escuela de enfermeria
../thesis_pdf_all/1506.pdf-----------------------------------| 2.3% Complete
escuela de enfermeria
../thesis_pdf_all/1521.pdf-----------------------------------| 2.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1533.pdf-------------------------------

escuela de comunicacion social
../thesis_pdf_all/1463.pdf-----------------------------------| 20.8% Complete
../thesis_pdf_all/1462.pdf-----------------------------------| 21.0% Complete
escuela de comunicacion social
../thesis_pdf_all/1461.pdf-----------------------------------| 21.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1459.pdf-----------------------------------| 21.5% Complete
escuela de enfermeria
../thesis_pdf_all/1446.pdf-----------------------------------| 21.8% Complete
escuela de enfermeria
../thesis_pdf_all/1458.pdf-----------------------------------| 22.1% Complete
escuela de enfermeria
../thesis_pdf_all/1457.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1456.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1455.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1454.pdf--------------------

../thesis_pdf_all/2057.pdf█████------------------------------| 41.8% Complete
escuela de educacion
../thesis_pdf_all/2056.pdf██████-----------------------------| 42.1% Complete
escuela de educacion
../thesis_pdf_all/2055.pdf██████-----------------------------| 42.3% Complete
escuela de educacion
../thesis_pdf_all/2054.pdf██████-----------------------------| 42.6% Complete
escuela de educacion
../thesis_pdf_all/2053.pdf██████-----------------------------| 42.8% Complete
escuela de artes
../thesis_pdf_all/2052.pdf██████-----------------------------| 43.1% Complete
escuela de educacion
../thesis_pdf_all/2051.pdf██████-----------------------------| 43.3% Complete
escuela de educacion
../thesis_pdf_all/2050.pdf██████-----------------------------| 43.6% Complete
../thesis_pdf_all/2049.pdf██████-----------------------------| 43.8% Complete
escuela de artes
../thesis_pdf_all/2048.pdf███████----------------------------| 44.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/

escuela de educacion
../thesis_pdf_all/2218.pdf████████████████-------------------| 63.3% Complete
escuela de educacion
../thesis_pdf_all/2217.pdf████████████████-------------------| 63.6% Complete
escuela de educacion
../thesis_pdf_all/2216.pdf████████████████-------------------| 63.8% Complete
escuela de educacion
../thesis_pdf_all/2215.pdf█████████████████------------------| 64.1% Complete
escuela de educacion
../thesis_pdf_all/2242.pdf█████████████████------------------| 64.4% Complete
escuela de educacion
../thesis_pdf_all/2243.pdf█████████████████------------------| 64.6% Complete
escuela de educacion
../thesis_pdf_all/2244.pdf█████████████████------------------| 64.9% Complete
escuela de educacion
../thesis_pdf_all/2259.pdf█████████████████------------------| 65.1% Complete
escuela de educacion
../thesis_pdf_all/2271.pdf█████████████████------------------| 65.4% Complete
escuela de educacion
../thesis_pdf_all/2270.pdf█████████████████------------------| 65.6% Complete
escuela de

escuela de ingenieria de petroleo
../thesis_pdf_all/2196.pdf███████████████████████████--------| 84.6% Complete
../thesis_pdf_all/2195.pdf███████████████████████████--------| 84.9% Complete
../thesis_pdf_all/2194.pdf███████████████████████████--------| 85.1% Complete
escuela de agronomia
../thesis_pdf_all/2193.pdf███████████████████████████--------| 85.4% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2192.pdf███████████████████████████--------| 85.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2191.pdf███████████████████████████--------| 85.9% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2190.pdf████████████████████████████-------| 86.2% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2189.pdf████████████████████████████-------| 86.4% Complete
../thesis_pdf_all/2188.pdf████████████████████████████-------| 86.7% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2187.pdf████████████████████████████-------| 86.9% Comp

In [21]:
df_splited[11].to_csv("./data/splitted/thesis_df_splited11_with_school.csv", index=False)

In [22]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[12])
values_c_12 = check_vec("../"+df_splited[12]["path"],l)
df_splited[12]['school_simple'] = values_c_12.tolist()

../thesis_pdf_all/1888.pdf-----------------------------------| 0.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1887.pdf-----------------------------------| 0.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/1886.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/1885.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1884.pdf-----------------------------------| 1.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/1883.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1882.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1881.pdf-----------------------------------| 2.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/1880.pdf-----------------------------------| 2.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1879.p

../thesis_pdf_all/1990.pdf-----------------------------------| 21.0% Complete
escuela de enfermeria
../thesis_pdf_all/1989.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1988.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1987.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1986.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1985.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1984.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1983.pdf-----------------------------------| 22.8% Complete
escuela de enfermeria
../thesis_pdf_all/1982.pdf-----------------------------------| 23.1% Complete
escuela de enfermeria
../thesis_pdf_all/1981.pdf--

escuela de ingenieria de petroleo
../thesis_pdf_all/1910.pdf█████------------------------------| 41.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1909.pdf█████------------------------------| 41.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1908.pdf█████------------------------------| 41.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1907.pdf██████-----------------------------| 42.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1933.pdf██████-----------------------------| 42.3% Complete
escuela de artes
../thesis_pdf_all/1935.pdf██████-----------------------------| 42.6% Complete
escuela de artes
../thesis_pdf_all/1964.pdf██████-----------------------------| 42.8% Complete
escuela de enfermeria
../thesis_pdf_all/1950.pdf██████-----------------------------| 43.1% Complete
../thesis_pdf_all/1963.pdf██████-----------------------------| 43.3% Complete
escuela de enfermeria
../thesis_pdf_all/1962.pdf██████------------------

escuela de ingenieria electrica
../thesis_pdf_all/524.pdf█████████████████-------------------| 62.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/511.pdf█████████████████-------------------| 62.3% Complete
../thesis_pdf_all/523.pdf█████████████████-------------------| 62.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/522.pdf█████████████████-------------------| 62.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/521.pdf█████████████████-------------------| 63.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/520.pdf█████████████████-------------------| 63.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/519.pdf█████████████████-------------------| 63.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/518.pdf█████████████████-------------------| 63.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/517.pdf██████████████████------------------| 64.1% Complete
escuela de ingenieria electrica
../thesis_

escuela de comunicacion social
../thesis_pdf_all/743.pdf███████████████████████████---------| 82.6% Complete
escuela de comunicacion social
../thesis_pdf_all/742.pdf███████████████████████████---------| 82.8% Complete
escuela de comunicacion social
../thesis_pdf_all/741.pdf███████████████████████████---------| 83.1% Complete
escuela de comunicacion social
../thesis_pdf_all/740.pdf███████████████████████████---------| 83.3% Complete
escuela de comunicacion social
../thesis_pdf_all/730.pdf███████████████████████████---------| 83.6% Complete
escuela de comunicacion social
../thesis_pdf_all/729.pdf███████████████████████████---------| 83.8% Complete
escuela de comunicacion social
../thesis_pdf_all/727.pdf████████████████████████████--------| 84.1% Complete
escuela de comunicacion social
../thesis_pdf_all/726.pdf████████████████████████████--------| 84.4% Complete
escuela de comunicacion social
../thesis_pdf_all/724.pdf████████████████████████████--------| 84.6% Complete
escuela de comunica

In [23]:
df_splited[12].to_csv("./data/splitted/thesis_df_splited12_with_school.csv", index=False)

In [24]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[13])
values_c_13 = check_vec("../"+df_splited[13]["path"],l)
df_splited[13]['school_simple'] = values_c_13.tolist()

../thesis_pdf_all/655.pdf------------------------------------| 0.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/654.pdf------------------------------------| 0.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/653.pdf------------------------------------| 0.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/652.pdf------------------------------------| 1.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/651.pdf------------------------------------| 1.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/650.pdf------------------------------------| 1.5% Complete
../thesis_pdf_all/649.pdf------------------------------------| 1.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/648.pdf------------------------------------| 2.1% Complete
escuela de enfermeria
../thesis_pdf_all/647.pdf------------------------------------| 2.3% Complete
escuela de antropologia
../thesis

escuela de ingenieria de petroleo
../thesis_pdf_all/91.pdf-------------------------------------| 20.5% Complete
escuela de ingenieria quimica
../thesis_pdf_all/92.pdf-------------------------------------| 20.8% Complete
../thesis_pdf_all/93.pdf-------------------------------------| 21.0% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/108.pdf------------------------------------| 21.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/120.pdf------------------------------------| 21.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/119.pdf------------------------------------| 21.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/118.pdf------------------------------------| 22.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/117.pdf------------------------------------| 22.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/116.pdf------------------------------------| 22.6% C

escuela de ingenieria quimica
../thesis_pdf_all/49.pdf██████-------------------------------| 39.7% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/48.pdf███████------------------------------| 40.0% Complete
../thesis_pdf_all/46.pdf███████------------------------------| 40.3% Complete
../thesis_pdf_all/33.pdf███████------------------------------| 40.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/45.pdf███████------------------------------| 40.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/44.pdf███████------------------------------| 41.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/43.pdf███████------------------------------| 41.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/42.pdf███████------------------------------| 41.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/41.pdf███████------------------------------| 41.8% Complete
../thesis_pdf_all/40.pdf████████-------------------------

escuela de ingenieria quimica
../thesis_pdf_all/150.pdf████████████████--------------------| 60.8% Complete
../thesis_pdf_all/149.pdf████████████████--------------------| 61.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/148.pdf████████████████--------------------| 61.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/147.pdf████████████████--------------------| 61.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/146.pdf████████████████--------------------| 61.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/145.pdf█████████████████-------------------| 62.1% Complete
../thesis_pdf_all/144.pdf█████████████████-------------------| 62.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/143.pdf█████████████████-------------------| 62.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/142.pdf█████████████████-------------------| 62.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/141.pdf█████████████████-----

escuela de artes
../thesis_pdf_all/1109.pdf██████████████████████████---------| 82.1% Complete
escuela de artes
../thesis_pdf_all/1108.pdf██████████████████████████---------| 82.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1107.pdf██████████████████████████---------| 82.6% Complete
escuela de artes
../thesis_pdf_all/1106.pdf██████████████████████████---------| 82.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1134.pdf██████████████████████████---------| 83.1% Complete
escuela de artes
../thesis_pdf_all/1135.pdf██████████████████████████---------| 83.3% Complete
../thesis_pdf_all/1136.pdf██████████████████████████---------| 83.6% Complete
escuela de artes
../thesis_pdf_all/1151.pdf██████████████████████████---------| 83.8% Complete
escuela de artes
../thesis_pdf_all/1163.pdf███████████████████████████--------| 84.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1162.pdf███████████████████████████--------| 84.4% Complete


In [27]:
df_splited[13].to_csv("./data/splitted/thesis_df_splited13_with_school.csv", index=False)

In [28]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[14])
values_c_14 = check_vec("../"+df_splited[14]["path"],l)
df_splited[14]['school_simple'] = values_c_14.tolist()

../thesis_pdf_all/1099.pdf-----------------------------------| 0.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1098.pdf-----------------------------------| 0.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1097.pdf-----------------------------------| 0.8% Complete
escuela de artes
../thesis_pdf_all/1096.pdf-----------------------------------| 1.0% Complete
escuela de artes
../thesis_pdf_all/1095.pdf-----------------------------------| 1.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1094.pdf-----------------------------------| 1.5% Complete
escuela de artes
../thesis_pdf_all/1093.pdf-----------------------------------| 1.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1092.pdf-----------------------------------| 2.1% Complete
../thesis_pdf_all/1091.pdf-----------------------------------| 2.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1090.pdf--------------------

escuela de ingenieria de petroleo
../thesis_pdf_all/1267.pdf-----------------------------------| 20.8% Complete
escuela de enfermeria
../thesis_pdf_all/1266.pdf-----------------------------------| 21.0% Complete
escuela de enfermeria
../thesis_pdf_all/1265.pdf-----------------------------------| 21.3% Complete
escuela de enfermeria
../thesis_pdf_all/1264.pdf-----------------------------------| 21.5% Complete
../thesis_pdf_all/1263.pdf-----------------------------------| 21.8% Complete
escuela de enfermeria
../thesis_pdf_all/1229.pdf-----------------------------------| 22.1% Complete
escuela de enfermeria
../thesis_pdf_all/1227.pdf-----------------------------------| 22.3% Complete
escuela de enfermeria
../thesis_pdf_all/1167.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1226.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/1194.pdf-------------

../thesis_pdf_all/875.pdf██████------------------------------| 41.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/874.pdf██████------------------------------| 41.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/873.pdf███████-----------------------------| 42.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/872.pdf███████-----------------------------| 42.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/871.pdf███████-----------------------------| 42.6% Complete
../thesis_pdf_all/870.pdf███████-----------------------------| 42.8% Complete
../thesis_pdf_all/869.pdf███████-----------------------------| 43.1% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/868.pdf███████-----------------------------| 43.3% Complete
escuela de enfermeria
../thesis_pdf_all/867.pdf███████-----------------------------| 43.6% Complete
escuela de enfermeria
../thesis_pdf_all/866.pdf███████-----------------------------| 4

escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/854.pdf█████████████████-------------------| 62.6% Complete
escuela de enfermeria
../thesis_pdf_all/853.pdf█████████████████-------------------| 62.8% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/852.pdf█████████████████-------------------| 63.1% Complete
escuela de enfermeria
../thesis_pdf_all/851.pdf█████████████████-------------------| 63.3% Complete
escuela de enfermeria
../thesis_pdf_all/850.pdf█████████████████-------------------| 63.6% Complete
escuela de ingenieria quimica
../thesis_pdf_all/849.pdf█████████████████-------------------| 63.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/848.pdf██████████████████------------------| 64.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/847.pdf██████████████████------------------| 64.4% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/846.pdf█████████

escuela de comunicacion social
../thesis_pdf_all/1024.pdf█████████████████████████----------| 81.8% Complete
../thesis_pdf_all/1023.pdf██████████████████████████---------| 82.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1022.pdf██████████████████████████---------| 82.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1021.pdf██████████████████████████---------| 82.6% Complete
escuela de comunicacion social
../thesis_pdf_all/1020.pdf██████████████████████████---------| 82.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1019.pdf██████████████████████████---------| 83.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1018.pdf██████████████████████████---------| 83.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1017.pdf██████████████████████████---------| 83.6% Complete
escuela de comunicacion social
../thesis_pdf_all/1016.pdf██████████████████████████---------| 83.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1

In [29]:
df_splited[14].to_csv("./data/splitted/thesis_df_splited14_with_school.csv", index=False)

In [15]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[15])
values_c_15 = check_vec("../"+df_splited[15]["path"],l)
df_splited[15]['school_simple'] = values_c_15.tolist()

../thesis_pdf_all/2274.pdf-----------------------------------| 0.3% Complete
escuela de educacion
../thesis_pdf_all/4242.pdf-----------------------------------| 0.5% Complete
escuela de computacion
../thesis_pdf_all/3564.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3592.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria civil
../thesis_pdf_all/3591.pdf-----------------------------------| 1.3% Complete
../thesis_pdf_all/3590.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/3589.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3588.pdf-----------------------------------| 2.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3587.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/3586.pdf-----------------------------------| 2.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/

../thesis_pdf_all/3514.pdf-----------------------------------| 21.0% Complete
escuela de ingenieria civil
../thesis_pdf_all/3513.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3512.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/3511.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3510.pdf-----------------------------------| 22.1% Complete
../thesis_pdf_all/3509.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3508.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/3507.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3506.pdf-----------------------------------| 23.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3505.pdf-----------------------------------| 23.3% Compl

../thesis_pdf_all/3733.pdf██████-----------------------------| 43.1% Complete
escuela de quimica
../thesis_pdf_all/3731.pdf██████-----------------------------| 43.3% Complete
escuela de quimica
../thesis_pdf_all/3718.pdf██████-----------------------------| 43.6% Complete
../thesis_pdf_all/3730.pdf██████-----------------------------| 43.8% Complete
../thesis_pdf_all/3729.pdf███████----------------------------| 44.1% Complete
../thesis_pdf_all/3728.pdf███████----------------------------| 44.4% Complete
escuela de quimica
../thesis_pdf_all/3727.pdf███████----------------------------| 44.6% Complete
../thesis_pdf_all/3726.pdf███████----------------------------| 44.9% Complete
escuela de quimica
../thesis_pdf_all/3725.pdf███████----------------------------| 45.1% Complete
../thesis_pdf_all/3724.pdf███████----------------------------| 45.4% Complete
../thesis_pdf_all/3723.pdf███████----------------------------| 45.6% Complete
escuela de quimica
../thesis_pdf_all/3722.pdf███████--------------

escuela de artes
../thesis_pdf_all/3341.pdf█████████████████------------------| 65.4% Complete
escuela de artes
../thesis_pdf_all/3340.pdf█████████████████------------------| 65.6% Complete
escuela de artes
../thesis_pdf_all/3339.pdf█████████████████------------------| 65.9% Complete
escuela de artes
../thesis_pdf_all/3338.pdf██████████████████-----------------| 66.2% Complete
escuela de artes
../thesis_pdf_all/3337.pdf██████████████████-----------------| 66.4% Complete
escuela de filosofia
../thesis_pdf_all/3336.pdf██████████████████-----------------| 66.7% Complete
escuela de artes
../thesis_pdf_all/3335.pdf██████████████████-----------------| 66.9% Complete
escuela de psicologia
../thesis_pdf_all/3334.pdf██████████████████-----------------| 67.2% Complete
escuela de artes
../thesis_pdf_all/3333.pdf██████████████████-----------------| 67.4% Complete
escuela de enfermeria
../thesis_pdf_all/3332.pdf██████████████████-----------------| 67.7% Complete
escuela de enfermeria
../thesis_pdf_

../thesis_pdf_all/3317.pdf████████████████████████████-------| 87.2% Complete
escuela de quimica
../thesis_pdf_all/3316.pdf████████████████████████████-------| 87.4% Complete
escuela de fisica
../thesis_pdf_all/3315.pdf████████████████████████████-------| 87.7% Complete
../thesis_pdf_all/3314.pdf████████████████████████████-------| 87.9% Complete
../thesis_pdf_all/3313.pdf█████████████████████████████------| 88.2% Complete
escuela de quimica
../thesis_pdf_all/3312.pdf█████████████████████████████------| 88.5% Complete
escuela de comunicacion social
../thesis_pdf_all/3311.pdf█████████████████████████████------| 88.7% Complete
escuela de comunicacion social
../thesis_pdf_all/3310.pdf█████████████████████████████------| 89.0% Complete
escuela de comunicacion social
../thesis_pdf_all/3309.pdf█████████████████████████████------| 89.2% Complete
escuela de comunicacion social
../thesis_pdf_all/3308.pdf█████████████████████████████------| 89.5% Complete
escuela de computacion
../thesis_pdf_all

In [16]:
df_splited[15].to_csv("./data/splitted/thesis_df_splited15_with_school.csv", index=False)

In [11]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[16])
values_c_16 = check_vec("../"+df_splited[16]["path"],l)
df_splited[16]['school_simple'] = values_c_16.tolist()

../thesis_pdf_all/3449.pdf-----------------------------------| 0.3% Complete
escuela de quimica
../thesis_pdf_all/3448.pdf-----------------------------------| 0.5% Complete
escuela de fisica
../thesis_pdf_all/3447.pdf-----------------------------------| 0.8% Complete
escuela de fisica
../thesis_pdf_all/3446.pdf-----------------------------------| 1.0% Complete
escuela de comunicacion social
../thesis_pdf_all/3445.pdf-----------------------------------| 1.3% Complete
escuela de comunicacion social
../thesis_pdf_all/3444.pdf-----------------------------------| 1.5% Complete
../thesis_pdf_all/3471.pdf-----------------------------------| 1.8% Complete
escuela de fisica
../thesis_pdf_all/3472.pdf-----------------------------------| 2.1% Complete
escuela de computacion
../thesis_pdf_all/3473.pdf-----------------------------------| 2.3% Complete
escuela de quimica
../thesis_pdf_all/3488.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3500.pdf--------

escuela de quimica
../thesis_pdf_all/3413.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3425.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3424.pdf-----------------------------------| 22.3% Complete
../thesis_pdf_all/3423.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/3422.pdf-----------------------------------| 22.8% Complete
../thesis_pdf_all/3421.pdf-----------------------------------| 23.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3420.pdf-----------------------------------| 23.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3419.pdf-----------------------------------| 23.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/3418.pdf-----------------------------------| 23.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3417.pdf-----------------------------------| 24.1% Complete
../th

escuela de ingenieria electrica
../thesis_pdf_all/4017.pdf██████-----------------------------| 42.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4016.pdf██████-----------------------------| 42.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4015.pdf██████-----------------------------| 42.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4014.pdf██████-----------------------------| 43.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4013.pdf██████-----------------------------| 43.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4012.pdf██████-----------------------------| 43.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4011.pdf██████-----------------------------| 43.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4010.pdf███████----------------------------| 44.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4009.pdf███████----------------------------| 44.4% Complete
escuela de

../thesis_pdf_all/4189.pdf████████████████-------------------| 62.3% Complete
escuela de biologia
../thesis_pdf_all/4188.pdf████████████████-------------------| 62.6% Complete
../thesis_pdf_all/4187.pdf████████████████-------------------| 62.8% Complete
escuela de biologia
../thesis_pdf_all/4186.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/4185.pdf████████████████-------------------| 63.3% Complete
escuela de biologia
../thesis_pdf_all/4184.pdf████████████████-------------------| 63.6% Complete
escuela de quimica
../thesis_pdf_all/4183.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4182.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/4210.pdf█████████████████------------------| 64.4% Complete
escuela de biologia
../thesis_pdf_all/4211.pdf█████████████████------------------| 64.6% Complete
escuela de quimica
../t

../thesis_pdf_all/4166.pdf██████████████████████████---------| 83.6% Complete
escuela de comunicacion social
../thesis_pdf_all/4165.pdf██████████████████████████---------| 83.8% Complete
escuela de comunicacion social
../thesis_pdf_all/4163.pdf███████████████████████████--------| 84.1% Complete
escuela de comunicacion social
../thesis_pdf_all/4150.pdf███████████████████████████--------| 84.4% Complete
escuela de comunicacion social
../thesis_pdf_all/4162.pdf███████████████████████████--------| 84.6% Complete
escuela de comunicacion social
../thesis_pdf_all/4161.pdf███████████████████████████--------| 84.9% Complete
escuela de comunicacion social
../thesis_pdf_all/4160.pdf███████████████████████████--------| 85.1% Complete
../thesis_pdf_all/4159.pdf███████████████████████████--------| 85.4% Complete
escuela de comunicacion social
../thesis_pdf_all/4158.pdf███████████████████████████--------| 85.6% Complete
escuela de comunicacion social
../thesis_pdf_all/4157.pdf████████████████████████

In [12]:
df_splited[16].to_csv("./data/splitted/thesis_df_splited16_with_school.csv", index=False)

In [13]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[17])
values_c_17 = check_vec("../"+df_splited[17]["path"],l)
df_splited[17]['school_simple'] = values_c_17.tolist()

../thesis_pdf_all/3852.pdf-----------------------------------| 0.3% Complete
escuela de quimica
../thesis_pdf_all/3851.pdf-----------------------------------| 0.5% Complete
escuela de fisica
../thesis_pdf_all/3850.pdf-----------------------------------| 0.8% Complete
escuela de matematica
../thesis_pdf_all/3849.pdf-----------------------------------| 1.0% Complete
escuela de matematica
../thesis_pdf_all/3848.pdf-----------------------------------| 1.3% Complete
escuela de computacion
../thesis_pdf_all/3847.pdf-----------------------------------| 1.5% Complete
escuela de computacion
../thesis_pdf_all/3846.pdf-----------------------------------| 1.8% Complete
escuela de matematica
../thesis_pdf_all/3845.pdf-----------------------------------| 2.1% Complete
escuela de matematica
../thesis_pdf_all/3844.pdf-----------------------------------| 2.3% Complete
escuela de fisica
../thesis_pdf_all/3843.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3842

../thesis_pdf_all/3951.pdf-----------------------------------| 21.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3950.pdf-----------------------------------| 21.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3949.pdf-----------------------------------| 22.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3948.pdf-----------------------------------| 22.3% Complete
escuela de biologia
../thesis_pdf_all/3947.pdf-----------------------------------| 22.6% Complete
escuela de biologia
../thesis_pdf_all/3946.pdf-----------------------------------| 22.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3945.pdf-----------------------------------| 23.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3944.pdf-----------------------------------| 23.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3943.pdf-----------------------------------| 23.6% Complete
escuela de biologia
../t

escuela de quimica
../thesis_pdf_all/3897.pdf██████-----------------------------| 42.3% Complete
escuela de matematica
../thesis_pdf_all/3899.pdf██████-----------------------------| 42.6% Complete
escuela de fisica
../thesis_pdf_all/3927.pdf██████-----------------------------| 42.8% Complete
escuela de matematica
../thesis_pdf_all/3914.pdf██████-----------------------------| 43.1% Complete
escuela de matematica
../thesis_pdf_all/3926.pdf██████-----------------------------| 43.3% Complete
escuela de quimica
../thesis_pdf_all/3925.pdf██████-----------------------------| 43.6% Complete
escuela de quimica
../thesis_pdf_all/3924.pdf██████-----------------------------| 43.8% Complete
escuela de quimica
../thesis_pdf_all/3923.pdf███████----------------------------| 44.1% Complete
escuela de matematica
../thesis_pdf_all/3922.pdf███████----------------------------| 44.4% Complete
escuela de matematica
../thesis_pdf_all/3921.pdf███████----------------------------| 44.6% Complete
escuela de matem

../thesis_pdf_all/2622.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2621.pdf████████████████-------------------| 63.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2620.pdf████████████████-------------------| 63.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2619.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2618.pdf█████████████████------------------| 64.1% Complete
../thesis_pdf_all/2617.pdf█████████████████------------------| 64.4% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2616.pdf█████████████████------------------| 64.6% Complete
../thesis_pdf_all/2615.pdf█████████████████------------------| 64.9% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2614.pdf█████████████████------------------| 65.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2613.pdf█████████████████------------------| 65.4% Complete
esc

../thesis_pdf_all/2729.pdf██████████████████████████---------| 83.6% Complete
escuela de enfermeria
../thesis_pdf_all/2728.pdf██████████████████████████---------| 83.8% Complete
escuela de computacion
../thesis_pdf_all/2727.pdf███████████████████████████--------| 84.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/2726.pdf███████████████████████████--------| 84.4% Complete
escuela de computacion
../thesis_pdf_all/2725.pdf███████████████████████████--------| 84.6% Complete
escuela de computacion
../thesis_pdf_all/2724.pdf███████████████████████████--------| 84.9% Complete
escuela de computacion
../thesis_pdf_all/2723.pdf███████████████████████████--------| 85.1% Complete
escuela de computacion
../thesis_pdf_all/2722.pdf███████████████████████████--------| 85.4% Complete
../thesis_pdf_all/2720.pdf███████████████████████████--------| 85.6% Complete
escuela de computacion
../thesis_pdf_all/2718.pdf███████████████████████████--------| 85.9% Complete
escuela de computacio

In [14]:
df_splited[17].to_csv("./data/splitted/thesis_df_splited17_with_school.csv", index=False)

In [15]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[18])
values_c_18 = check_vec("../"+df_splited[18]["path"],l)
df_splited[18]['school_simple'] = values_c_18.tolist()

../thesis_pdf_all/2660.pdf-----------------------------------| 0.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2659.pdf-----------------------------------| 0.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2658.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2657.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2656.pdf-----------------------------------| 1.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2655.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2654.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2653.pdf-----------------------------------| 2.1% Complete
../thesis_pdf_all/2652.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/2651.pdf-----------------------------------| 2.6% Complete
../thesis_pdf

escuela de ingenieria mecanica
../thesis_pdf_all/2367.pdf-----------------------------------| 21.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2382.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/2394.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2393.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2392.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2391.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2390.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2389.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2388.pdf-----------------------------------| 23.

escuela de enfermeria
../thesis_pdf_all/2316.pdf█████------------------------------| 41.5% Complete
escuela de enfermeria
../thesis_pdf_all/2315.pdf█████------------------------------| 41.8% Complete
escuela de enfermeria
../thesis_pdf_all/2314.pdf██████-----------------------------| 42.1% Complete
escuela de enfermeria
../thesis_pdf_all/2313.pdf██████-----------------------------| 42.3% Complete
escuela de enfermeria
../thesis_pdf_all/2312.pdf██████-----------------------------| 42.6% Complete
escuela de enfermeria
../thesis_pdf_all/2311.pdf██████-----------------------------| 42.8% Complete
escuela de educacion
../thesis_pdf_all/2310.pdf██████-----------------------------| 43.1% Complete
escuela de enfermeria
../thesis_pdf_all/2309.pdf██████-----------------------------| 43.3% Complete
escuela de enfermeria
../thesis_pdf_all/2308.pdf██████-----------------------------| 43.6% Complete
escuela de enfermeria
../thesis_pdf_all/2395.pdf██████-----------------------------| 43.8% Complete
e

../thesis_pdf_all/2419.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2418.pdf████████████████-------------------| 62.3% Complete
../thesis_pdf_all/2417.pdf████████████████-------------------| 62.6% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/2416.pdf████████████████-------------------| 62.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2415.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/2414.pdf████████████████-------------------| 63.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2413.pdf████████████████-------------------| 63.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2412.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2411.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_al

escuela de biologia
../thesis_pdf_all/3077.pdf██████████████████████████---------| 82.6% Complete
escuela de quimica
../thesis_pdf_all/3076.pdf██████████████████████████---------| 82.8% Complete
escuela de biologia
../thesis_pdf_all/3103.pdf██████████████████████████---------| 83.1% Complete
../thesis_pdf_all/3104.pdf██████████████████████████---------| 83.3% Complete
escuela de biologia
../thesis_pdf_all/3105.pdf██████████████████████████---------| 83.6% Complete
escuela de quimica
../thesis_pdf_all/3120.pdf██████████████████████████---------| 83.8% Complete
escuela de quimica
../thesis_pdf_all/3132.pdf███████████████████████████--------| 84.1% Complete
escuela de biologia
../thesis_pdf_all/3131.pdf███████████████████████████--------| 84.4% Complete
escuela de quimica
../thesis_pdf_all/3130.pdf███████████████████████████--------| 84.6% Complete
escuela de biologia
../thesis_pdf_all/3129.pdf███████████████████████████--------| 84.9% Complete
escuela de biologia
../thesis_pdf_all/3128.p

In [16]:
df_splited[18].to_csv("./data/splitted/thesis_df_splited18_with_school.csv", index=False)

In [17]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[19])
values_c_19 = check_vec("../"+df_splited[19]["path"],l)
df_splited[19]['school_simple'] = values_c_19.tolist()

../thesis_pdf_all/3068.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/3067.pdf-----------------------------------| 0.5% Complete
escuela de biologia
../thesis_pdf_all/3066.pdf-----------------------------------| 0.8% Complete
escuela de biologia
../thesis_pdf_all/3065.pdf-----------------------------------| 1.0% Complete
escuela de biologia
../thesis_pdf_all/3064.pdf-----------------------------------| 1.3% Complete
escuela de biologia
../thesis_pdf_all/3063.pdf-----------------------------------| 1.5% Complete
escuela de biologia
../thesis_pdf_all/3062.pdf-----------------------------------| 1.8% Complete
escuela de biologia
../thesis_pdf_all/3061.pdf-----------------------------------| 2.1% Complete
escuela de quimica
../thesis_pdf_all/3060.pdf-----------------------------------| 2.3% Complete
escuela de biologia
../thesis_pdf_all/3059.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3057.pdf--------------------------

../thesis_pdf_all/3197.pdf-----------------------------------| 22.1% Complete
escuela de quimica
../thesis_pdf_all/3195.pdf-----------------------------------| 22.3% Complete
escuela de quimica
../thesis_pdf_all/3136.pdf-----------------------------------| 22.6% Complete
escuela de quimica
../thesis_pdf_all/3194.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/3163.pdf-----------------------------------| 23.1% Complete
escuela de biologia
../thesis_pdf_all/3162.pdf-----------------------------------| 23.3% Complete
escuela de biologia
../thesis_pdf_all/3161.pdf-----------------------------------| 23.6% Complete
escuela de biologia
../thesis_pdf_all/3160.pdf-----------------------------------| 23.8% Complete
escuela de quimica
../thesis_pdf_all/3159.pdf-----------------------------------| 24.1% Complete
escuela de quimica
../thesis_pdf_all/3158.pdf-----------------------------------| 24.4% Complete
escuela de fisica
../thesis_pdf_all

../thesis_pdf_all/2837.pdf██████-----------------------------| 43.6% Complete
escuela de computacion
../thesis_pdf_all/2836.pdf██████-----------------------------| 43.8% Complete
escuela de computacion
../thesis_pdf_all/2835.pdf███████----------------------------| 44.1% Complete
escuela de computacion
../thesis_pdf_all/2834.pdf███████----------------------------| 44.4% Complete
escuela de computacion
../thesis_pdf_all/2833.pdf███████----------------------------| 44.6% Complete
escuela de computacion
../thesis_pdf_all/2832.pdf███████----------------------------| 44.9% Complete
escuela de computacion
../thesis_pdf_all/2831.pdf███████----------------------------| 45.1% Complete
escuela de computacion
../thesis_pdf_all/2830.pdf███████----------------------------| 45.4% Complete
escuela de quimica
../thesis_pdf_all/2857.pdf███████----------------------------| 45.6% Complete
../thesis_pdf_all/2858.pdf███████----------------------------| 45.9% Complete
escuela de comunicacion social
../thesis

escuela de computacion
../thesis_pdf_all/2815.pdf█████████████████------------------| 64.9% Complete
escuela de biologia
../thesis_pdf_all/2814.pdf█████████████████------------------| 65.1% Complete
escuela de computacion
../thesis_pdf_all/2812.pdf█████████████████------------------| 65.4% Complete
escuela de biologia
../thesis_pdf_all/2799.pdf█████████████████------------------| 65.6% Complete
escuela de computacion
../thesis_pdf_all/2811.pdf█████████████████------------------| 65.9% Complete
escuela de computacion
../thesis_pdf_all/2810.pdf██████████████████-----------------| 66.2% Complete
escuela de computacion
../thesis_pdf_all/2809.pdf██████████████████-----------------| 66.4% Complete
escuela de computacion
../thesis_pdf_all/2808.pdf██████████████████-----------------| 66.7% Complete
escuela de computacion
../thesis_pdf_all/2807.pdf██████████████████-----------------| 66.9% Complete
escuela de biologia
../thesis_pdf_all/2806.pdf██████████████████-----------------| 67.2% Complete

escuela de computacion
../thesis_pdf_all/2915.pdf████████████████████████████-------| 86.2% Complete
escuela de quimica
../thesis_pdf_all/2914.pdf████████████████████████████-------| 86.4% Complete
escuela de computacion
../thesis_pdf_all/2913.pdf████████████████████████████-------| 86.7% Complete
escuela de quimica
../thesis_pdf_all/2912.pdf████████████████████████████-------| 86.9% Complete
escuela de biologia
../thesis_pdf_all/2911.pdf████████████████████████████-------| 87.2% Complete
escuela de biologia
../thesis_pdf_all/2910.pdf████████████████████████████-------| 87.4% Complete
escuela de computacion
../thesis_pdf_all/2909.pdf████████████████████████████-------| 87.7% Complete
escuela de fisica
../thesis_pdf_all/2908.pdf████████████████████████████-------| 87.9% Complete
escuela de computacion
../thesis_pdf_all/2907.pdf█████████████████████████████------| 88.2% Complete
escuela de computacion
../thesis_pdf_all/2906.pdf█████████████████████████████------| 88.5% Complete
escuela d

In [18]:
df_splited[19].to_csv("./data/splitted/thesis_df_splited19_with_school.csv", index=False)

In [19]:
#create a list of values with the school column, the l is the total ammount of thesis to check
# values_c_3 = check_vec("../"+df_splited[3]["path"],l)
# df_splited[3]['school_simple'] = values_c_3.tolist()

In [21]:
import pandas as pd
import numpy as np
import os

## get dataframe with text only thesis
csv_source0 = "./data/splitted/thesis_df_splited0_with_school.csv"
csv_source1 = "./data/splitted/thesis_df_splited1_with_school.csv"
csv_source2 = "./data/splitted/thesis_df_splited2_with_school.csv"
csv_source3 = "./data/splitted/thesis_df_splited3_with_school.csv"
csv_source4 = "./data/splitted/thesis_df_splited4_with_school.csv"
csv_source5 = "./data/splitted/thesis_df_splited5_with_school.csv"
csv_source6 = "./data/splitted/thesis_df_splited6_with_school.csv"
csv_source7 = "./data/splitted/thesis_df_splited7_with_school.csv"
csv_source8 = "./data/splitted/thesis_df_splited8_with_school.csv"
csv_source9 = "./data/splitted/thesis_df_splited9_with_school.csv"
csv_source10 = "./data/splitted/thesis_df_splited10_with_school.csv"
csv_source11 = "./data/splitted/thesis_df_splited11_with_school.csv"
csv_source12 = "./data/splitted/thesis_df_splited12_with_school.csv"
csv_source13 = "./data/splitted/thesis_df_splited13_with_school.csv"
csv_source14 = "./data/splitted/thesis_df_splited14_with_school.csv"
csv_source15 = "./data/splitted/thesis_df_splited15_with_school.csv"
csv_source16 = "./data/splitted/thesis_df_splited16_with_school.csv"
csv_source17 = "./data/splitted/thesis_df_splited17_with_school.csv"
csv_source18 = "./data/splitted/thesis_df_splited18_with_school.csv"
csv_source19 = "./data/splitted/thesis_df_splited19_with_school.csv"

# 
csv_a = []
csv_a.append(pd.read_csv(csv_source0))
csv_a.append(pd.read_csv(csv_source1))
csv_a.append(pd.read_csv(csv_source2))
csv_a.append(pd.read_csv(csv_source3))
csv_a.append(pd.read_csv(csv_source4))
csv_a.append(pd.read_csv(csv_source5))
csv_a.append(pd.read_csv(csv_source6))
csv_a.append(pd.read_csv(csv_source7))
csv_a.append(pd.read_csv(csv_source8))
csv_a.append(pd.read_csv(csv_source9))
csv_a.append(pd.read_csv(csv_source10))
csv_a.append(pd.read_csv(csv_source11))
csv_a.append(pd.read_csv(csv_source12))
csv_a.append(pd.read_csv(csv_source13))
csv_a.append(pd.read_csv(csv_source14))
csv_a.append(pd.read_csv(csv_source15))
csv_a.append(pd.read_csv(csv_source16))
csv_a.append(pd.read_csv(csv_source17))
csv_a.append(pd.read_csv(csv_source18))
csv_a.append(pd.read_csv(csv_source19))



In [22]:
csv_a[1]

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,5423,27-Mar-2017,Aplicación de LIBS en estudio geoquímico de al...,"Cornejo Ordáz, José Javier",4.65 MB,http://saber.ucv.ve/handle/10872/15411,http://saber.ucv.ve/bitstream/10872/15411/1/Ma...,,thesis_pdf_all/5423.pdf,True,True,False,escuela de quimica
1,5422,27-Mar-2017,Desarrollo de un proceso SCT-CPO para la oxida...,"Estrella, Y. Rafael, E.",2.72 MB,http://saber.ucv.ve/handle/10872/15410,http://saber.ucv.ve/bitstream/10872/15410/1/TE...,,thesis_pdf_all/5422.pdf,True,True,False,No School
2,5421,27-Mar-2017,Estudio de formación de complejos de vanadio (...,"Caraballo, Yerimber",1.48 MB,http://saber.ucv.ve/handle/10872/15409,http://saber.ucv.ve/bitstream/10872/15409/3/TE...,,thesis_pdf_all/5421.pdf,True,True,False,escuela de quimica
3,5420,27-Mar-2017,Diseño de una metodología técnica para la eval...,"Esperante C., Isabel C.",2.83 MB,http://saber.ucv.ve/handle/10872/15408,http://saber.ucv.ve/bitstream/10872/15408/1/Mi...,,thesis_pdf_all/5420.pdf,True,True,False,escuela de ingenieria quimica
4,5419,27-Mar-2017,Factibilidad del uso de la técnica multicompon...,"Ramos A., Simón E.",7.26 MB,http://saber.ucv.ve/handle/10872/15406,http://saber.ucv.ve/bitstream/10872/15406/1/TE...,,thesis_pdf_all/5419.pdf,True,True,False,"escuela de geologia, minas y geofisica"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,5844,20-Sep-2017,Identificación de petrofacies y fluidos de for...,"Rincón S., Manuel M.",8.34 MB,http://saber.ucv.ve/handle/10872/16386,http://saber.ucv.ve/bitstream/10872/16386/1/Te...,,thesis_pdf_all/5844.pdf,True,True,False,"escuela de geologia, minas y geofisica"
386,5843,20-Sep-2017,"Diseño, construcción y calibración de equipo p...","Rivadeneyra A., Eduardo M.",4.73 MB,http://saber.ucv.ve/handle/10872/16385,http://saber.ucv.ve/bitstream/10872/16385/1/Tr...,,thesis_pdf_all/5843.pdf,True,True,False,"escuela de geologia, minas y geofisica"
387,5841,20-Sep-2017,Aplicación de radar de penetración de suelos (...,"Sánchez M., Kerly L.",4.79 MB,http://saber.ucv.ve/handle/10872/16383,http://saber.ucv.ve/bitstream/10872/16383/1/Te...,,thesis_pdf_all/5841.pdf,True,True,False,"escuela de geologia, minas y geofisica"
388,5828,19-Sep-2017,Establecimiento de los parámetros mineros-geom...,"Acosta G., Magda C.",4.27 MB,http://saber.ucv.ve/handle/10872/16328,http://saber.ucv.ve/bitstream/10872/16328/1/TE...,,thesis_pdf_all/5828.pdf,True,True,False,"escuela de geologia, minas y geofisica"


In [23]:
b = pd.concat(csv_a,axis=0)
b

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
1,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False,"escuela de geologia, minas y geofisica"
2,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False,escuela de quimica
3,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False,escuela de quimica
4,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False,escuela de quimica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2925,16-Dec-2014,Identificación y clonación del gen que codific...,"Ibarra, Ana V.",3.41 MB,http://saber.ucv.ve/handle/10872/8005,http://saber.ucv.ve/bitstream/10872/8005/1/Tes...,,thesis_pdf_all/2925.pdf,True,True,False,No School
386,2924,16-Dec-2014,Estudio del desarrollo postnatal de la extensi...,"Pernía, Marianny J.",6.14 MB,http://saber.ucv.ve/handle/10872/7995,http://saber.ucv.ve/bitstream/10872/7995/1/Tes...,,thesis_pdf_all/2924.pdf,True,True,False,escuela de biologia
387,2923,16-Dec-2014,Zooplancton en seis cuerpos de agua de la regi...,"Baptista, Esther N.",3.43 MB,http://saber.ucv.ve/handle/10872/8006,http://saber.ucv.ve/bitstream/10872/8006/1/Tes...,,thesis_pdf_all/2923.pdf,True,True,False,escuela de biologia
388,2922,15-Dec-2014,Distribución de la enzima 3α-hidroxiesteroide ...,"Alcalá, Katherine A.",30.67 MB,http://saber.ucv.ve/handle/10872/7989,http://saber.ucv.ve/bitstream/10872/7989/1/Tes...,,thesis_pdf_all/2922.pdf,True,True,False,escuela de biologia


In [24]:
df = b[b['school_simple']!='No School']
df

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
1,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False,"escuela de geologia, minas y geofisica"
2,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False,escuela de quimica
3,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False,escuela de quimica
4,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False,escuela de quimica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,2926,16-Dec-2014,Identificación de proteínas que interaccionan ...,"Navas, Victoria H.",2 MB,http://saber.ucv.ve/handle/10872/8000,http://saber.ucv.ve/bitstream/10872/8000/1/Tes...,,thesis_pdf_all/2926.pdf,True,True,False,escuela de biologia
386,2924,16-Dec-2014,Estudio del desarrollo postnatal de la extensi...,"Pernía, Marianny J.",6.14 MB,http://saber.ucv.ve/handle/10872/7995,http://saber.ucv.ve/bitstream/10872/7995/1/Tes...,,thesis_pdf_all/2924.pdf,True,True,False,escuela de biologia
387,2923,16-Dec-2014,Zooplancton en seis cuerpos de agua de la regi...,"Baptista, Esther N.",3.43 MB,http://saber.ucv.ve/handle/10872/8006,http://saber.ucv.ve/bitstream/10872/8006/1/Tes...,,thesis_pdf_all/2923.pdf,True,True,False,escuela de biologia
388,2922,15-Dec-2014,Distribución de la enzima 3α-hidroxiesteroide ...,"Alcalá, Katherine A.",30.67 MB,http://saber.ucv.ve/handle/10872/7989,http://saber.ucv.ve/bitstream/10872/7989/1/Tes...,,thesis_pdf_all/2922.pdf,True,True,False,escuela de biologia


In [25]:
df = df[df['school_simple']!='None']

In [26]:
df[df['school_simple']=='None']

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple


In [27]:
df

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
1,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False,"escuela de geologia, minas y geofisica"
2,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False,escuela de quimica
3,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False,escuela de quimica
4,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False,escuela de quimica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,2926,16-Dec-2014,Identificación de proteínas que interaccionan ...,"Navas, Victoria H.",2 MB,http://saber.ucv.ve/handle/10872/8000,http://saber.ucv.ve/bitstream/10872/8000/1/Tes...,,thesis_pdf_all/2926.pdf,True,True,False,escuela de biologia
386,2924,16-Dec-2014,Estudio del desarrollo postnatal de la extensi...,"Pernía, Marianny J.",6.14 MB,http://saber.ucv.ve/handle/10872/7995,http://saber.ucv.ve/bitstream/10872/7995/1/Tes...,,thesis_pdf_all/2924.pdf,True,True,False,escuela de biologia
387,2923,16-Dec-2014,Zooplancton en seis cuerpos de agua de la regi...,"Baptista, Esther N.",3.43 MB,http://saber.ucv.ve/handle/10872/8006,http://saber.ucv.ve/bitstream/10872/8006/1/Tes...,,thesis_pdf_all/2923.pdf,True,True,False,escuela de biologia
388,2922,15-Dec-2014,Distribución de la enzima 3α-hidroxiesteroide ...,"Alcalá, Katherine A.",30.67 MB,http://saber.ucv.ve/handle/10872/7989,http://saber.ucv.ve/bitstream/10872/7989/1/Tes...,,thesis_pdf_all/2922.pdf,True,True,False,escuela de biologia


In [28]:
#append to dataframe the column of the matching school and create a csv
#df['school_simple'] = values_c.tolist()
df.to_csv("./data/thesis_7103_with_school.csv", index=False)

In [29]:
## get dataframe with only schools tag thesis
csv_source = "./data/thesis_7103_with_school.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
df = df.sort_values('isScan', ascending=False)

In [30]:
#size of data frame
l = len(df.index)
l

7103

In [31]:
df = df[df['school_simple']!='No School']
df

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
4731,95,20-Jun-2011,Estudio de la influencia de las condiciones de...,"Hernández F., Adamarlys; Jiménez G., Harold",2.46 MB,http://saber.ucv.ve/handle/10872/160,http://saber.ucv.ve/bitstream/10872/160/1/TEG.pdf,,thesis_pdf_all/95.pdf,True,True,False,escuela de ingenieria quimica
4742,22,31-May-2011,Efecto de los rayos gamma (g) sobre el comport...,"Rodríguez G., Marlene C.",1.62 MB,http://saber.ucv.ve/handle/10872/42,http://saber.ucv.ve/bitstream/10872/42/1/Tesis...,,thesis_pdf_all/22.pdf,True,True,False,escuela de ingenieria quimica
4741,24,31-May-2011,Determinación de registros pseudo sónicos a pa...,"Urbina P., Eros R.",11.62 MB,http://saber.ucv.ve/handle/10872/53,http://saber.ucv.ve/bitstream/10872/53/1/TRABA...,,thesis_pdf_all/24.pdf,True,True,False,"escuela de geologia, minas y geofisica"
4740,25,31-May-2011,Propuesta de un sistema de administración de i...,"Sánchez M., José R.",562.96 kB,http://saber.ucv.ve/handle/10872/45,http://saber.ucv.ve/bitstream/10872/45/1/TRABA...,,thesis_pdf_all/25.pdf,True,True,False,escuela de ingenieria mecanica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2364,8190,12-Nov-2020,Magnitud y estructura del abastecimiento exter...,"González V., Jorge C.",1.09 MB,http://saber.ucv.ve/handle/10872/20882,http://saber.ucv.ve/bitstream/10872/20882/1/Ma...,,thesis_pdf_all/8190.pdf,True,True,False,escuela de agronomia
2363,8192,12-Nov-2020,Evaluación del Impacto de Cambios de Cobertura...,"Rojas N., Ladu G.",3.29 MB,http://saber.ucv.ve/handle/10872/20885,http://saber.ucv.ve/bitstream/10872/20885/1/Tr...,,thesis_pdf_all/8192.pdf,True,True,False,escuela de agronomia
2362,8193,18-Nov-2020,Efecto de la sombra sobre la concentración de ...,"Herrera B., Mauro D.",706.17 kB,http://saber.ucv.ve/handle/10872/20888,http://saber.ucv.ve/bitstream/10872/20888/1/Te...,,thesis_pdf_all/8193.pdf,True,True,False,escuela de agronomia
2361,8196,18-Nov-2020,Evaluación del comportamiento ecofisiológico d...,"Alfonzo, Miguel",2.14 MB,http://saber.ucv.ve/handle/10872/20891,http://saber.ucv.ve/bitstream/10872/20891/1/TE...,,thesis_pdf_all/8196.pdf,True,True,False,escuela de agronomia


In [32]:
schools.create_dicts()

In [33]:
schools.escuelas_unaccent_dict

{'escuela de ingenieria civil': 0,
 'escuela de ingenieria electrica': 1,
 'escuela de geologia, minas y geofisica': 2,
 'escuela de ingenieria quimica': 3,
 'escuela de ingenieria de petroleo': 4,
 'escuela de ingenieria mecanica': 5,
 'escuela de ingenieria metalurgica y ciencia de los materiales': 6,
 'escuela de agronomia': 7,
 'escuela de arquitectura': 8,
 'escuela de biologia': 9,
 'escuela de computacion': 10,
 'escuela de fisica': 11,
 'escuela de geoquimica': 12,
 'escuela de matematica': 13,
 'escuela de quimica': 14,
 'escuela de administracion y contaduria': 15,
 'escuela de antropologia': 16,
 'escuela de estadistica y ciencias actuariales': 17,
 'escuela de economia': 18,
 'escuela de estudios internacionales': 19,
 'escuela de sociologia': 20,
 'escuela de trabajo social': 21,
 'escuela de derecho': 22,
 'escuela de estudios politicos y administrativo': 23,
 'escuela de ciencias veterinarias': 24,
 'escuela de farmacia': 25,
 'escuela de artes': 26,
 'escuela de bibliot

In [34]:
# make the correct data frame first then vectorize
check_vec_accents = np.vectorize(schools.set_schools_accents)
schools.create_dicts()

In [35]:
df['school_simple']

0              escuela de ingenieria electrica
4731             escuela de ingenieria quimica
4742             escuela de ingenieria quimica
4741    escuela de geologia, minas y geofisica
4740            escuela de ingenieria mecanica
                         ...                  
2364                      escuela de agronomia
2363                      escuela de agronomia
2362                      escuela de agronomia
2361                      escuela de agronomia
7102                     escuela de sociologia
Name: school_simple, Length: 7103, dtype: object

In [36]:
schools.create_dicts()

In [37]:
values_accent = check_vec_accents(df['school_simple'],l)

Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 

In [38]:
values_accent

array(['Escuela de Ingeniería Eléctrica', 'Escuela de Ingeniería Química',
       'Escuela de Ingeniería Química', ..., 'Escuela de Agronomía',
       'Escuela de Agronomía', 'Escuela de Sociología'], dtype='<U61')

In [39]:
#append to dataframe the column of the matching school and create a csv with the correct name
df['school_complex'] = values_accent.tolist()
df.to_csv("./data/thesis_7103_with_resumen_school_complex.csv", index=False)