In [1]:
import json
import pdfplumber
import spacy
import pandas as pd
import numpy as np
import os
import unidecode
import PyPDF2
from tika import parser
from spacy.matcher import PhraseMatcher
from spacy import displacy
from progress_bar.progress_bar import printProgressBar
import torch
import gc
import multiprocessing
import sys

class School:
    def __init__(self, file_source):
        file = open(file_source, "r")
        file = json.load(file)
        temp_list = []
        for facultad in file:
            temp_list.append(facultad['escuela'])
        #print(facultad['escuela'])
        self.escuelas = [item for sublist in temp_list for item in sublist] # make the list flat
        #have the escuelas with accents in the correct form here
        self.escuelas_accent = self.escuelas
        
        print(self.escuelas)
        self.i = 0
        self.j = 0
        self.k = 0
        self.p = 0
        self.first = True
        
    def create_dictionary(self, schools):
        myDict = dict((e,i) for i,e in enumerate(schools))
        return myDict

    def unaccent_list(self, accent_list):
        unaccented_schools = []
        for sch in accent_list:
            unaccented_schools.append(unidecode.unidecode(sch).lower())
        return unaccented_schools
    
    def set_school_to_unaccent(self):
        self.escuelas = self.unaccent_list(self.escuelas)
        
    def create_dicts(self):
        #create the dicts only when schools are unaccented
        self.escuelas_unaccent_dict = self.create_dictionary(self.escuelas)
        self.escuelas_accent_dict = self.create_dictionary(self.escuelas_accent)
        
    def set_schools_accents(self, row, l):
        self.k+= 1
        printProgressBar(self.k, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        
        index = self.escuelas_unaccent_dict.get(row.lower())
        key_list = list(self.escuelas_accent_dict.keys())
        val_list = list(self.escuelas_accent_dict.values())
        try:
            position = val_list.index(index)
            return key_list[position]
        except:
            return None
        #return the value of the position, example in dict '{..., Escuela de enfermería: 37, ...}' it will return 
        #'Escuela de enfermería'
    
    def clean_spaces_text(self, text):
        new_text = " ".join(text.split())
        return(new_text)
    
    def set_nlp(self, model):
        self.nlp_model = spacy.load(model)
        
    def get_nlp(self):
        retrun 
        
    def set_matcher(self):
        self.matcher = PhraseMatcher(self.nlp_model.vocab, attr="LOWER")
        patterns = [self.nlp_model(name) for name in self.escuelas]
        #print(patterns)
        self.matcher.add("ESC", patterns)
        
    def check_file_tika(self, file_source, l):
        try:
            gc.collect()
            torch.cuda.empty_cache()
            self.p+=1
            if self.first:
                self.first = False
            if self.p >= l:
                self.first = True
                self.p = 0
            printProgressBar(self.p, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            pages_10 = []
            pages_10_l = []
            school_name_of_file = ""
            print(file_source)
            parsed_pdf = parser.from_file(file_source)
            data = parsed_pdf['content']
            if data == None:
                return 'None'
            new_data = self.clean_spaces_text(data)
            #print(new_data)
            #print(new_data[0:90000])
            new_data = unidecode.unidecode(new_data).lower()
            #print("len of data", len(new_data))
            #first 10 pages
            first_data = new_data[:6000]
            #print(new_data)
            doc = self.nlp_model(first_data)
            #print(self.matcher(doc))
            for match_id, start, end in self.matcher(doc):
                print(doc[start:end])
                return(doc[start:end])
            #last 10 pages
            last_data = new_data[:-6000]
            #print(new_data)
            doc = self.nlp_model(last_data)
            #print(self.matcher(doc))
            for match_id, start, end in self.matcher(doc):
                print(doc[start:end])
                return(doc[start:end])
            return 'No School'
        except:
            return 'No School'
        
    def check_file(self, file_source, l):
        try:
            self.i+=1
            if self.first:
                self.first = False
            if self.i >= l:
                self.first = True
                self.i = 0
            printProgressBar(self.i, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
            pages_10 = []
            pages_10_l = []
            school_name_of_file = ""
            with pdfplumber.open(file_source) as pdf:
                for i in range(0,10):
                    print(pdf.pages[i].extract_text())
                    pages_10.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
                for i in reversed(range(len(pdf.pages)-10,len(pdf.pages))):
                    pages_10_l.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
                #first 10 pages
                pages_10_u = self.unaccent_list(pages_10)
                for page in pages_10_u:
                    doc = self.nlp_model(page)
                    if len(self.matcher(doc)) >=1:
                        for match_id, start, end in self.matcher(doc):
                            return(doc[start:end]) #returns at the first instance
                #last 10 pages
                pages_10_l = self.unaccent_list(pages_10_l)
                for page in pages_10_l:
                    doc = self.nlp_model(page)
                    if len(self.matcher(doc)) >=1:
                        for match_id, start, end in self.matcher(doc):
                            print(doc[start:end])
                            return(doc[start:end]) #returns at the first instance
            return "No school"
        except:
            return "No school"
    def create_training_set(self, file_source, l):
        self.j+=1
        printProgressBar(self.j, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        pages_10 = []
        pages_10_l = []
        school_name_of_file = ""
        with pdfplumber.open(file_source) as pdf:
            for i in range(0,10):
                pages_10.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
            for i in reversed(range(len(pdf.pages)-10,len(pdf.pages))):
                pages_10_l.append(self.clean_spaces_text(pdf.pages[i].extract_text()))
        
        #first 10 pages
        pages_10_u = self.unaccent_list(pages_10)
        for page in pages_10_u:
            doc = self.nlp_model(page)
            if len(self.matcher(doc)) >=1:
                for match_id, start, end in self.matcher(doc):
                    aux_training = [page, {"entities": (start, end, 'ESC')}]
                    return aux_training #returns at the first instance
        #last 10 pages
        pages_10_l = self.unaccent_list(pages_10_l)
        for page in pages_10_l:
            doc = self.nlp_model(page)
            if len(self.matcher(doc)) >=1:
                for match_id, start, end in self.matcher(doc):
                    aux_training = [page, {"entities": (start, end, 'ESC')}]
                    return aux_training #returns at the first instance
        return "No school"
    def print_training_set(self):
        print(self.training_set)
        

In [2]:
spacy.prefer_gpu()

True

In [3]:
schools = School("data/escuelas.json")

['Escuela de Ingeniería Civil', 'Escuela de Ingeniería Eléctrica', 'Escuela de Geología, Minas y Geofísica', 'Escuela de Ingeniería Química', 'Escuela de Ingeniería de Petroleo', 'Escuela de Ingeniería Mecanica', 'Escuela de Ingeniería Metalúrgica y Ciencia de los Materiales', 'Escuela de Agronomía', 'Escuela de Arquitectura', 'Escuela de Biología', 'Escuela de Computación', 'Escuela de Física', 'Escuela de Geoquímica', 'Escuela de Matemática', 'Escuela de Química', 'Escuela de Administración y Contaduría', 'Escuela de Antropología', 'Escuela de Estadística y Ciencias Actuariales', 'Escuela de Economía', 'Escuela de Estudios Internacionales', 'Escuela de Sociología', 'Escuela de Trabajo Social', 'Escuela de Derecho', 'Escuela de Estudios Políticos y Administrativo', 'Escuela de Ciencias Veterinarias', 'Escuela de Farmacia', 'Escuela de Artes', 'Escuela de Bibliotecología y Archivología', 'Escuela de Comunicación Social', 'Escuela de Educación', 'Escuela de Filosofía', 'Escuela de Geogr

In [4]:
#delete accents
schools.set_school_to_unaccent()

In [5]:
import pandas as pd
import numpy as np
import os

## get dataframe with text only thesis
csv_source = "data/url_thesis_8211_with_pdf_scan_image.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
df = df.sort_values('isScan', ascending=False)

In [6]:
#load model set matcher for schools
schools.set_nlp('es_core_news_sm')
#schools.set_nlp('es_dep_news_trf')
schools.set_matcher()

In [7]:
#make this fucntion a vectorize so it can run in a data frame
check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [8]:
# EJECUTAR HASTA AQUII!!! BAJA AL SIGUIENTE EN LA LISTA
df_splited = np.array_split(df, 20)

In [9]:
#df_splited = np.array_split(df, 20)

In [10]:
#schools = School("data/escuelas.json")

In [11]:
#delete accents
#schools.set_school_to_unaccent()

In [12]:
# #make this fucntion a vectorize so it can run in a data frame
# check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [None]:
# df_splited = np.array_split(df, 20)

In [None]:
#size of data frame
#

In [None]:
#'escuela de ingenieria metalurgica y ciencia de los materiales' == 'escuela de ingenieria metalurgica y ciencias de los materiales'

In [None]:
schools.check_file_tika('../'+'thesis_pdf_all/7055.pdf',1)

In [None]:
# parsed_pdf = parser.from_file('../'+'thesis_pdf_all/200.pdf')
  
# # saving content of pdf
# # you can also bring text only, by parsed_pdf['text'] 
# # parsed_pdf['content'] returns string 
# data = parsed_pdf['content'] 
# #   
# # Printing of content
# print(type(data))
# #print(data)
# new_data = clean_spaces_text(data)
# #print(new_data[0:90000])
# new_data = unidecode.unidecode(new_data).lower()

In [None]:
# doc = self.nlp_model(new_data)

In [None]:
# with open('../'+'thesis_pdf_all/200.pdf', mode='rb') as f:
#     reader = PyPDF2.PdfFileReader(f)
#     page = reader.getPage(1)
#     print(clean_spaces_text(page.extractText()))

In [None]:
schools.check_file('../'+'thesis_pdf_all/7055.pdf',1)

In [None]:
#make this fucntion a vectorize so it can run in a data frame
check_vec = np.vectorize(schools.check_file_tika, otypes=[np.ndarray])

In [None]:
#Ignore this
#schools.create_training_set("../thesis_pdf/1.pdf", 1)

In [None]:
df_splited = np.array_split(df, 20)

In [None]:
df_splited[0]

In [None]:
# b = pd.concat([df_splited [0],df_splited [1]],axis=0)
# b

In [None]:
# def process_docs(docs, n_processes=None):
#     # Load the model inside the subprocess, 
#     # as that seems to be the main culprit of the memory issues
#     nlp = schools.set_nlp('es_core_news_sm')

#     if not n_processes:
#         n_processes = multiprocessing.cpu_count()

#     processed_docs = [doc for doc in nlp.pipe(docs, disable=['ner', 'parser'], n_process=n_processes)]


#     # Then do what you wish beyond this point. I end up writing results out to s3.
#     pass

In [None]:
# for x in range(10):
#     # This will spin up a subprocess, 
#     # and everytime it finishes it will release all resources back to the machine.
#     with multiprocessing.Manager() as manager:
#         p = multiprocessing.Process(target=process_docs, args=(docs))
#         p.start()
#         p.join()


In [None]:
# #create a list of values with the school column, the l is the total ammount of thesis to check
# l = len(df_splited[0])
# values_c_0 = check_vec("../"+df_splited[0]["path"],l)
# df_splited[0]['school_simple'] = values_c_0.tolist() #done

In [None]:
# type(values_c_0) 

In [None]:
df_splited[0].to_csv("./data/splitted/thesis_df_splited0_with_school.csv", index=False) #done

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[1])
values_c_1 = check_vec("../"+df_splited[1]["path"],l)
df_splited[1]['school_simple'] = values_c_1.tolist() #done

In [None]:
# df_splited[0]['school_simple'] = values_new.tolist()

In [None]:
# type(df_splited[1])

In [None]:
# df_splited[1]['school_simple'] = values_c_1.tolist()

In [None]:
df_splited[1].to_csv("./data/splitted/thesis_df_splited1_with_school.csv", index=False) #done

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[2])
values_c_2 = check_vec("../"+df_splited[2]["path"],l)
df_splited[2]['school_simple'] = values_c_2.tolist() #done

In [None]:
#type(values_c_2)

In [None]:
df_splited[2].to_csv("./data/splitted/thesis_df_splited2_with_school.csv", index=False) #done

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[3])
values_c_3 = check_vec("../"+df_splited[3]["path"],l)
df_splited[3]['school_simple'] = values_c_3.tolist()

In [None]:
df_splited[3].to_csv("./data/splitted/thesis_df_splited3_with_school.csv", index=False)

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[4])
values_c_4 = check_vec("../"+df_splited[4]["path"],l)
df_splited[4]['school_simple'] = values_c_4.tolist() #done

In [None]:
df_splited[4].to_csv("./data/splitted/thesis_df_splited4_with_school.csv", index=False) #done

In [13]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[5])
values_c_5 = check_vec("../"+df_splited[5]["path"],l)
df_splited[5]['school_simple'] = values_c_5.tolist()

../thesis_pdf_all/6229.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/7521.pdf-----------------------------------| 0.5% Complete
escuela de biologia
../thesis_pdf_all/7551.pdf-----------------------------------| 0.8% Complete
escuela de idiomas modernos
../thesis_pdf_all/7550.pdf-----------------------------------| 1.0% Complete
escuela de idiomas modernos
../thesis_pdf_all/7549.pdf-----------------------------------| 1.3% Complete
escuela de idiomas modernos
../thesis_pdf_all/7548.pdf-----------------------------------| 1.5% Complete
escuela de idiomas modernos
../thesis_pdf_all/7547.pdf-----------------------------------| 1.8% Complete
escuela de quimica
../thesis_pdf_all/7546.pdf-----------------------------------| 2.1% Complete
escuela de quimica
../thesis_pdf_all/7545.pdf-----------------------------------| 2.3% Complete
escuela de quimica
../thesis_pdf_all/7544.pdf-----------------------------------| 2.6% Complete
escuela de idiomas modernos
../thesis_pdf

../thesis_pdf_all/7464.pdf-----------------------------------| 22.1% Complete
escuela de biologia
../thesis_pdf_all/7463.pdf-----------------------------------| 22.3% Complete
escuela de biologia
../thesis_pdf_all/7462.pdf-----------------------------------| 22.6% Complete
escuela de biologia
../thesis_pdf_all/7461.pdf-----------------------------------| 22.8% Complete
escuela de quimica
../thesis_pdf_all/7460.pdf-----------------------------------| 23.1% Complete
escuela de biologia
../thesis_pdf_all/7489.pdf-----------------------------------| 23.3% Complete
escuela de quimica
../thesis_pdf_all/7490.pdf-----------------------------------| 23.6% Complete
escuela de quimica
../thesis_pdf_all/7491.pdf-----------------------------------| 23.8% Complete
escuela de quimica
../thesis_pdf_all/7506.pdf-----------------------------------| 24.1% Complete
escuela de computacion
../thesis_pdf_all/7518.pdf-----------------------------------| 24.4% Complete
escuela de computacion
../thesis_pdf_all/

escuela de geografia
../thesis_pdf_all/7701.pdf██████-----------------------------| 42.6% Complete
escuela de geografia
../thesis_pdf_all/7700.pdf██████-----------------------------| 42.8% Complete
escuela de geografia
../thesis_pdf_all/7698.pdf██████-----------------------------| 43.1% Complete
escuela de geografia
../thesis_pdf_all/7685.pdf██████-----------------------------| 43.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/7697.pdf██████-----------------------------| 43.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7696.pdf██████-----------------------------| 43.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7695.pdf███████----------------------------| 44.1% Complete
../thesis_pdf_all/7694.pdf███████----------------------------| 44.4% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/7693.pdf███████----------------------------| 44.6% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/7692.pdf███████---------------

escuela de psicologia
../thesis_pdf_all/7271.pdf████████████████-------------------| 63.1% Complete
escuela de psicologia
../thesis_pdf_all/7299.pdf████████████████-------------------| 63.3% Complete
escuela de psicologia
../thesis_pdf_all/7298.pdf████████████████-------------------| 63.6% Complete
escuela de psicologia
../thesis_pdf_all/7297.pdf████████████████-------------------| 63.8% Complete
escuela de psicologia
../thesis_pdf_all/7296.pdf█████████████████------------------| 64.1% Complete
escuela de psicologia
../thesis_pdf_all/7295.pdf█████████████████------------------| 64.4% Complete
escuela de psicologia
../thesis_pdf_all/7294.pdf█████████████████------------------| 64.6% Complete
escuela de educacion
../thesis_pdf_all/7293.pdf█████████████████------------------| 64.9% Complete
escuela de artes
../thesis_pdf_all/7292.pdf█████████████████------------------| 65.1% Complete
escuela de artes
../thesis_pdf_all/7291.pdf█████████████████------------------| 65.4% Complete
escuela de 

escuela de psicologia
../thesis_pdf_all/7218.pdf███████████████████████████--------| 84.1% Complete
escuela de psicologia
../thesis_pdf_all/7217.pdf███████████████████████████--------| 84.4% Complete
escuela de psicologia
../thesis_pdf_all/7216.pdf███████████████████████████--------| 84.6% Complete
escuela de psicologia
../thesis_pdf_all/7215.pdf███████████████████████████--------| 84.9% Complete
escuela de psicologia
../thesis_pdf_all/7214.pdf███████████████████████████--------| 85.1% Complete
escuela de psicologia
../thesis_pdf_all/7213.pdf███████████████████████████--------| 85.4% Complete
escuela de psicologia
../thesis_pdf_all/7212.pdf███████████████████████████--------| 85.6% Complete
escuela de psicologia
../thesis_pdf_all/7239.pdf███████████████████████████--------| 85.9% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/7240.pdf████████████████████████████-------| 86.2% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/7241.pdf██████████████████

In [14]:
#guarda las escuelas clasificadas en un csv
df_splited[5].to_csv("./data/splitted/thesis_df_splited5_with_school.csv", index=False)

In [15]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[6])
values_c_6 = check_vec("../"+df_splited[6]["path"],l)
df_splited[6]['school_simple'] = values_c_6.tolist()

../thesis_pdf_all/7401.pdf-----------------------------------| 0.3% Complete
escuela de psicologia
../thesis_pdf_all/7400.pdf-----------------------------------| 0.5% Complete
escuela de psicologia
../thesis_pdf_all/7399.pdf-----------------------------------| 0.8% Complete
escuela de psicologia
../thesis_pdf_all/7398.pdf-----------------------------------| 1.0% Complete
escuela de psicologia
../thesis_pdf_all/7397.pdf-----------------------------------| 1.3% Complete
escuela de psicologia
../thesis_pdf_all/7424.pdf-----------------------------------| 1.5% Complete
escuela de computacion
../thesis_pdf_all/7425.pdf-----------------------------------| 1.8% Complete
escuela de computacion
../thesis_pdf_all/7426.pdf-----------------------------------| 2.1% Complete
escuela de computacion
../thesis_pdf_all/7443.pdf-----------------------------------| 2.3% Complete
escuela de biologia
../thesis_pdf_all/7455.pdf-----------------------------------| 2.6% Complete
escuela de biologia
../thesis_p

escuela de psicologia
../thesis_pdf_all/7378.pdf-----------------------------------| 21.8% Complete
../thesis_pdf_all/7377.pdf-----------------------------------| 22.1% Complete
escuela de psicologia
../thesis_pdf_all/7376.pdf-----------------------------------| 22.3% Complete
escuela de psicologia
../thesis_pdf_all/7375.pdf-----------------------------------| 22.6% Complete
escuela de psicologia
../thesis_pdf_all/7374.pdf-----------------------------------| 22.8% Complete
escuela de psicologia
../thesis_pdf_all/7373.pdf-----------------------------------| 23.1% Complete
escuela de psicologia
../thesis_pdf_all/7372.pdf-----------------------------------| 23.3% Complete
escuela de psicologia
../thesis_pdf_all/7371.pdf-----------------------------------| 23.6% Complete
escuela de psicologia
../thesis_pdf_all/7370.pdf-----------------------------------| 23.8% Complete
escuela de psicologia
../thesis_pdf_all/7369.pdf-----------------------------------| 24.1% Complete
escuela de psicologia


escuela de trabajo social
../thesis_pdf_all/7990.pdf██████-----------------------------| 43.1% Complete
escuela de trabajo social
../thesis_pdf_all/7989.pdf██████-----------------------------| 43.3% Complete
escuela de trabajo social
../thesis_pdf_all/7988.pdf██████-----------------------------| 43.6% Complete
escuela de trabajo social
../thesis_pdf_all/7987.pdf██████-----------------------------| 43.8% Complete
escuela de trabajo social
../thesis_pdf_all/7986.pdf███████----------------------------| 44.1% Complete
escuela de biologia
../thesis_pdf_all/7985.pdf███████----------------------------| 44.4% Complete
escuela de biologia
../thesis_pdf_all/7984.pdf███████----------------------------| 44.6% Complete
escuela de biologia
../thesis_pdf_all/7983.pdf███████----------------------------| 44.9% Complete
escuela de biologia
../thesis_pdf_all/7982.pdf███████----------------------------| 45.1% Complete
escuela de biologia
../thesis_pdf_all/7981.pdf███████----------------------------| 45.4%

../thesis_pdf_all/8182.pdf█████████████████------------------| 64.4% Complete
../thesis_pdf_all/8183.pdf█████████████████------------------| 64.6% Complete
../thesis_pdf_all/8198.pdf█████████████████------------------| 64.9% Complete
escuela de artes
../thesis_pdf_all/8210.pdf█████████████████------------------| 65.1% Complete
escuela de sociologia
../thesis_pdf_all/8209.pdf█████████████████------------------| 65.4% Complete
escuela de sociologia
../thesis_pdf_all/8208.pdf█████████████████------------------| 65.6% Complete
escuela de sociologia
../thesis_pdf_all/8207.pdf█████████████████------------------| 65.9% Complete
../thesis_pdf_all/8206.pdf██████████████████-----------------| 66.2% Complete
../thesis_pdf_all/8205.pdf██████████████████-----------------| 66.4% Complete
escuela de agronomia
../thesis_pdf_all/8204.pdf██████████████████-----------------| 66.7% Complete
../thesis_pdf_all/8203.pdf██████████████████-----------------| 66.9% Complete
../thesis_pdf_all/8202.pdf████████████

escuela de ingenieria electrica
../thesis_pdf_all/8125.pdf████████████████████████████-------| 86.9% Complete
escuela de ingenieria electrica
../thesis_pdf_all/8124.pdf████████████████████████████-------| 87.2% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7969.pdf████████████████████████████-------| 87.4% Complete
escuela de trabajo social
../thesis_pdf_all/7968.pdf████████████████████████████-------| 87.7% Complete
escuela de biologia
../thesis_pdf_all/7967.pdf████████████████████████████-------| 87.9% Complete
escuela de biologia
../thesis_pdf_all/7776.pdf█████████████████████████████------| 88.2% Complete
escuela de psicologia
../thesis_pdf_all/7811.pdf█████████████████████████████------| 88.5% Complete
escuela de psicologia
../thesis_pdf_all/7810.pdf█████████████████████████████------| 88.7% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7809.pdf█████████████████████████████------| 89.0% Complete
escuela de psicologia
../thesis_pdf_all/7808.pdf████████████

In [16]:
df_splited[6].to_csv("./data/splitted/thesis_df_splited6_with_school.csv", index=False)

In [17]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[7])
values_c_7 = check_vec("../"+df_splited[7]["path"],l)
df_splited[7]['school_simple'] = values_c_7.tolist()

../thesis_pdf_all/7826.pdf-----------------------------------| 0.3% Complete
escuela de psicologia
../thesis_pdf_all/7825.pdf-----------------------------------| 0.5% Complete
escuela de psicologia
../thesis_pdf_all/7824.pdf-----------------------------------| 0.8% Complete
escuela de psicologia
../thesis_pdf_all/7823.pdf-----------------------------------| 1.0% Complete
escuela de psicologia
../thesis_pdf_all/7822.pdf-----------------------------------| 1.3% Complete
escuela de psicologia
../thesis_pdf_all/7821.pdf-----------------------------------| 1.5% Complete
escuela de psicologia
../thesis_pdf_all/7820.pdf-----------------------------------| 1.8% Complete
escuela de psicologia
../thesis_pdf_all/7819.pdf-----------------------------------| 2.1% Complete
escuela de psicologia
../thesis_pdf_all/7818.pdf-----------------------------------| 2.3% Complete
escuela de psicologia
../thesis_pdf_all/7817.pdf-----------------------------------| 2.6% Complete
escuela de psicologia
../thesis_

escuela de trabajo social
../thesis_pdf_all/7927.pdf-----------------------------------| 21.3% Complete
escuela de trabajo social
../thesis_pdf_all/7926.pdf-----------------------------------| 21.5% Complete
escuela de trabajo social
../thesis_pdf_all/7925.pdf-----------------------------------| 21.8% Complete
escuela de trabajo social
../thesis_pdf_all/7924.pdf-----------------------------------| 22.1% Complete
escuela de trabajo social
../thesis_pdf_all/7923.pdf-----------------------------------| 22.3% Complete
escuela de trabajo social
../thesis_pdf_all/7922.pdf-----------------------------------| 22.6% Complete
escuela de trabajo social
../thesis_pdf_all/7921.pdf-----------------------------------| 22.8% Complete
escuela de trabajo social
../thesis_pdf_all/7920.pdf-----------------------------------| 23.1% Complete
escuela de trabajo social
../thesis_pdf_all/7919.pdf-----------------------------------| 23.3% Complete
escuela de trabajo social
../thesis_pdf_all/7918.pdf------------

../thesis_pdf_all/7872.pdf██████-----------------------------| 42.1% Complete
escuela de trabajo social
../thesis_pdf_all/7874.pdf██████-----------------------------| 42.3% Complete
escuela de trabajo social
../thesis_pdf_all/7903.pdf██████-----------------------------| 42.6% Complete
escuela de trabajo social
../thesis_pdf_all/7889.pdf██████-----------------------------| 42.8% Complete
escuela de trabajo social
../thesis_pdf_all/7902.pdf██████-----------------------------| 43.1% Complete
escuela de trabajo social
../thesis_pdf_all/7901.pdf██████-----------------------------| 43.3% Complete
escuela de trabajo social
../thesis_pdf_all/7900.pdf██████-----------------------------| 43.6% Complete
escuela de trabajo social
../thesis_pdf_all/7899.pdf██████-----------------------------| 43.8% Complete
escuela de trabajo social
../thesis_pdf_all/7898.pdf███████----------------------------| 44.1% Complete
../thesis_pdf_all/7897.pdf███████----------------------------| 44.4% Complete
escuela de c

../thesis_pdf_all/6579.pdf████████████████-------------------| 62.8% Complete
escuela de computacion
../thesis_pdf_all/6578.pdf████████████████-------------------| 63.1% Complete
escuela de computacion
../thesis_pdf_all/6577.pdf████████████████-------------------| 63.3% Complete
escuela de computacion
../thesis_pdf_all/6576.pdf████████████████-------------------| 63.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6575.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6574.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6573.pdf█████████████████------------------| 64.4% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6572.pdf█████████████████------------------| 64.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6571.pdf█████████████████------------------| 64.9% Complete
escuela de computacion
../thesis_pdf_all/6570.pdf█████████████████---

escuela de geografia
../thesis_pdf_all/6686.pdf██████████████████████████---------| 82.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/6685.pdf██████████████████████████---------| 82.6% Complete
escuela de educacion
../thesis_pdf_all/6684.pdf██████████████████████████---------| 82.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6683.pdf██████████████████████████---------| 83.1% Complete
escuela de educacion
../thesis_pdf_all/6682.pdf██████████████████████████---------| 83.3% Complete
escuela de educacion
../thesis_pdf_all/6681.pdf██████████████████████████---------| 83.6% Complete
escuela de educacion
../thesis_pdf_all/6680.pdf██████████████████████████---------| 83.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/6679.pdf███████████████████████████--------| 84.1% Complete
escuela de educacion
../thesis_pdf_all/6678.pdf███████████████████████████--------| 84.4% Complete
escuela de educacion
../thesis_pdf_all/6677.pdf███████████████████████████

In [18]:
df_splited[7].to_csv("./data/splitted/thesis_df_splited7_with_school.csv", index=False)

In [19]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[8])
values_c_8 = check_vec("../"+df_splited[8]["path"],l)
df_splited[8]['school_simple'] = values_c_8.tolist()

../thesis_pdf_all/6616.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/6615.pdf-----------------------------------| 0.5% Complete
escuela de enfermeria
../thesis_pdf_all/6614.pdf-----------------------------------| 0.8% Complete
escuela de enfermeria
../thesis_pdf_all/6613.pdf-----------------------------------| 1.0% Complete
escuela de enfermeria
../thesis_pdf_all/6612.pdf-----------------------------------| 1.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6611.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6610.pdf-----------------------------------| 1.8% Complete
escuela de enfermeria
../thesis_pdf_all/6609.pdf-----------------------------------| 2.1% Complete
escuela de enfermeria
../thesis_pdf_all/6608.pdf-----------------------------------| 2.3% Complete
escuela de enfermeria
../thesis_pdf_all/6607.pdf-----------------------------------| 2.6% Complete
escuela de enfermeria
../thesis_pdf_

escuela de ingenieria quimica
../thesis_pdf_all/6320.pdf-----------------------------------| 20.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/6321.pdf-----------------------------------| 20.8% Complete
../thesis_pdf_all/6337.pdf-----------------------------------| 21.0% Complete
escuela de educacion
../thesis_pdf_all/6350.pdf-----------------------------------| 21.3% Complete
../thesis_pdf_all/6349.pdf-----------------------------------| 21.5% Complete
escuela de comunicacion social
../thesis_pdf_all/6347.pdf-----------------------------------| 21.8% Complete
escuela de educacion
../thesis_pdf_all/6346.pdf-----------------------------------| 22.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6345.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6344.pdf-----------------------------------| 22.6% Complete
../thesis_pdf_all/6343.pdf-----------------------------------| 22.8% Complete
escuel

escuela de educacion
../thesis_pdf_all/6273.pdf█████------------------------------| 40.5% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6272.pdf█████------------------------------| 40.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6271.pdf█████------------------------------| 41.0% Complete
escuela de educacion
../thesis_pdf_all/6270.pdf█████------------------------------| 41.3% Complete
escuela de educacion
../thesis_pdf_all/6269.pdf█████------------------------------| 41.5% Complete
escuela de educacion
../thesis_pdf_all/6268.pdf█████------------------------------| 41.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6267.pdf██████-----------------------------| 42.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/6266.pdf██████-----------------------------| 42.3% Complete
escuela de educacion
../thesis_pdf_all/6265.pdf██████-----------------------------| 42.6% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6264.pdf██████--

escuela de ingenieria quimica
../thesis_pdf_all/6412.pdf███████████████--------------------| 60.0% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6381.pdf███████████████--------------------| 60.3% Complete
escuela de educacion
../thesis_pdf_all/6380.pdf███████████████--------------------| 60.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6379.pdf███████████████--------------------| 60.8% Complete
escuela de educacion
../thesis_pdf_all/6378.pdf███████████████--------------------| 61.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6377.pdf███████████████--------------------| 61.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6376.pdf███████████████--------------------| 61.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/6375.pdf███████████████--------------------| 61.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/6374.pdf████████████████-------------------| 62.1% Complet

../thesis_pdf_all/7041.pdf████████████████████████-----------| 79.2% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7040.pdf████████████████████████-----------| 79.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7039.pdf████████████████████████-----------| 79.7% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7038.pdf█████████████████████████----------| 80.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7037.pdf█████████████████████████----------| 80.3% Complete
escuela de educacion
../thesis_pdf_all/7036.pdf█████████████████████████----------| 80.5% Complete
escuela de educacion
../thesis_pdf_all/7035.pdf█████████████████████████----------| 80.8% Complete
escuela de artes
../thesis_pdf_all/7034.pdf█████████████████████████----------| 81.0% Complete
escuela de artes
../thesis_pdf_all/7033.pdf█████████████████████████----------| 81.3% Complete
escuela de artes
../thesis_pdf_all/7032.pdf███████████

escuela de ingenieria electrica
../thesis_pdf_all/7023.pdf██████████████████████████████████-| 99.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7022.pdf██████████████████████████████████-| 99.7% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7021.pdf-----------------------------------| 0.0% Complete
escuela de bibliotecologia y archivologia


In [20]:
df_splited[8].to_csv("./data/splitted/thesis_df_splited8_with_school.csv", index=False)

In [21]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[9])
values_c_9 = check_vec("../"+df_splited[9]["path"],l)
df_splited[9]['school_simple'] = values_c_9.tolist()

../thesis_pdf_all/7020.pdf-----------------------------------| 0.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7019.pdf-----------------------------------| 0.5% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7018.pdf-----------------------------------| 0.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7017.pdf-----------------------------------| 1.0% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7016.pdf-----------------------------------| 1.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7015.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/7014.pdf-----------------------------------| 1.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7013.pdf-----------------------------------| 2.1% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/7012.pdf--------------------------------

escuela de psicologia
../thesis_pdf_all/7186.pdf-----------------------------------| 20.3% Complete
escuela de psicologia
../thesis_pdf_all/7185.pdf-----------------------------------| 20.5% Complete
escuela de psicologia
../thesis_pdf_all/7184.pdf-----------------------------------| 20.8% Complete
escuela de psicologia
../thesis_pdf_all/7183.pdf-----------------------------------| 21.0% Complete
escuela de trabajo social
../thesis_pdf_all/7182.pdf-----------------------------------| 21.3% Complete
escuela de psicologia
../thesis_pdf_all/7181.pdf-----------------------------------| 21.5% Complete
escuela de psicologia
../thesis_pdf_all/7149.pdf-----------------------------------| 21.8% Complete
escuela de psicologia
../thesis_pdf_all/7147.pdf-----------------------------------| 22.1% Complete
escuela de psicologia
../thesis_pdf_all/7088.pdf-----------------------------------| 22.3% Complete
escuela de psicologia
../thesis_pdf_all/7146.pdf-----------------------------------| 22.6% Compl

escuela de educacion
../thesis_pdf_all/6800.pdf█████------------------------------| 41.0% Complete
escuela de educacion
../thesis_pdf_all/6799.pdf█████------------------------------| 41.3% Complete
escuela de educacion
../thesis_pdf_all/6798.pdf█████------------------------------| 41.5% Complete
escuela de educacion
../thesis_pdf_all/6797.pdf█████------------------------------| 41.8% Complete
escuela de educacion
../thesis_pdf_all/6796.pdf██████-----------------------------| 42.1% Complete
escuela de educacion
../thesis_pdf_all/6795.pdf██████-----------------------------| 42.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6794.pdf██████-----------------------------| 42.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6793.pdf██████-----------------------------| 42.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6792.pdf██████-----------------------------| 43.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6791.pdf██████----------------

escuela de educacion
../thesis_pdf_all/6779.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6778.pdf████████████████-------------------| 62.3% Complete
escuela de educacion
../thesis_pdf_all/6777.pdf████████████████-------------------| 62.6% Complete
escuela de educacion
../thesis_pdf_all/6776.pdf████████████████-------------------| 62.8% Complete
escuela de educacion
../thesis_pdf_all/6775.pdf████████████████-------------------| 63.1% Complete
escuela de educacion
../thesis_pdf_all/6774.pdf████████████████-------------------| 63.3% Complete
escuela de educacion
../thesis_pdf_all/6773.pdf████████████████-------------------| 63.6% Complete
escuela de educacion
../thesis_pdf_all/6772.pdf████████████████-------------------| 63.8% Complete
escuela de educacion
../thesis_pdf_all/6771.pdf█████████████████------------------| 64.1% Complete
escuela de educacion
../thesis_pdf_all/6770.pdf█████████████████------------------| 64.4% Complete


escuela de ingenieria mecanica
../thesis_pdf_all/6943.pdf██████████████████████████---------| 82.3% Complete
../thesis_pdf_all/6942.pdf██████████████████████████---------| 82.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6941.pdf██████████████████████████---------| 82.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6940.pdf██████████████████████████---------| 83.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6939.pdf██████████████████████████---------| 83.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6938.pdf██████████████████████████---------| 83.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6937.pdf██████████████████████████---------| 83.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6936.pdf███████████████████████████--------| 84.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6904.pdf███████████████████████████--------| 84.4% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/6

In [22]:
df_splited[9].to_csv("./data/splitted/thesis_df_splited9_with_school.csv", index=False)

In [23]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[10])
values_c_10 = check_vec("../"+df_splited[10]["path"],l)
df_splited[10]['school_simple'] = values_c_10.tolist()

../thesis_pdf_all/2.pdf--------------------------------------| 0.3% Complete
escuela de economia
../thesis_pdf_all/1597.pdf-----------------------------------| 0.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1626.pdf-----------------------------------| 0.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1625.pdf-----------------------------------| 1.0% Complete
escuela de comunicacion social
../thesis_pdf_all/1624.pdf-----------------------------------| 1.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1623.pdf-----------------------------------| 1.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1622.pdf-----------------------------------| 1.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1621.pdf-----------------------------------| 2.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1620.pdf-----------------------------------| 2.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1619.pdf-------------

escuela de comunicacion social
../thesis_pdf_all/1549.pdf-----------------------------------| 20.3% Complete
escuela de enfermeria
../thesis_pdf_all/1548.pdf-----------------------------------| 20.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1547.pdf-----------------------------------| 20.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1546.pdf-----------------------------------| 21.0% Complete
escuela de comunicacion social
../thesis_pdf_all/1545.pdf-----------------------------------| 21.3% Complete
../thesis_pdf_all/1544.pdf-----------------------------------| 21.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1543.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1542.pdf-----------------------------------| 22.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1541.pdf-----------------------------------| 22.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1540.pdf--

escuela de ingenieria mecanica
../thesis_pdf_all/1779.pdf█████------------------------------| 40.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1778.pdf█████------------------------------| 40.3% Complete
escuela de enfermeria
../thesis_pdf_all/1777.pdf█████------------------------------| 40.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1776.pdf█████------------------------------| 40.8% Complete
escuela de enfermeria
../thesis_pdf_all/1775.pdf█████------------------------------| 41.0% Complete
escuela de enfermeria
../thesis_pdf_all/1774.pdf█████------------------------------| 41.3% Complete
escuela de enfermeria
../thesis_pdf_all/1773.pdf█████------------------------------| 41.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1772.pdf█████------------------------------| 41.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1771.pdf██████-----------------------------| 42.1% Complete
escuela de enfermeria
../thesis_pdf_all/1770.p

escuela de ingenieria mecanica
../thesis_pdf_all/1700.pdf███████████████--------------------| 60.0% Complete
escuela de educacion
../thesis_pdf_all/1699.pdf███████████████--------------------| 60.3% Complete
escuela de educacion
../thesis_pdf_all/1698.pdf███████████████--------------------| 60.5% Complete
escuela de educacion
../thesis_pdf_all/1697.pdf███████████████--------------------| 60.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1696.pdf███████████████--------------------| 61.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1695.pdf███████████████--------------------| 61.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1694.pdf███████████████--------------------| 61.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1693.pdf███████████████--------------------| 61.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1692.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria de petroleo


escuela de geologia, minas y geofisica
../thesis_pdf_all/1317.pdf████████████████████████-----------| 79.5% Complete
../thesis_pdf_all/1316.pdf████████████████████████-----------| 79.7% Complete
escuela de ingenieria civil
../thesis_pdf_all/1315.pdf█████████████████████████----------| 80.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1314.pdf█████████████████████████----------| 80.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1313.pdf█████████████████████████----------| 80.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1312.pdf█████████████████████████----------| 80.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1311.pdf█████████████████████████----------| 81.0% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/1310.pdf█████████████████████████----------| 81.3% Complete
escuela de ingenieria quimica
../thesis_pdf_all/1309.pdf█████████████████████████----------| 81.5% Complete
escuela de ingenieria de petro

../thesis_pdf_all/1483.pdf██████████████████████████████████-| 99.7% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1482.pdf-----------------------------------| 0.0% Complete


In [24]:
df_splited[10].to_csv("./data/splitted/thesis_df_splited10_with_school.csv", index=False)

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[11])
values_c_11 = check_vec("../"+df_splited[11]["path"],l)
df_splited[11]['school_simple'] = values_c_11.tolist()

In [None]:
df_splited[11].to_csv("./data/splitted/thesis_df_splited11_with_school.csv", index=False)

In [25]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[12])
values_c_12 = check_vec("../"+df_splited[12]["path"],l)
df_splited[12]['school_simple'] = values_c_12.tolist()

../thesis_pdf_all/1886.pdf-----------------------------------| 0.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/1885.pdf-----------------------------------| 0.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1884.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/1883.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1882.pdf-----------------------------------| 1.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1881.pdf-----------------------------------| 1.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/1880.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1879.pdf-----------------------------------| 2.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/1878.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/1877.pdf------------------------------

../thesis_pdf_all/1989.pdf-----------------------------------| 21.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1988.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1987.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1986.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1985.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1984.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1983.pdf-----------------------------------| 22.6% Complete
escuela de enfermeria
../thesis_pdf_all/1982.pdf-----------------------------------| 22.8% Complete
escuela de enfermeria
../thesis_pdf_all/1981.pdf-----------------------------------| 23.1% Complete
escuela de ingenieria quimica
../thesis_pdf_all/19

escuela de ingenieria de petroleo
../thesis_pdf_all/1910.pdf█████------------------------------| 41.0% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1909.pdf█████------------------------------| 41.3% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1908.pdf█████------------------------------| 41.5% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1907.pdf█████------------------------------| 41.8% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/1906.pdf██████-----------------------------| 42.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/1933.pdf██████-----------------------------| 42.3% Complete
escuela de artes
../thesis_pdf_all/1934.pdf██████-----------------------------| 42.6% Complete
escuela de educacion
../thesis_pdf_all/1935.pdf██████-----------------------------| 42.8% Complete
escuela de artes
../thesis_pdf_all/1950.pdf██████-----------------------------| 43.1% Complete
../thesis_pdf_all/1963.pdf██████---------

../thesis_pdf_all/528.pdf████████████████--------------------| 61.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/527.pdf████████████████--------------------| 61.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/526.pdf████████████████--------------------| 61.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/524.pdf█████████████████-------------------| 62.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/511.pdf█████████████████-------------------| 62.3% Complete
../thesis_pdf_all/523.pdf█████████████████-------------------| 62.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/522.pdf█████████████████-------------------| 62.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/521.pdf█████████████████-------------------| 63.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/520.pdf█████████████████-------------------| 63.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/519.pdf█████████████████

escuela de ingenieria quimica
../thesis_pdf_all/599.pdf██████████████████████████----------| 81.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/636.pdf██████████████████████████----------| 81.5% Complete
escuela de comunicacion social
../thesis_pdf_all/295.pdf██████████████████████████----------| 81.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/704.pdf███████████████████████████---------| 82.1% Complete
escuela de enfermeria
../thesis_pdf_all/748.pdf███████████████████████████---------| 82.3% Complete
escuela de comunicacion social
../thesis_pdf_all/743.pdf███████████████████████████---------| 82.6% Complete
escuela de comunicacion social
../thesis_pdf_all/742.pdf███████████████████████████---------| 82.8% Complete
escuela de comunicacion social
../thesis_pdf_all/741.pdf███████████████████████████---------| 83.1% Complete
escuela de comunicacion social
../thesis_pdf_all/740.pdf███████████████████████████---------| 83.3% Complete
escuela de comunicacion 

In [26]:
df_splited[12].to_csv("./data/splitted/thesis_df_splited12_with_school.csv", index=False)

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[13])
values_c_13 = check_vec("../"+df_splited[13]["path"],l)
df_splited[13]['school_simple'] = values_c_13.tolist()

In [None]:
df_splited[13].to_csv("./data/splitted/thesis_df_splited13_with_school.csv", index=False)

In [27]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[14])
values_c_14 = check_vec("../"+df_splited[14]["path"],l)
df_splited[14]['school_simple'] = values_c_14.tolist()

../thesis_pdf_all/1099.pdf-----------------------------------| 0.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1098.pdf-----------------------------------| 0.5% Complete
../thesis_pdf_all/1097.pdf-----------------------------------| 0.8% Complete
escuela de artes
../thesis_pdf_all/1096.pdf-----------------------------------| 1.0% Complete
escuela de artes
../thesis_pdf_all/1095.pdf-----------------------------------| 1.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1094.pdf-----------------------------------| 1.5% Complete
../thesis_pdf_all/1093.pdf-----------------------------------| 1.8% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1092.pdf-----------------------------------| 2.1% Complete
escuela de artes
../thesis_pdf_all/1091.pdf-----------------------------------| 2.3% Complete
escuela de bibliotecologia y archivologia
../thesis_pdf_all/1090.pdf-----------------------------------| 2.6% Complete
escuela de 

escuela de ingenieria de petroleo
../thesis_pdf_all/1267.pdf-----------------------------------| 20.8% Complete
escuela de enfermeria
../thesis_pdf_all/1266.pdf-----------------------------------| 21.0% Complete
escuela de enfermeria
../thesis_pdf_all/1265.pdf-----------------------------------| 21.3% Complete
escuela de enfermeria
../thesis_pdf_all/1264.pdf-----------------------------------| 21.5% Complete
escuela de enfermeria
../thesis_pdf_all/1263.pdf-----------------------------------| 21.8% Complete
escuela de enfermeria
../thesis_pdf_all/1229.pdf-----------------------------------| 22.1% Complete
escuela de enfermeria
../thesis_pdf_all/1227.pdf-----------------------------------| 22.3% Complete
escuela de enfermeria
../thesis_pdf_all/1167.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/1226.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all

../thesis_pdf_all/877.pdf██████------------------------------| 41.0% Complete
../thesis_pdf_all/876.pdf██████------------------------------| 41.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/875.pdf██████------------------------------| 41.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/874.pdf██████------------------------------| 41.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/873.pdf███████-----------------------------| 42.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/872.pdf███████-----------------------------| 42.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/871.pdf███████-----------------------------| 42.6% Complete
../thesis_pdf_all/870.pdf███████-----------------------------| 42.8% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/869.pdf███████-----------------------------| 43.1% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_

../thesis_pdf_all/827.pdf████████████████--------------------| 61.5% Complete
escuela de ingenieria electrica
../thesis_pdf_all/828.pdf████████████████--------------------| 61.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/843.pdf█████████████████-------------------| 62.1% Complete
escuela de enfermeria
../thesis_pdf_all/855.pdf█████████████████-------------------| 62.3% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/854.pdf█████████████████-------------------| 62.6% Complete
escuela de enfermeria
../thesis_pdf_all/853.pdf█████████████████-------------------| 62.8% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/852.pdf█████████████████-------------------| 63.1% Complete
escuela de enfermeria
../thesis_pdf_all/851.pdf█████████████████-------------------| 63.3% Complete
escuela de enfermeria
../thesis_pdf_all/850.pdf█████████████████-------------------| 63.6% Complete
escuela de ingenieria 

escuela de comunicacion social
../thesis_pdf_all/1028.pdf█████████████████████████----------| 80.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1026.pdf█████████████████████████----------| 81.0% Complete
escuela de comunicacion social
../thesis_pdf_all/1013.pdf█████████████████████████----------| 81.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1025.pdf█████████████████████████----------| 81.5% Complete
escuela de comunicacion social
../thesis_pdf_all/1024.pdf█████████████████████████----------| 81.8% Complete
../thesis_pdf_all/1023.pdf██████████████████████████---------| 82.1% Complete
escuela de comunicacion social
../thesis_pdf_all/1022.pdf██████████████████████████---------| 82.3% Complete
escuela de comunicacion social
../thesis_pdf_all/1021.pdf██████████████████████████---------| 82.6% Complete
escuela de comunicacion social
../thesis_pdf_all/1020.pdf██████████████████████████---------| 82.8% Complete
escuela de comunicacion social
../thesis_pdf_all/1

escuela de educacion


In [28]:
df_splited[14].to_csv("./data/splitted/thesis_df_splited14_with_school.csv", index=False)

In [29]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[15])
values_c_15 = check_vec("../"+df_splited[15]["path"],l)
df_splited[15]['school_simple'] = values_c_15.tolist()

../thesis_pdf_all/2273.pdf-----------------------------------| 0.3% Complete
escuela de educacion
../thesis_pdf_all/2274.pdf-----------------------------------| 0.5% Complete
escuela de educacion
../thesis_pdf_all/3564.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3592.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria civil
../thesis_pdf_all/3591.pdf-----------------------------------| 1.3% Complete
../thesis_pdf_all/3590.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/3589.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3588.pdf-----------------------------------| 2.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3587.pdf-----------------------------------| 2.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3586.pdf-----------------------------------| 2.6% Complete
escuela de ingenieri

escuela de ingenieria civil
../thesis_pdf_all/3516.pdf-----------------------------------| 20.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/3515.pdf-----------------------------------| 20.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3514.pdf-----------------------------------| 21.0% Complete
escuela de ingenieria civil
../thesis_pdf_all/3513.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3512.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria civil
../thesis_pdf_all/3511.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3510.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3509.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3508.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/

../thesis_pdf_all/3739.pdf█████------------------------------| 41.5% Complete
escuela de quimica
../thesis_pdf_all/3738.pdf█████------------------------------| 41.8% Complete
escuela de quimica
../thesis_pdf_all/3737.pdf██████-----------------------------| 42.1% Complete
escuela de quimica
../thesis_pdf_all/3736.pdf██████-----------------------------| 42.3% Complete
escuela de quimica
../thesis_pdf_all/3735.pdf██████-----------------------------| 42.6% Complete
../thesis_pdf_all/3734.pdf██████-----------------------------| 42.8% Complete
escuela de quimica
../thesis_pdf_all/3733.pdf██████-----------------------------| 43.1% Complete
escuela de quimica
../thesis_pdf_all/3731.pdf██████-----------------------------| 43.3% Complete
escuela de quimica
../thesis_pdf_all/3718.pdf██████-----------------------------| 43.6% Complete
../thesis_pdf_all/3730.pdf██████-----------------------------| 43.8% Complete
../thesis_pdf_all/3729.pdf███████----------------------------| 44.1% Complete
../thesis

../thesis_pdf_all/3347.pdf████████████████-------------------| 63.8% Complete
escuela de enfermeria
../thesis_pdf_all/3346.pdf█████████████████------------------| 64.1% Complete
escuela de artes
../thesis_pdf_all/3345.pdf█████████████████------------------| 64.4% Complete
../thesis_pdf_all/3344.pdf█████████████████------------------| 64.6% Complete
escuela de comunicacion social
../thesis_pdf_all/3343.pdf█████████████████------------------| 64.9% Complete
escuela de comunicacion social
../thesis_pdf_all/3342.pdf█████████████████------------------| 65.1% Complete
escuela de artes
../thesis_pdf_all/3341.pdf█████████████████------------------| 65.4% Complete
escuela de artes
../thesis_pdf_all/3340.pdf█████████████████------------------| 65.6% Complete
escuela de artes
../thesis_pdf_all/3339.pdf█████████████████------------------| 65.9% Complete
escuela de artes
../thesis_pdf_all/3338.pdf██████████████████-----------------| 66.2% Complete
escuela de artes
../thesis_pdf_all/3337.pdf████████

../thesis_pdf_all/3262.pdf███████████████████████████--------| 85.1% Complete
escuela de biologia
../thesis_pdf_all/3261.pdf███████████████████████████--------| 85.4% Complete
escuela de biologia
../thesis_pdf_all/3260.pdf███████████████████████████--------| 85.6% Complete
escuela de biologia
../thesis_pdf_all/3259.pdf███████████████████████████--------| 85.9% Complete
escuela de biologia
../thesis_pdf_all/3288.pdf████████████████████████████-------| 86.2% Complete
escuela de quimica
../thesis_pdf_all/3289.pdf████████████████████████████-------| 86.4% Complete
escuela de quimica
../thesis_pdf_all/3290.pdf████████████████████████████-------| 86.7% Complete
escuela de fisica
../thesis_pdf_all/3305.pdf████████████████████████████-------| 86.9% Complete
../thesis_pdf_all/3317.pdf████████████████████████████-------| 87.2% Complete
escuela de quimica
../thesis_pdf_all/3316.pdf████████████████████████████-------| 87.4% Complete
escuela de fisica
../thesis_pdf_all/3315.pdf█████████████████████

In [30]:
df_splited[15].to_csv("./data/splitted/thesis_df_splited15_with_school.csv", index=False)

In [31]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[16])
values_c_16 = check_vec("../"+df_splited[16]["path"],l)
df_splited[16]['school_simple'] = values_c_16.tolist()

../thesis_pdf_all/3449.pdf-----------------------------------| 0.3% Complete
escuela de quimica
../thesis_pdf_all/3448.pdf-----------------------------------| 0.5% Complete
escuela de fisica
../thesis_pdf_all/3447.pdf-----------------------------------| 0.8% Complete
escuela de fisica
../thesis_pdf_all/3446.pdf-----------------------------------| 1.0% Complete
escuela de comunicacion social
../thesis_pdf_all/3445.pdf-----------------------------------| 1.3% Complete
escuela de comunicacion social
../thesis_pdf_all/3444.pdf-----------------------------------| 1.5% Complete
../thesis_pdf_all/3471.pdf-----------------------------------| 1.8% Complete
escuela de fisica
../thesis_pdf_all/3472.pdf-----------------------------------| 2.1% Complete
escuela de computacion
../thesis_pdf_all/3473.pdf-----------------------------------| 2.3% Complete
escuela de quimica
../thesis_pdf_all/3488.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3500.pdf--------

escuela de fisica
../thesis_pdf_all/3426.pdf-----------------------------------| 21.5% Complete
escuela de quimica
../thesis_pdf_all/3413.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria civil
../thesis_pdf_all/3425.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3424.pdf-----------------------------------| 22.3% Complete
escuela de fisica
../thesis_pdf_all/3423.pdf-----------------------------------| 22.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/3422.pdf-----------------------------------| 22.8% Complete
../thesis_pdf_all/3421.pdf-----------------------------------| 23.1% Complete
escuela de ingenieria civil
../thesis_pdf_all/3420.pdf-----------------------------------| 23.3% Complete
escuela de ingenieria civil
../thesis_pdf_all/3419.pdf-----------------------------------| 23.6% Complete
escuela de ingenieria civil
../thesis_pdf_all/3418.pdf-----------------------------------| 23.8% Comple

escuela de ingenieria electrica
../thesis_pdf_all/4018.pdf██████-----------------------------| 42.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4017.pdf██████-----------------------------| 42.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4016.pdf██████-----------------------------| 42.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4015.pdf██████-----------------------------| 42.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4014.pdf██████-----------------------------| 43.1% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4013.pdf██████-----------------------------| 43.3% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4012.pdf██████-----------------------------| 43.6% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4011.pdf██████-----------------------------| 43.8% Complete
escuela de ingenieria electrica
../thesis_pdf_all/4010.pdf███████----------------------------| 44.1% Complete
escuela de

../thesis_pdf_all/4190.pdf████████████████-------------------| 62.1% Complete
escuela de biologia
../thesis_pdf_all/4189.pdf████████████████-------------------| 62.3% Complete
escuela de biologia
../thesis_pdf_all/4188.pdf████████████████-------------------| 62.6% Complete
../thesis_pdf_all/4187.pdf████████████████-------------------| 62.8% Complete
escuela de biologia
../thesis_pdf_all/4186.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/4185.pdf████████████████-------------------| 63.3% Complete
escuela de biologia
../thesis_pdf_all/4184.pdf████████████████-------------------| 63.6% Complete
escuela de quimica
../thesis_pdf_all/4183.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/4182.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria de petroleo
../thesis_pdf_all/4210.pdf█████████████████------------------| 64.4% Complete
escuela de biologia
../

../thesis_pdf_all/4167.pdf██████████████████████████---------| 83.3% Complete
../thesis_pdf_all/4166.pdf██████████████████████████---------| 83.6% Complete
escuela de comunicacion social
../thesis_pdf_all/4165.pdf██████████████████████████---------| 83.8% Complete
escuela de comunicacion social
../thesis_pdf_all/4163.pdf███████████████████████████--------| 84.1% Complete
escuela de comunicacion social
../thesis_pdf_all/4150.pdf███████████████████████████--------| 84.4% Complete
escuela de comunicacion social
../thesis_pdf_all/4162.pdf███████████████████████████--------| 84.6% Complete
escuela de comunicacion social
../thesis_pdf_all/4161.pdf███████████████████████████--------| 84.9% Complete
escuela de comunicacion social
../thesis_pdf_all/4160.pdf███████████████████████████--------| 85.1% Complete
../thesis_pdf_all/4159.pdf███████████████████████████--------| 85.4% Complete
escuela de comunicacion social
../thesis_pdf_all/4158.pdf███████████████████████████--------| 85.6% Complete
esc

In [32]:
df_splited[16].to_csv("./data/splitted/thesis_df_splited16_with_school.csv", index=False)

In [33]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[17])
values_c_17 = check_vec("../"+df_splited[17]["path"],l)
df_splited[17]['school_simple'] = values_c_17.tolist()

../thesis_pdf_all/3852.pdf-----------------------------------| 0.3% Complete
escuela de quimica
../thesis_pdf_all/3851.pdf-----------------------------------| 0.5% Complete
escuela de fisica
../thesis_pdf_all/3850.pdf-----------------------------------| 0.8% Complete
escuela de matematica
../thesis_pdf_all/3849.pdf-----------------------------------| 1.0% Complete
escuela de matematica
../thesis_pdf_all/3848.pdf-----------------------------------| 1.3% Complete
escuela de computacion
../thesis_pdf_all/3847.pdf-----------------------------------| 1.5% Complete
escuela de computacion
../thesis_pdf_all/3846.pdf-----------------------------------| 1.8% Complete
escuela de matematica
../thesis_pdf_all/3845.pdf-----------------------------------| 2.1% Complete
escuela de matematica
../thesis_pdf_all/3844.pdf-----------------------------------| 2.3% Complete
escuela de fisica
../thesis_pdf_all/3843.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3842

escuela de geologia, minas y geofisica
../thesis_pdf_all/3951.pdf-----------------------------------| 21.5% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3950.pdf-----------------------------------| 21.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3949.pdf-----------------------------------| 22.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3948.pdf-----------------------------------| 22.3% Complete
escuela de biologia
../thesis_pdf_all/3947.pdf-----------------------------------| 22.6% Complete
escuela de biologia
../thesis_pdf_all/3946.pdf-----------------------------------| 22.8% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3945.pdf-----------------------------------| 23.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3944.pdf-----------------------------------| 23.3% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/3943.pdf-----------------------------------| 

../thesis_pdf_all/3871.pdf██████-----------------------------| 42.1% Complete
escuela de quimica
../thesis_pdf_all/3897.pdf██████-----------------------------| 42.3% Complete
escuela de matematica
../thesis_pdf_all/3899.pdf██████-----------------------------| 42.6% Complete
escuela de fisica
../thesis_pdf_all/3927.pdf██████-----------------------------| 42.8% Complete
escuela de matematica
../thesis_pdf_all/3914.pdf██████-----------------------------| 43.1% Complete
escuela de matematica
../thesis_pdf_all/3926.pdf██████-----------------------------| 43.3% Complete
escuela de quimica
../thesis_pdf_all/3925.pdf██████-----------------------------| 43.6% Complete
escuela de quimica
../thesis_pdf_all/3924.pdf██████-----------------------------| 43.8% Complete
escuela de quimica
../thesis_pdf_all/3923.pdf███████----------------------------| 44.1% Complete
escuela de matematica
../thesis_pdf_all/3922.pdf███████----------------------------| 44.4% Complete
escuela de matematica
../thesis_pdf_al

escuela de ingenieria mecanica
../thesis_pdf_all/2624.pdf████████████████-------------------| 62.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2623.pdf████████████████-------------------| 62.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2622.pdf████████████████-------------------| 63.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2621.pdf████████████████-------------------| 63.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2620.pdf████████████████-------------------| 63.6% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2619.pdf████████████████-------------------| 63.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2618.pdf█████████████████------------------| 64.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2617.pdf█████████████████------------------| 64.4% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2616.pdf█████████████████------------------| 64.6% Complete
../thesis_pdf_all/2

escuela de enfermeria
../thesis_pdf_all/2733.pdf██████████████████████████---------| 82.6% Complete
escuela de enfermeria
../thesis_pdf_all/2732.pdf██████████████████████████---------| 82.8% Complete
escuela de enfermeria
../thesis_pdf_all/2731.pdf██████████████████████████---------| 83.1% Complete
escuela de enfermeria
../thesis_pdf_all/2730.pdf██████████████████████████---------| 83.3% Complete
escuela de enfermeria
../thesis_pdf_all/2729.pdf██████████████████████████---------| 83.6% Complete
escuela de enfermeria
../thesis_pdf_all/2728.pdf██████████████████████████---------| 83.8% Complete
escuela de computacion
../thesis_pdf_all/2727.pdf███████████████████████████--------| 84.1% Complete
escuela de geologia, minas y geofisica
../thesis_pdf_all/2726.pdf███████████████████████████--------| 84.4% Complete
escuela de computacion
../thesis_pdf_all/2725.pdf███████████████████████████--------| 84.6% Complete
escuela de computacion
../thesis_pdf_all/2724.pdf███████████████████████████-----

In [34]:
df_splited[17].to_csv("./data/splitted/thesis_df_splited17_with_school.csv", index=False)

In [35]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[18])
values_c_18 = check_vec("../"+df_splited[18]["path"],l)
df_splited[18]['school_simple'] = values_c_18.tolist()

../thesis_pdf_all/2660.pdf-----------------------------------| 0.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2659.pdf-----------------------------------| 0.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2658.pdf-----------------------------------| 0.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2657.pdf-----------------------------------| 1.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2656.pdf-----------------------------------| 1.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2655.pdf-----------------------------------| 1.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2654.pdf-----------------------------------| 1.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2653.pdf-----------------------------------| 2.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2652.pdf-----------------------------------| 2.3% Complete
../thesis_pdf_all/2651.pdf---------------------------------

../thesis_pdf_all/2365.pdf-----------------------------------| 20.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2366.pdf-----------------------------------| 20.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2367.pdf-----------------------------------| 21.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2382.pdf-----------------------------------| 21.3% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/2394.pdf-----------------------------------| 21.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2393.pdf-----------------------------------| 21.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2392.pdf-----------------------------------| 22.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2391.pdf-----------------------------------| 22.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2390.pdf-----------------------------------| 22.6% Complete
escuela de ingenier

../thesis_pdf_all/2307.pdf█████------------------------------| 40.5% Complete
escuela de enfermeria
../thesis_pdf_all/2319.pdf█████------------------------------| 40.8% Complete
escuela de enfermeria
../thesis_pdf_all/2318.pdf█████------------------------------| 41.0% Complete
escuela de enfermeria
../thesis_pdf_all/2317.pdf█████------------------------------| 41.3% Complete
escuela de enfermeria
../thesis_pdf_all/2316.pdf█████------------------------------| 41.5% Complete
escuela de enfermeria
../thesis_pdf_all/2315.pdf█████------------------------------| 41.8% Complete
escuela de enfermeria
../thesis_pdf_all/2314.pdf██████-----------------------------| 42.1% Complete
escuela de enfermeria
../thesis_pdf_all/2313.pdf██████-----------------------------| 42.3% Complete
escuela de enfermeria
../thesis_pdf_all/2312.pdf██████-----------------------------| 42.6% Complete
escuela de enfermeria
../thesis_pdf_all/2311.pdf██████-----------------------------| 42.8% Complete
escuela de educacion
.

escuela de ingenieria mecanica
../thesis_pdf_all/2424.pdf███████████████--------------------| 60.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2423.pdf███████████████--------------------| 61.0% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2422.pdf███████████████--------------------| 61.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2421.pdf███████████████--------------------| 61.5% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2420.pdf███████████████--------------------| 61.8% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2419.pdf████████████████-------------------| 62.1% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2418.pdf████████████████-------------------| 62.3% Complete
escuela de ingenieria mecanica
../thesis_pdf_all/2417.pdf████████████████-------------------| 62.6% Complete
escuela de ingenieria metalurgica y ciencia de los materiales
../thesis_pdf_all/2416.pdf████████████████-------------------| 62.

../thesis_pdf_all/3085.pdf█████████████████████████----------| 80.5% Complete
escuela de computacion
../thesis_pdf_all/3084.pdf█████████████████████████----------| 80.8% Complete
escuela de quimica
../thesis_pdf_all/3083.pdf█████████████████████████----------| 81.0% Complete
../thesis_pdf_all/3082.pdf█████████████████████████----------| 81.3% Complete
escuela de quimica
../thesis_pdf_all/3081.pdf█████████████████████████----------| 81.5% Complete
escuela de biologia
../thesis_pdf_all/3080.pdf█████████████████████████----------| 81.8% Complete
escuela de quimica
../thesis_pdf_all/3079.pdf██████████████████████████---------| 82.1% Complete
escuela de biologia
../thesis_pdf_all/3078.pdf██████████████████████████---------| 82.3% Complete
escuela de biologia
../thesis_pdf_all/3077.pdf██████████████████████████---------| 82.6% Complete
escuela de quimica
../thesis_pdf_all/3076.pdf██████████████████████████---------| 82.8% Complete
escuela de biologia
../thesis_pdf_all/3103.pdf███████████████

In [36]:
df_splited[18].to_csv("./data/splitted/thesis_df_splited18_with_school.csv", index=False)

In [37]:
#create a list of values with the school column, the l is the total ammount of thesis to check
l = len(df_splited[19])
values_c_19 = check_vec("../"+df_splited[19]["path"],l)
df_splited[19]['school_simple'] = values_c_19.tolist()

../thesis_pdf_all/3068.pdf-----------------------------------| 0.3% Complete
../thesis_pdf_all/3067.pdf-----------------------------------| 0.5% Complete
escuela de biologia
../thesis_pdf_all/3066.pdf-----------------------------------| 0.8% Complete
escuela de biologia
../thesis_pdf_all/3065.pdf-----------------------------------| 1.0% Complete
escuela de biologia
../thesis_pdf_all/3064.pdf-----------------------------------| 1.3% Complete
escuela de biologia
../thesis_pdf_all/3063.pdf-----------------------------------| 1.5% Complete
escuela de biologia
../thesis_pdf_all/3062.pdf-----------------------------------| 1.8% Complete
escuela de biologia
../thesis_pdf_all/3061.pdf-----------------------------------| 2.1% Complete
escuela de quimica
../thesis_pdf_all/3060.pdf-----------------------------------| 2.3% Complete
escuela de biologia
../thesis_pdf_all/3059.pdf-----------------------------------| 2.6% Complete
escuela de quimica
../thesis_pdf_all/3057.pdf--------------------------

../thesis_pdf_all/3197.pdf-----------------------------------| 22.1% Complete
escuela de quimica
../thesis_pdf_all/3195.pdf-----------------------------------| 22.3% Complete
escuela de quimica
../thesis_pdf_all/3136.pdf-----------------------------------| 22.6% Complete
escuela de quimica
../thesis_pdf_all/3194.pdf-----------------------------------| 22.8% Complete
escuela de ingenieria quimica
../thesis_pdf_all/3163.pdf-----------------------------------| 23.1% Complete
escuela de biologia
../thesis_pdf_all/3162.pdf-----------------------------------| 23.3% Complete
escuela de biologia
../thesis_pdf_all/3161.pdf-----------------------------------| 23.6% Complete
escuela de biologia
../thesis_pdf_all/3160.pdf-----------------------------------| 23.8% Complete
escuela de quimica
../thesis_pdf_all/3159.pdf-----------------------------------| 24.1% Complete
escuela de quimica
../thesis_pdf_all/3158.pdf-----------------------------------| 24.4% Complete
escuela de fisica
../thesis_pdf_all

escuela de fisica
../thesis_pdf_all/2838.pdf██████-----------------------------| 43.3% Complete
escuela de computacion
../thesis_pdf_all/2837.pdf██████-----------------------------| 43.6% Complete
escuela de computacion
../thesis_pdf_all/2836.pdf██████-----------------------------| 43.8% Complete
escuela de computacion
../thesis_pdf_all/2835.pdf███████----------------------------| 44.1% Complete
escuela de computacion
../thesis_pdf_all/2834.pdf███████----------------------------| 44.4% Complete
escuela de computacion
../thesis_pdf_all/2833.pdf███████----------------------------| 44.6% Complete
escuela de computacion
../thesis_pdf_all/2832.pdf███████----------------------------| 44.9% Complete
escuela de computacion
../thesis_pdf_all/2831.pdf███████----------------------------| 45.1% Complete
escuela de computacion
../thesis_pdf_all/2830.pdf███████----------------------------| 45.4% Complete
escuela de quimica
../thesis_pdf_all/2857.pdf███████----------------------------| 45.6% Complete

../thesis_pdf_all/2816.pdf█████████████████------------------| 64.6% Complete
escuela de computacion
../thesis_pdf_all/2815.pdf█████████████████------------------| 64.9% Complete
escuela de biologia
../thesis_pdf_all/2814.pdf█████████████████------------------| 65.1% Complete
escuela de computacion
../thesis_pdf_all/2812.pdf█████████████████------------------| 65.4% Complete
escuela de biologia
../thesis_pdf_all/2799.pdf█████████████████------------------| 65.6% Complete
escuela de computacion
../thesis_pdf_all/2811.pdf█████████████████------------------| 65.9% Complete
escuela de computacion
../thesis_pdf_all/2810.pdf██████████████████-----------------| 66.2% Complete
escuela de computacion
../thesis_pdf_all/2809.pdf██████████████████-----------------| 66.4% Complete
escuela de computacion
../thesis_pdf_all/2808.pdf██████████████████-----------------| 66.7% Complete
escuela de computacion
../thesis_pdf_all/2807.pdf██████████████████-----------------| 66.9% Complete
escuela de biologia

escuela de quimica
../thesis_pdf_all/2917.pdf███████████████████████████--------| 85.6% Complete
escuela de quimica
../thesis_pdf_all/2916.pdf███████████████████████████--------| 85.9% Complete
escuela de computacion
../thesis_pdf_all/2915.pdf████████████████████████████-------| 86.2% Complete
escuela de quimica
../thesis_pdf_all/2914.pdf████████████████████████████-------| 86.4% Complete
escuela de computacion
../thesis_pdf_all/2913.pdf████████████████████████████-------| 86.7% Complete
escuela de quimica
../thesis_pdf_all/2912.pdf████████████████████████████-------| 86.9% Complete
escuela de biologia
../thesis_pdf_all/2911.pdf████████████████████████████-------| 87.2% Complete
escuela de biologia
../thesis_pdf_all/2910.pdf████████████████████████████-------| 87.4% Complete
escuela de computacion
../thesis_pdf_all/2909.pdf████████████████████████████-------| 87.7% Complete
escuela de fisica
../thesis_pdf_all/2908.pdf████████████████████████████-------| 87.9% Complete
escuela de comput

In [38]:
df_splited[19].to_csv("./data/splitted/thesis_df_splited19_with_school.csv", index=False)

In [None]:
#create a list of values with the school column, the l is the total ammount of thesis to check
# values_c_3 = check_vec("../"+df_splited[3]["path"],l)
# df_splited[3]['school_simple'] = values_c_3.tolist()

In [54]:
import pandas as pd
import numpy as np
import os

## get dataframe with text only thesis
csv_source0 = "./data/splitted/thesis_df_splited0_with_school.csv"
csv_source1 = "./data/splitted/thesis_df_splited1_with_school.csv"
csv_source2 = "./data/splitted/thesis_df_splited2_with_school.csv"
csv_source3 = "./data/splitted/thesis_df_splited3_with_school.csv"
csv_source4 = "./data/splitted/thesis_df_splited4_with_school.csv"
csv_source5 = "./data/splitted/thesis_df_splited5_with_school.csv"
csv_source6 = "./data/splitted/thesis_df_splited6_with_school.csv"
csv_source7 = "./data/splitted/thesis_df_splited7_with_school.csv"
csv_source8 = "./data/splitted/thesis_df_splited8_with_school.csv"
csv_source9 = "./data/splitted/thesis_df_splited9_with_school.csv"
csv_source10 = "./data/splitted/thesis_df_splited10_with_school.csv"
csv_source11 = "./data/splitted/thesis_df_splited11_with_school.csv"
csv_source12 = "./data/splitted/thesis_df_splited12_with_school.csv"
csv_source13 = "./data/splitted/thesis_df_splited13_with_school.csv"
csv_source14 = "./data/splitted/thesis_df_splited14_with_school.csv"
csv_source15 = "./data/splitted/thesis_df_splited15_with_school.csv"
csv_source16 = "./data/splitted/thesis_df_splited16_with_school.csv"
csv_source17 = "./data/splitted/thesis_df_splited17_with_school.csv"
csv_source18 = "./data/splitted/thesis_df_splited18_with_school.csv"
csv_source19 = "./data/splitted/thesis_df_splited19_with_school.csv"

# 
csv_a = []
csv_a.append(pd.read_csv(csv_source0))
csv_a.append(pd.read_csv(csv_source1))
csv_a.append(pd.read_csv(csv_source2))
csv_a.append(pd.read_csv(csv_source3))
csv_a.append(pd.read_csv(csv_source4))
csv_a.append(pd.read_csv(csv_source5))
csv_a.append(pd.read_csv(csv_source6))
csv_a.append(pd.read_csv(csv_source7))
csv_a.append(pd.read_csv(csv_source8))
csv_a.append(pd.read_csv(csv_source9))
csv_a.append(pd.read_csv(csv_source10))
csv_a.append(pd.read_csv(csv_source11))
csv_a.append(pd.read_csv(csv_source12))
csv_a.append(pd.read_csv(csv_source13))
csv_a.append(pd.read_csv(csv_source14))
csv_a.append(pd.read_csv(csv_source15))
csv_a.append(pd.read_csv(csv_source16))
csv_a.append(pd.read_csv(csv_source17))
csv_a.append(pd.read_csv(csv_source18))
csv_a.append(pd.read_csv(csv_source19))



In [55]:
csv_a[1]

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,5422,27-Mar-2017,Desarrollo de un proceso SCT-CPO para la oxida...,"Estrella, Y. Rafael, E.",2.72 MB,http://saber.ucv.ve/handle/10872/15410,http://saber.ucv.ve/bitstream/10872/15410/1/TE...,,thesis_pdf_all/5422.pdf,True,True,False,No School
1,5421,27-Mar-2017,Estudio de formación de complejos de vanadio (...,"Caraballo, Yerimber",1.48 MB,http://saber.ucv.ve/handle/10872/15409,http://saber.ucv.ve/bitstream/10872/15409/3/TE...,,thesis_pdf_all/5421.pdf,True,True,False,escuela de quimica
2,5420,27-Mar-2017,Diseño de una metodología técnica para la eval...,"Esperante C., Isabel C.",2.83 MB,http://saber.ucv.ve/handle/10872/15408,http://saber.ucv.ve/bitstream/10872/15408/1/Mi...,,thesis_pdf_all/5420.pdf,True,True,False,escuela de ingenieria quimica
3,5419,27-Mar-2017,Factibilidad del uso de la técnica multicompon...,"Ramos A., Simón E.",7.26 MB,http://saber.ucv.ve/handle/10872/15406,http://saber.ucv.ve/bitstream/10872/15406/1/TE...,,thesis_pdf_all/5419.pdf,True,True,False,"escuela de geologia, minas y geofisica"
4,5446,28-Mar-2017,Diagnóstico de las condiciones físicas de los ...,"Nava A., Javier J.",45.7 MB,http://saber.ucv.ve/handle/10872/15483,http://saber.ucv.ve/bitstream/10872/15483/1/Tr...,,thesis_pdf_all/5446.pdf,True,True,False,"escuela de geologia, minas y geofisica"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,5843,20-Sep-2017,"Diseño, construcción y calibración de equipo p...","Rivadeneyra A., Eduardo M.",4.73 MB,http://saber.ucv.ve/handle/10872/16385,http://saber.ucv.ve/bitstream/10872/16385/1/Tr...,,thesis_pdf_all/5843.pdf,True,True,False,"escuela de geologia, minas y geofisica"
386,5841,20-Sep-2017,Aplicación de radar de penetración de suelos (...,"Sánchez M., Kerly L.",4.79 MB,http://saber.ucv.ve/handle/10872/16383,http://saber.ucv.ve/bitstream/10872/16383/1/Te...,,thesis_pdf_all/5841.pdf,True,True,False,"escuela de geologia, minas y geofisica"
387,5828,19-Sep-2017,Establecimiento de los parámetros mineros-geom...,"Acosta G., Magda C.",4.27 MB,http://saber.ucv.ve/handle/10872/16328,http://saber.ucv.ve/bitstream/10872/16328/1/TE...,,thesis_pdf_all/5828.pdf,True,True,False,"escuela de geologia, minas y geofisica"
388,5840,20-Sep-2017,Estudio de factibilidad sísmica para la constr...,"Suniaga S., José F.",8.44 MB,http://saber.ucv.ve/handle/10872/16381,http://saber.ucv.ve/bitstream/10872/16381/1/Te...,,thesis_pdf_all/5840.pdf,True,True,False,No School


In [57]:
b = pd.concat(csv_a,axis=0)
b

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
1,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False,"escuela de geologia, minas y geofisica"
2,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False,escuela de quimica
3,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False,escuela de quimica
4,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False,escuela de quimica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2925,16-Dec-2014,Identificación y clonación del gen que codific...,"Ibarra, Ana V.",3.41 MB,http://saber.ucv.ve/handle/10872/8005,http://saber.ucv.ve/bitstream/10872/8005/1/Tes...,,thesis_pdf_all/2925.pdf,True,True,False,No School
386,2924,16-Dec-2014,Estudio del desarrollo postnatal de la extensi...,"Pernía, Marianny J.",6.14 MB,http://saber.ucv.ve/handle/10872/7995,http://saber.ucv.ve/bitstream/10872/7995/1/Tes...,,thesis_pdf_all/2924.pdf,True,True,False,escuela de biologia
387,2923,16-Dec-2014,Zooplancton en seis cuerpos de agua de la regi...,"Baptista, Esther N.",3.43 MB,http://saber.ucv.ve/handle/10872/8006,http://saber.ucv.ve/bitstream/10872/8006/1/Tes...,,thesis_pdf_all/2923.pdf,True,True,False,escuela de biologia
388,2922,15-Dec-2014,Distribución de la enzima 3α-hidroxiesteroide ...,"Alcalá, Katherine A.",30.67 MB,http://saber.ucv.ve/handle/10872/7989,http://saber.ucv.ve/bitstream/10872/7989/1/Tes...,,thesis_pdf_all/2922.pdf,True,True,False,escuela de biologia


In [60]:
df = b[b['school_simple']!='No School']
df

Unnamed: 0,index,thesis_year,thesis_title,thesis_author,size,thesis_link,pdf_link,resumen,path,exist,isPDF,isScan,school_simple
0,1,2009,Estudio de sobretensiones por maniobra debido ...,"De Gregorio S., Luis T.",6.77 MB,http://saber.ucv.ve/handle/10872/17712,http://saber.ucv.ve/bitstream/10872/17712/1/TE...,,thesis_pdf_all/1.pdf,True,True,False,escuela de ingenieria electrica
1,5539,29-Mar-2017,"Diseño de manejo de aguas de mina, en el PIT b...","Van Zanten V., Johan O.",6.7 MB,http://saber.ucv.ve/handle/10872/15542,http://saber.ucv.ve/bitstream/10872/15542/1/TE...,,thesis_pdf_all/5539.pdf,True,True,False,"escuela de geologia, minas y geofisica"
2,5573,30-Mar-2017,Identificación y cuantificación de hidrocarbur...,"Hergueta Possamai, Verónica Yrama",3.63 MB,http://saber.ucv.ve/handle/10872/15578,http://saber.ucv.ve/bitstream/10872/15578/1/TE...,,thesis_pdf_all/5573.pdf,True,True,False,escuela de quimica
3,5572,30-Mar-2017,Evaluación de catalizadores soportados de Pd-M...,"Tovar Flores, Manuel Enrique",2.91 MB,http://saber.ucv.ve/handle/10872/15579,http://saber.ucv.ve/bitstream/10872/15579/1/EV...,,thesis_pdf_all/5572.pdf,True,True,False,escuela de quimica
4,5571,30-Mar-2017,Estudio de la solubilidad para sistemas binari...,"Rampersad Avilán, Jeniree G.",1.55 MB,http://saber.ucv.ve/handle/10872/15580,http://saber.ucv.ve/bitstream/10872/15580/1/te...,,thesis_pdf_all/5571.pdf,True,True,False,escuela de quimica
...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,2926,16-Dec-2014,Identificación de proteínas que interaccionan ...,"Navas, Victoria H.",2 MB,http://saber.ucv.ve/handle/10872/8000,http://saber.ucv.ve/bitstream/10872/8000/1/Tes...,,thesis_pdf_all/2926.pdf,True,True,False,escuela de biologia
386,2924,16-Dec-2014,Estudio del desarrollo postnatal de la extensi...,"Pernía, Marianny J.",6.14 MB,http://saber.ucv.ve/handle/10872/7995,http://saber.ucv.ve/bitstream/10872/7995/1/Tes...,,thesis_pdf_all/2924.pdf,True,True,False,escuela de biologia
387,2923,16-Dec-2014,Zooplancton en seis cuerpos de agua de la regi...,"Baptista, Esther N.",3.43 MB,http://saber.ucv.ve/handle/10872/8006,http://saber.ucv.ve/bitstream/10872/8006/1/Tes...,,thesis_pdf_all/2923.pdf,True,True,False,escuela de biologia
388,2922,15-Dec-2014,Distribución de la enzima 3α-hidroxiesteroide ...,"Alcalá, Katherine A.",30.67 MB,http://saber.ucv.ve/handle/10872/7989,http://saber.ucv.ve/bitstream/10872/7989/1/Tes...,,thesis_pdf_all/2922.pdf,True,True,False,escuela de biologia


In [None]:
df

In [None]:
#append to dataframe the column of the matching school and create a csv
df['school_simple'] = values_c.tolist()
df.to_csv("./data/thesis_7801_with_school.csv", index=False)

In [None]:
## get dataframe with only schools tag thesis
csv_source = "./data/thesis_7801_with_school.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
df = df.sort_values('isScan', ascending=False)

In [66]:
#size of data frame
l = len(df.index)
l

7165

In [None]:
df = df[df['school_simple']=='No School']
df

In [61]:
schools.create_dicts()

In [62]:
schools.escuelas_unaccent_dict

{'escuela de ingenieria civil': 0,
 'escuela de ingenieria electrica': 1,
 'escuela de geologia, minas y geofisica': 2,
 'escuela de ingenieria quimica': 3,
 'escuela de ingenieria de petroleo': 4,
 'escuela de ingenieria mecanica': 5,
 'escuela de ingenieria metalurgica y ciencia de los materiales': 6,
 'escuela de agronomia': 7,
 'escuela de arquitectura': 8,
 'escuela de biologia': 9,
 'escuela de computacion': 10,
 'escuela de fisica': 11,
 'escuela de geoquimica': 12,
 'escuela de matematica': 13,
 'escuela de quimica': 14,
 'escuela de administracion y contaduria': 15,
 'escuela de antropologia': 16,
 'escuela de estadistica y ciencias actuariales': 17,
 'escuela de economia': 18,
 'escuela de estudios internacionales': 19,
 'escuela de sociologia': 20,
 'escuela de trabajo social': 21,
 'escuela de derecho': 22,
 'escuela de estudios politicos y administrativo': 23,
 'escuela de ciencias veterinarias': 24,
 'escuela de farmacia': 25,
 'escuela de artes': 26,
 'escuela de bibliot

In [63]:
# make the correct data frame first then vectorize
check_vec_accents = np.vectorize(schools.set_schools_accents)
schools.create_dicts()

In [64]:
df['school_simple']

0             escuela de ingenieria electrica
1      escuela de geologia, minas y geofisica
2                          escuela de quimica
3                          escuela de quimica
4                          escuela de quimica
                        ...                  
384                       escuela de biologia
386                       escuela de biologia
387                       escuela de biologia
388                       escuela de biologia
389                     escuela de sociologia
Name: school_simple, Length: 7165, dtype: object

In [65]:
schools.create_dicts()

In [67]:
values_accent = check_vec_accents(df['school_simple'],l)

Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 

In [68]:
values_accent

array(['Escuela de Ingeniería Eléctrica',
       'Escuela de Geología, Minas y Geofísica', 'Escuela de Química',
       ..., 'Escuela de Biología', 'Escuela de Biología',
       'Escuela de Sociología'], dtype='<U61')

In [69]:
#append to dataframe the column of the matching school and create a csv with the correct name
df['school_complex'] = values_accent.tolist()
df.to_csv("./data/thesis_7165_with_resumen_school_complex.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['school_complex'] = values_accent.tolist()
