In [1]:
import json
import pdfplumber
import spacy
import pandas as pd
import numpy as np
import os
import unidecode
from spacy.matcher import PhraseMatcher
from spacy import displacy
from progress_bar.progress_bar import printProgressBar

class School:
    """Locate school (escuela) names inside thesis PDFs.

    The list of school names is read from a JSON file whose top level is a
    list of faculty objects, each carrying an ``'escuela'`` list.  After a
    spaCy pipeline has been loaded with :meth:`set_nlp` and a phrase matcher
    has been built with :meth:`set_matcher`, :meth:`check_file` and
    :meth:`create_training_set` scan the first and last ``PAGE_WINDOW`` pages
    of a PDF for any of those names.
    """

    # Number of pages inspected at each end of a document.
    PAGE_WINDOW = 10
    # Match key / entity label used for every school name.
    LABEL = "ESC"

    # Filled in by set_nlp() and set_matcher(); None until those are called.
    nlp_model = None
    matcher = None

    def __init__(self, file_source):
        """Load and flatten the school names stored in *file_source*.

        Args:
            file_source: Path to a JSON file containing a list of objects,
                each with an ``'escuela'`` key holding a list of names.
        """
        # JSON is UTF-8 by specification; the context manager also makes sure
        # the handle is closed (the previous version leaked it).
        with open(file_source, "r", encoding="utf-8") as handle:
            faculties = json.load(handle)
        # Flatten the per-faculty lists into a single list of school names.
        self.escuelas = [
            school for faculty in faculties for school in faculty['escuela']
        ]
        print(self.escuelas)
        self.i = 0  # files processed by check_file (progress counter)
        self.j = 0  # files processed by create_training_set (progress counter)
        self.training_set = []  # examples collected by create_training_set

    def unaccent_list(self, accent_list):
        """Return a new list with every string transliterated by unidecode."""
        return [unidecode.unidecode(text) for text in accent_list]

    def set_school_to_unaccent(self):
        """Replace ``self.escuelas`` with its unaccented version."""
        self.escuelas = self.unaccent_list(self.escuelas)

    def clean_spaces_text(self, text):
        """Collapse every run of whitespace in *text* into a single space.

        ``None`` or an empty string yields ``""`` so that a page for which
        text extraction produced nothing does not abort the whole scan.
        """
        if not text:
            return ""
        return " ".join(text.split())

    def set_nlp(self, model):
        """Load the spaCy pipeline named *model* into ``self.nlp_model``."""
        self.nlp_model = spacy.load(model)

    def set_matcher(self):
        """Build the school-name PhraseMatcher and keep it on ``self.matcher``.

        Matching is done on the ``LOWER`` token attribute.  Patterns are
        created with ``make_doc`` because only tokenization is needed to
        compare that attribute.

        Raises:
            RuntimeError: if :meth:`set_nlp` has not been called yet.
        """
        if self.nlp_model is None:
            raise RuntimeError("set_nlp() must be called before set_matcher()")
        matcher = PhraseMatcher(self.nlp_model.vocab, attr="LOWER")
        patterns = [self.nlp_model.make_doc(name) for name in self.escuelas]
        matcher.add(self.LABEL, patterns)
        # Store the matcher: the scanning methods read it from the instance.
        self.matcher = matcher

    @staticmethod
    def _edge_page_indices(n_pages, window=10):
        """Return ``(head, tail)`` page indices for a document of *n_pages*.

        ``head`` holds the first *window* indices in ascending order and
        ``tail`` the last *window* indices in descending order (last page
        first).  Indices already present in ``head`` are left out of ``tail``
        so short documents are neither indexed out of range nor scanned twice.
        """
        head = list(range(min(window, n_pages)))
        tail = [
            index
            for index in reversed(range(max(n_pages - window, 0), n_pages))
            if index >= len(head)
        ]
        return head, tail

    def _require_ready(self):
        """Raise RuntimeError unless both the pipeline and the matcher exist."""
        if self.nlp_model is None or self.matcher is None:
            raise RuntimeError(
                "call set_nlp() and set_matcher() before processing files"
            )

    def _read_edge_pages(self, file_source):
        """Return cleaned, unaccented text of the leading and trailing pages."""
        with pdfplumber.open(file_source) as pdf:
            pages = pdf.pages
            head_idx, tail_idx = self._edge_page_indices(
                len(pages), self.PAGE_WINDOW
            )
            head = [
                self.clean_spaces_text(pages[k].extract_text()) for k in head_idx
            ]
            tail = [
                self.clean_spaces_text(pages[k].extract_text()) for k in tail_idx
            ]
        return self.unaccent_list(head), self.unaccent_list(tail)

    def _match_page(self, page):
        """Run the pipeline and the matcher once on *page*; return both results."""
        doc = self.nlp_model(page)
        return doc, self.matcher(doc)

    def check_file(self, file_source, l):
        """Return the first school name found in the PDF at *file_source*.

        The first ``PAGE_WINDOW`` pages are searched in order, then the last
        ``PAGE_WINDOW`` pages starting from the final page.

        Args:
            file_source: Path of the PDF to inspect.
            l: Total number of files, used only for the progress bar.

        Returns:
            The matched span of the first hit, or the string ``"No school"``
            when no page contains a known school name.

        Raises:
            RuntimeError: if the pipeline or the matcher has not been set up.
        """
        self._require_ready()
        self.i += 1
        printProgressBar(self.i, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        first_pages, last_pages = self._read_edge_pages(file_source)
        for page in first_pages + last_pages:
            doc, matches = self._match_page(page)
            if matches:
                _, start, end = matches[0]
                return doc[start:end]  # stop at the first instance
        return "No school"

    def create_training_set(self, file_source, l):
        """Append one training example per school-name match in the PDF.

        Each example is ``[page_text, {"entities": [(start, end, "ESC")]}]``
        where ``start``/``end`` are character offsets into ``page_text``.
        Examples accumulate in ``self.training_set`` across calls so that a
        run over many files keeps all of them.

        Args:
            file_source: Path of the PDF to inspect.
            l: Total number of files, used only for the progress bar.

        Returns:
            Always the string ``"No school"``; kept unchanged from the
            original interface.  The collected data lives in
            ``self.training_set``.

        Raises:
            RuntimeError: if the pipeline or the matcher has not been set up.
        """
        self._require_ready()
        self.j += 1
        # Progress is tracked with this method's own counter (j, not i).
        printProgressBar(self.j, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        first_pages, last_pages = self._read_edge_pages(file_source)
        for page in first_pages + last_pages:
            doc, matches = self._match_page(page)
            for _, start, end in matches:
                # The matcher yields token indices; convert them to character
                # offsets through the span before storing the annotation.
                span = doc[start:end]
                self.training_set.append(
                    [page, {"entities": [(span.start_char, span.end_char, self.LABEL)]}]
                )
        return "No school"

    def print_training_set(self):
        """Print the examples collected so far by create_training_set."""
        print(self.training_set)
        

In [None]:
# Build the School helper from the faculties JSON file; the constructor loads
# the file and flattens every faculty's 'escuela' list into schools.escuelas.
schools = School("data/escuelas.json")

In [None]:
# Remove accents from the loaded school names (rewrites schools.escuelas in place).
schools.set_school_to_unaccent()

In [None]:
# Load the 'es_core_news_sm' spaCy model, then build the school-name matcher.
# Order matters: set_matcher() reads the model's vocab from schools.nlp_model.
schools.set_nlp('es_core_news_sm')
schools.set_matcher()

In [None]:
import pandas as pd
import numpy as np
import os

## Load the thesis index and keep only rows whose 'isScan' flag is False
## (text-only theses), then order the result by that same column.
csv_source = "data/url_thesis_200_with_scan.csv"
df = (
    pd.read_csv(csv_source)
    .loc[lambda frame: frame['isScan'].eq(False)]
    .sort_values('isScan', ascending=False)
)

In [None]:
# Row count of the filtered data frame; the bare name echoes it as cell output.
l = df.shape[0]
l

In [None]:
# Vectorize check_file so it can be applied element-wise over a data frame column.
# otypes is passed explicitly: without it, np.vectorize calls the function one
# extra time on the first input just to infer the output dtype. That would open
# the first PDF twice, advance the progress counter twice, and fix the result
# dtype from a single sample even though check_file can return either a matched
# span or the string "No school".
check_vec = np.vectorize(schools.check_file, otypes=[object])