In [None]:
import json
import pdfplumber
import spacy
import pandas as pd
import numpy as np
import os
import unidecode
import PyPDF2
from tika import parser
from spacy.matcher import PhraseMatcher
from spacy import displacy
from progress_bar.progress_bar import printProgressBar

class School:
    """Find school ("escuela") names inside thesis PDFs.

    The catalogue of names is read from a JSON file. Matching is done with a
    spaCy ``PhraseMatcher`` over lowercase, accent-free text, so the usual
    call order is: construct, ``set_school_to_unaccent()``, ``set_nlp()``,
    ``set_matcher()``, then one of the ``check_file*`` methods per PDF.
    ``create_dicts()`` followed by ``set_schools_accents()`` maps a matched
    (unaccented) name back to its original accented spelling.

    Attributes:
        escuelas: Working list of school names (unaccented once
            ``set_school_to_unaccent`` has run).
        escuelas_accent: The names exactly as they appear in the JSON file.
        training_set: Starts as an empty list; nothing in this class fills it.
        i, j, k, p: Call counters that drive the progress bar of
            ``check_file``, ``create_training_set``, ``set_schools_accents``
            and ``check_file_tika`` respectively.
    """

    def __init__(self, file_source):
        """Load the school catalogue and flatten it into one list.

        Args:
            file_source: Path to a JSON file containing a list of objects,
                each with an ``'escuela'`` key holding a list of names.
        """
        # Explicit UTF-8 keeps the accented names intact whatever the
        # platform's default encoding is; ``with`` closes the handle.
        with open(file_source, "r", encoding="utf-8") as handle:
            facultades = json.load(handle)
        self.escuelas = [
            name for facultad in facultades for name in facultad['escuela']
        ]
        # Independent copy, so the accented originals survive any later
        # change made to ``self.escuelas``.
        self.escuelas_accent = list(self.escuelas)

        print(self.escuelas)
        self.training_set = []
        self.i = 0
        self.j = 0
        self.k = 0
        self.p = 0

    def create_dictionary(self, schools):
        """Return a ``{name: position}`` dict for ``schools``.

        When a name occurs more than once, the last position wins.
        """
        return {name: position for position, name in enumerate(schools)}

    def unaccent_list(self, accent_list):
        """Return lowercase, accent-free copies of the given strings."""
        return [unidecode.unidecode(text).lower() for text in accent_list]

    def set_school_to_unaccent(self):
        """Replace ``self.escuelas`` with its unaccented, lowercase form."""
        self.escuelas = self.unaccent_list(self.escuelas)

    def create_dicts(self):
        """Build the name -> position dicts for both name lists.

        Meant to be called after ``set_school_to_unaccent`` so that the two
        dicts share positions but differ in spelling.
        """
        self.escuelas_unaccent_dict = self.create_dictionary(self.escuelas)
        self.escuelas_accent_dict = self.create_dictionary(self.escuelas_accent)

    def set_schools_accents(self, row, l):
        """Map an unaccented school name back to its accented spelling.

        Args:
            row: The unaccented name (case-insensitive). Non-string values,
                such as the NaN pandas produces for empty CSV cells, are
                treated as "no school".
            l: Total number of rows, used only for the progress bar.

        Returns:
            The accented name stored at the same position, e.g. for
            ``{..., 'Escuela de enfermería': 37, ...}`` position 37 yields
            ``'Escuela de enfermería'``; ``None`` when there is no match.
        """
        self.k += 1
        self._report_progress(self.k, l)

        if not isinstance(row, str):
            return None
        index = self.escuelas_unaccent_dict.get(row.lower())
        if index is None:
            return None
        for name, position in self.escuelas_accent_dict.items():
            if position == index:
                return name
        return None

    def clean_spaces_text(self, text):
        """Collapse every run of whitespace in ``text`` into one space."""
        return " ".join(text.split())

    def set_nlp(self, model):
        """Load the spaCy pipeline called ``model`` into ``self.nlp_model``."""
        self.nlp_model = spacy.load(model)

    def set_matcher(self):
        """Build a case-insensitive phrase matcher from ``self.escuelas``."""
        self.matcher = PhraseMatcher(self.nlp_model.vocab, attr="LOWER")
        # Patterns only need tokenisation, so the rest of the pipeline is
        # skipped when building them.
        patterns = [self.nlp_model.make_doc(name) for name in self.escuelas]
        self.matcher.add("ESC", patterns)

    def check_file_tika(self, file_source, l):
        """Return the first school name found in a PDF parsed with tika.

        Args:
            file_source: Path of the PDF.
            l: Total number of files, used only for the progress bar.

        Returns:
            The matched spaCy span, or ``None`` when nothing matches or no
            text could be extracted. Note that ``check_file`` signals the
            same situation with the string ``"No school"``.
        """
        self.p += 1
        self._report_progress(self.p, l)
        parsed_pdf = parser.from_file(file_source)
        data = parsed_pdf['content']
        # tika may yield no content at all; treat that as "no match" instead
        # of failing while cleaning up None.
        if not data:
            return None
        new_data = unidecode.unidecode(self.clean_spaces_text(data)).lower()
        doc = self.nlp_model(new_data)
        for _match_id, start, end in self.matcher(doc):
            return doc[start:end]  # first match wins
        return None

    def check_file(self, file_source, l):
        """Return the first school name found near the edges of a PDF.

        The first ten pages are searched in order, then the last ten pages
        from the end backwards. PDFs with fewer than ten pages are searched
        in full.

        Args:
            file_source: Path of the PDF.
            l: Total number of files, used only for the progress bar.

        Returns:
            The matched spaCy span, or the string ``"No school"``.
        """
        self.i += 1
        self._report_progress(self.i, l)
        for texts in self._edge_pages_text(file_source):
            found = self._first_match(texts)
            if found is not None:
                _text, doc, start, end = found
                return doc[start:end]
        return "No school"

    def create_training_set(self, file_source, l):
        """Build one training example from the first match in a PDF.

        Pages are searched in the same order as in ``check_file``.

        Args:
            file_source: Path of the PDF.
            l: Total number of files, used only for the progress bar.

        Returns:
            ``[page_text, {"entities": (start, end, 'ESC')}]`` for the first
            match, or the string ``"No school"``.
        """
        self.j += 1
        self._report_progress(self.j, l)
        for texts in self._edge_pages_text(file_source):
            found = self._first_match(texts)
            if found is not None:
                text, _doc, start, end = found
                # NOTE(review): start/end are the token positions reported by
                # the matcher, not character offsets - confirm this is what
                # the consumer of the training data expects.
                return [text, {"entities": (start, end, 'ESC')}]
        return "No school"

    def print_training_set(self):
        """Print ``self.training_set``."""
        print(self.training_set)

    def _report_progress(self, done, total):
        """Redraw the progress bar at ``done`` out of ``total``."""
        printProgressBar(done, total, prefix='Progress:', suffix='Complete', length=50)

    def _page_text(self, page):
        """Return the whitespace-normalised text of one pdfplumber page."""
        # Guard against pages with no extractable text (extract_text may
        # return None for them).
        return self.clean_spaces_text(page.extract_text() or "")

    def _edge_pages_text(self, file_source, count=10):
        """Return the texts of the first and of the last ``count`` pages.

        The second list runs from the final page backwards. Slicing keeps
        this safe for documents shorter than ``count`` pages.
        """
        with pdfplumber.open(file_source) as pdf:
            pages = pdf.pages
            first = [self._page_text(page) for page in pages[:count]]
            last = [self._page_text(page) for page in reversed(pages[-count:])]
        return first, last

    def _first_match(self, texts):
        """Return ``(text, doc, start, end)`` for the first matching text.

        Each text is unaccented and lowercased before matching; ``None`` is
        returned when no text contains a school name.
        """
        for text in self.unaccent_list(texts):
            doc = self.nlp_model(text)
            matches = self.matcher(doc)
            if matches:
                _match_id, start, end = matches[0]
                return text, doc, start, end
        return None
        

In [None]:
# Ask spaCy to use a GPU if one is available; this cell runs before the model
# is loaded in a later cell.
spacy.prefer_gpu()

In [None]:
def clean_spaces_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears as well, because the text is
    split into words and re-joined.
    """
    words = text.split()
    return " ".join(words)

In [None]:
# Load the school catalogue from JSON; the constructor also prints the
# flattened list of names.
schools = School("data/escuelas.json")

In [None]:
# Replace the working list of school names with lowercase, accent-free
# versions; the accented originals stay in schools.escuelas_accent.
schools.set_school_to_unaccent()

In [None]:
# Load the 'es_core_news_sm' spaCy model, then build the phrase matcher from
# the (now unaccented) school names.
schools.set_nlp('es_core_news_sm')
schools.set_matcher()

In [None]:
import pandas as pd
import numpy as np
import os

# Load the thesis index and keep only the text-only theses, i.e. the rows
# whose isScan flag is False.
csv_source = "data/url_thesis_8211_with_pdf_scan_image.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
# NOTE(review): every remaining row has isScan == False, so this sort has no
# distinct keys left to order by - confirm whether it is still needed.
df = df.sort_values('isScan', ascending=False)

In [None]:
# Number of theses to process; passed later as the progress-bar total.
l = len(df.index)
l

In [None]:
# schools.check_file_tika('../'+'thesis_pdf_all/762.pdf',1)

In [None]:
# parsed_pdf = parser.from_file('../'+'thesis_pdf_all/200.pdf')
  
# # saving content of pdf
# # you can also bring text only, by parsed_pdf['text'] 
# # parsed_pdf['content'] returns string 
# data = parsed_pdf['content'] 
# #   
# # Printing of content
# print(type(data))
# #print(data)
# new_data = clean_spaces_text(data)
# #print(new_data[0:90000])
# new_data = unidecode.unidecode(new_data).lower()

In [None]:
# doc = self.nlp_model(new_data)

In [None]:
# with open('../'+'thesis_pdf_all/200.pdf', mode='rb') as f:
#     reader = PyPDF2.PdfFileReader(f)
#     page = reader.getPage(1)
#     print(clean_spaces_text(page.extractText()))

In [None]:
# schools.check_file('../'+'thesis_pdf_all/200.pdf',1)

In [None]:
# Vectorize the tika-based lookup so it can be applied element-wise to the
# column of PDF paths.
# otypes=[object] keeps np.vectorize from calling the function an extra time
# on the first path just to infer the output dtype (that call would re-parse
# the first PDF and advance the progress counter once too often).
check_vec = np.vectorize(schools.check_file_tika, otypes=[object])

In [None]:
#Ignore this
#schools.create_training_set("../thesis_pdf/1.pdf", 1)

In [None]:
# Look up the school for every thesis; each path is prefixed with "../" and
# l is the total amount of theses to check (progress-bar total).
values_c = check_vec("../"+df["path"],l)

In [None]:
# Store the matched school as the 'school_simple' column and write the
# result to CSV (without the index column).
df['school_simple'] = values_c.tolist()
df.to_csv("./data/thesis_7801_with_school.csv", index=False)

In [None]:
# Reload the CSV written above, which now carries the 'school_simple' column,
# and again keep only the rows whose isScan flag is False.
csv_source = "./data/thesis_7801_with_school.csv"
df = pd.read_csv(csv_source)
df = df[df['isScan']==False]
# NOTE(review): every remaining row has isScan == False, so this sort has no
# distinct keys left to order by - confirm whether it is still needed.
df = df.sort_values('isScan', ascending=False)

In [None]:
# Number of rows in the reloaded data frame; used as the progress-bar total.
l = len(df.index)
l

In [None]:
# Build the name -> position dictionaries for the unaccented and the accented
# school lists.
schools.create_dicts()

In [None]:
# Display the unaccented name -> position dictionary for inspection.
schools.escuelas_unaccent_dict

In [None]:
# Vectorize the accent lookup so it can be applied element-wise to the
# 'school_simple' column.
# otypes=[object] fixes the output type up front: otherwise np.vectorize
# infers it from the first result (calling the function one extra time), and
# that inferred type differs depending on whether the first row yields a
# string or None.
check_vec_accents = np.vectorize(schools.set_schools_accents, otypes=[object])
schools.create_dicts()

In [None]:
# Display the unaccented matches that are about to be mapped back.
df['school_simple']

In [None]:
# Rebuild the name -> position dictionaries (same call as in the earlier
# cells; it recomputes both dicts from the current lists).
schools.create_dicts()

In [None]:
# Map every unaccented match back to its accented name; l is the progress-bar
# total.
values_accent = check_vec_accents(df['school_simple'],l)

In [None]:
# Display the accented names for inspection.
values_accent

In [None]:
# Store the accented school name as the 'school_complex' column and write
# the final CSV (without the index column).
df['school_complex'] = values_accent.tolist()
df.to_csv("./data/thesis_7801_with_resumen_school_complex.csv", index=False)