# Quality check of the OCRs from BNC

In [2]:
import os
import itertools
import json
import spacy
import chardet
from langdetect import detect_langs
import numpy as np
import pandas as pd
import plotly.express as px

# from transformers import pipeline
# from datasets import load_dataset

In [3]:
pd.set_option('display.max_rows', 500)
np.seterr(invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

## Create dictionaries containing all words from each language

In [4]:
# Load full english, spanish, catalan, french, italian and german vocabularies from ubuntu's package
languages_path = "/usr/share/dict"
os.listdir(languages_path)

['ngerman',
 'catala',
 'cracklib-small',
 'american-english',
 'french',
 'README.select-wordlist',
 'catalan',
 'words.pre-dictionaries-common',
 'ogerman',
 'italian',
 'british-english',
 'swiss',
 'words',
 'spanish']

In [5]:
english = os.path.join(languages_path, "british-english")
spanish = os.path.join(languages_path, "spanish")
catalan = os.path.join(languages_path, "catalan")
french = os.path.join(languages_path, "french")
italian = os.path.join(languages_path, "italian")
german = os.path.join(languages_path, "ngerman")

In [6]:
def _read_wordlist(path):
    """Return every line of the word list at `path` (newlines kept), closing the file afterwards."""
    with open(path) as wordlist_file:
        return wordlist_file.readlines()


# One raw list of newline-terminated entries per language.
# NOTE(review): a `german` path is defined in the previous cell but no German list is loaded here.
english_dict = _read_wordlist(english)
spanish_dict = _read_wordlist(spanish)
catalan_dict = _read_wordlist(catalan)
french_dict = _read_wordlist(french)
italian_dict = _read_wordlist(italian)

In [7]:
print(english_dict[:100])

['A\n', 'AA\n', 'AAA\n', "AA's\n", 'AB\n', 'ABC\n', "ABC's\n", 'ABCs\n', 'ABM\n', "ABM's\n", 'ABMs\n', "AB's\n", 'AC\n', 'ACLU\n', "ACLU's\n", 'ACT\n', 'ACTH\n', "ACTH's\n", "AC's\n", 'AF\n', 'AFAIK\n', 'AFC\n', "AFC's\n", 'AI\n', 'AIDS\n', "AIDS's\n", "AI's\n", 'AIs\n', 'AK\n', 'AL\n', 'AM\n', 'AMA\n', 'AMD\n', "AMD's\n", "AM's\n", 'ANSI\n', 'ANSIs\n', 'ANZUS\n', "ANZUS's\n", 'AOL\n', "AOL's\n", 'AP\n', 'API\n', 'APO\n', "AP's\n", 'AR\n', 'ASAP\n', 'ASCII\n', "ASCII's\n", 'ASCIIs\n', 'ASL\n', "ASL's\n", 'ASPCA\n', 'ATM\n', "ATM's\n", 'ATP\n', "ATP's\n", 'ATV\n', 'AV\n', 'AWACS\n', "AWACS's\n", 'AWOL\n', "AWOL's\n", 'AWS\n', "AWS's\n", 'AZ\n', 'AZT\n', "AZT's\n", "AZ's\n", 'Aachen\n', "Aachen's\n", 'Aaliyah\n', "Aaliyah's\n", 'Aaron\n', "Aaron's\n", 'Abbas\n', 'Abbasid\n', "Abbasid's\n", "Abbas's\n", 'Abbott\n', "Abbott's\n", 'Abby\n', "Abby's\n", 'Abdul\n', "Abdul's\n", 'Abe\n', 'Abel\n', 'Abelard\n', "Abelard's\n", "Abel's\n", 'Abelson\n', "Abelson's\n", 'Aberdeen\n', "Aberdeen's\n",

In [8]:
print(catalan_dict[:100])

['AG\n', 'AGFA\n', 'AGL\n', 'AM\n', 'APLEC\n', 'ASCII\n', 'Aaron\n', 'Abada\n', 'Abadal\n', 'Abadals\n', 'Abadessa\n', 'Abadesses\n', 'Abadia\n', 'Abarca\n', 'Abarques\n', 'Abat\n', 'Abdalong\n', 'Abdies\n', 'Abdon\n', 'Abel\n', 'Abelard\n', 'Abell\n', 'Abella\n', 'Abellan\n', 'Abellana\n', 'Abellanes\n', 'Abellà\n', 'Abellí\n', 'Abelló\n', 'Aberci\n', 'Abili\n', 'Abissínia\n', 'Abkhàzia\n', 'Abraham\n', 'Abrera\n', 'Abriac\n', 'Abrian\n', 'Abriat\n', 'Abril\n', 'Abrines\n', 'Abruços\n', 'Abrí\n', 'Absalò\n', 'Absaló\n', 'Abulí\n', 'Abundanci\n', 'Abundi\n', 'Abundància\n', 'Acaci\n', 'Acaia\n', 'Acari\n', 'Acarnània\n', 'Accad\n', 'Aché\n', 'Aciscle\n', 'Acurci\n', 'Acàdia\n', 'Ada\n', 'Adabald\n', 'Adalbert\n', 'Adalberó\n', 'Adalgot\n', 'Adalric\n', 'Adalsinda\n', 'Adam\n', 'Adarró\n', 'Adaucte\n', 'Adela\n', 'Adelaida\n', 'Adelard\n', 'Adelbert\n', 'Adelf\n', 'Adelina\n', 'Adell\n', 'Adellac\n', 'Adelvina\n', 'Adelí\n', 'Ademar\n', 'Ademús\n', 'Adeodat\n', 'Adern\n', 'Adiló\n', 'Ad

In [9]:
def process_words(elem):
    """Normalise a raw dictionary entry: lowercase it and drop every newline character."""
    lowered = elem.lower()
    return "".join(lowered.split("\n"))

In [10]:
# Normalised (lowercase, newline-free) and de-duplicated vocabulary per language.
english_set = {process_words(entry) for entry in english_dict}
spanish_set = {process_words(entry) for entry in spanish_dict}
catalan_set = {process_words(entry) for entry in catalan_dict}
french_set = {process_words(entry) for entry in french_dict}
italian_set = {process_words(entry) for entry in italian_dict}

In [11]:
print("english vocabulary lenght:", len(english_set))
print("spanish vocabulary lenght:", len(spanish_set))
print("catalan vocabulary lenght:", len(catalan_set))
print("french vocabulary lenght:", len(french_set))
print("italian vocabulary lenght:", len(italian_set))

english vocabulary lenght: 101668
spanish vocabulary lenght: 86014
catalan vocabulary lenght: 609711
french vocabulary lenght: 346200
italian vocabulary lenght: 116751


In [12]:
## Load a larger Spanish dictionary (Wikipedia corpus)
# The context manager closes the JSON file once it has been parsed.
with open("../data/processed/WORDS_spanish.json", "r") as words_file:
    WORDS_spanish = json.load(words_file)
spanish_set_new = set(WORDS_spanish.keys())
# Keep a handle on the package-based vocabulary so both sources can be compared.
spanish_set_old = spanish_set

In [13]:
print("old spanish vocabulary lenght:", len(spanish_set_old))
print("new spanish vocabulary lenght:", len(spanish_set_new))

old spanish vocabulary lenght: 86014
new spanish vocabulary lenght: 223914


In [14]:
# new words are mostly plurals and verb conjugations
print(len(spanish_set_new.difference(spanish_set_old)))
print(list(spanish_set_new.difference(spanish_set_old))[:100])

188049
['dharmakirti', 'lucrecia', 'manejó', 'imaginarte', 'abrevia', 'persuader', 'desobediencias', 'seiryuto', 'tns', 'revolverlas', 'pajín', 'ícaro', 'intoxicantes', 'leérsela', 'rescataremos', 'purge', 'emite', 'cepillos', 'perturbados', 'laterano', 'ejercerlas', 'patagonia', 'pasivantes', 'bidimencionales', 'escenificando', 'entronice', 'strokes', 'apéate', 'guinness', 'constanza', 'editabilidad', 'torciendo', 'impuse', 'encaminarlo', 'enfrascados', 'pinochet', 'definío', 'arandelas', 'empuñados', 'deshielos', 'impolutas', 'harbhajan', 'resucitas', 'aumentose', 'quebramos', 'tortolica', 'relampaguearon', 'amontonaré', 'kackeier', 'aviñoneses', 'absortos', 'signatorios', 'canova', 'óseas', 'ahead', 'pesara', 'extirpa', 'implementation', 'suplencia', 'adormecerle', 'interfase', 'mandarinas', 'volcaremos', 'engolosinarlas', 'realistas', 'controlarla', 'últimalos', 'celayeta', 'partióles', 'tomlinson', 'tcica', 'almuerzas', 'multiplexada', 'encabezadas', 'transexuales', 'falansteriano

In [15]:
spanish_set = spanish_set_old.union(spanish_set_new)

In [16]:
print("english:", set(itertools.islice(english_set, 50)), sep="\n", end="\n\n")
print("spanish:", set(itertools.islice(spanish_set, 50)), sep="\n", end="\n\n")
print("catalan:", set(itertools.islice(catalan_set, 50)), sep="\n", end="\n\n")
print("french:", set(itertools.islice(french_set, 50)), sep="\n", end="\n\n")
print("italian:", set(itertools.islice(italian_set, 50)), sep="\n", end="\n\n")

english:
{'yogis', 'thrashes', 'cheetah', 'anchorage', "garb's", 'innocently', 'tourist', 'overspreads', 'pups', 'fecundity', 'permanence', "telegraph's", 'salvadorans', 'bullfighter', 'hairsplitting', "ruffle's", 'overbook', 'sortied', "clump's", 'spaceship', 'spitefully', 'percolator', "xterm's", 'overused', 'bracket', 'purge', 'skylarks', 'corks', 'lubber', "imposture's", 'patagonia', 'channelled', 'anterior', "styrofoam's", "saline's", 'strokes', 'gpa', 'guinness', 'markab', 'eyeliner', 'outsold', "thou's", 'fe', 'pinochet', 'reconnoitred', "competitiveness's", 'spumone', 'vouchers', "dilution's", 'expurgating'}

spanish:
{'dharmakirti', 'amontonaré', 'kackeier', 'aviñoneses', 'arbotante', 'dianense', 'imaginarte', 'enorfanecida', 'carneruno', 'reprender', 'inarticulado', 'parafraseador', 'persuader', 'desobediencias', 'ahead', 'implementation', 'suplencia', 'esquilmo', 'tns', 'salma', 'hirviente', 'masticina', 'mandarinas', 'cembo', 'pajín', 'ícaro', 'intoxicantes', 'leérsela', 'r

In [17]:
# remove one character words and two characters words when both are consonants
# Vowels (plain and accented) recognised for each language code.
vowels = {}
vowels["ca"] = ["a", "e", "i", "o", "u", "à", "è", "é", "í", "ò", "ó", "ú", "ï", "ö"]
vowels["es"] = ["a", "e", "i", "o", "u", "á", "é", "í", "ó", "ú"]
vowels["fr"] = ["a", "e", "i", "o", "u", "à", "â", "è", "é", "ê", "í", "î", "ò", "ó", "ù", "û"]
# "ì" and "í" must be separate items: without the comma Python concatenates the two
# adjacent literals into the single string "ìí", so neither vowel would be recognised.
vowels["it"] = ["a", "e", "i", "o", "u", "à", "á", "è", "é", "ì", "í", "ò", "ù"]
vowels["en"] = ["a", "e", "i", "o", "u"]

def check_valid_words(word, lang):
    """Tell whether `word` should be kept in the vocabulary of language `lang`.

    Parameters
    ----------
    word : str
        Candidate vocabulary entry.
    lang : str
        Key of the module-level `vowels` dict ("ca", "es", "fr", "it" or "en").

    Returns
    -------
    bool
        False for one-character words and for two-character words in which
        neither character is a vowel of `lang`; True for everything else
        (including the empty string).

    Raises
    ------
    KeyError
        If `word` has two characters and `lang` is not a key of `vowels`.
    """
    if len(word) == 1:
        return False
    lang_vowels = vowels[lang] if len(word) == 2 else ()
    if len(word) == 2 and word[0] not in lang_vowels and word[1] not in lang_vowels:
        return False
    return True

In [18]:
# Keep only the entries accepted by check_valid_words for the matching language.
spanish_set = {entry for entry in spanish_set if check_valid_words(entry, "es")}
english_set = {entry for entry in english_set if check_valid_words(entry, "en")}
catalan_set = {entry for entry in catalan_set if check_valid_words(entry, "ca")}
italian_set = {entry for entry in italian_set if check_valid_words(entry, "it")}
french_set = {entry for entry in french_set if check_valid_words(entry, "fr")}

In [19]:
# create spanish character dictionary: every character used by a Spanish word, plus the space
char_spanish_set = set(itertools.chain.from_iterable(spanish_set))
char_spanish_set.add(" ")

In [20]:
char_spanish_list = list(char_spanish_set)
char_spanish_list.sort()
print(char_spanish_list)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'á', 'é', 'í', 'ñ', 'ó', 'ú', 'ü']


In [21]:
[el for el in list(catalan_set) if len(el) < 3]

['sí',
 'er',
 'en',
 'up',
 'or',
 'pe',
 'om',
 'ió',
 'dà',
 'oh',
 'ip',
 'et',
 'só',
 'ag',
 'si',
 'ce',
 'nu',
 'ad',
 'jo',
 'ah',
 'bo',
 'mo',
 'as',
 'di',
 'eó',
 'ei',
 'ús',
 'be',
 'au',
 'ça',
 'to',
 'ix',
 'es',
 'xa',
 'am',
 'do',
 'em',
 'al',
 'na',
 'da',
 'ta',
 'lo',
 'fa',
 'un',
 'ot',
 'xo',
 'ex',
 'fi',
 'ou',
 'so',
 'sé',
 've',
 're',
 'el',
 'iu',
 'ma',
 'fe',
 'oí',
 'xe',
 'ha',
 'no',
 'pi',
 'eh',
 'ne',
 'is',
 'ol',
 'ai',
 'oi',
 'ge',
 'ni',
 'me',
 'ut',
 'hi',
 'li',
 'te',
 'la',
 'sa',
 'ço',
 'uf',
 'és',
 'us',
 'oc',
 'ep',
 'tu',
 'du',
 'té',
 'ro',
 'va',
 'um',
 'ec',
 'ca',
 'ós',
 'dí',
 'oï',
 'eu',
 'ja',
 'se',
 'on',
 'it',
 'vi',
 'pa',
 'mi',
 'os',
 'mà',
 'in',
 'ho',
 'bé',
 'de',
 'op',
 'he',
 'ur',
 'cu',
 'uc',
 'ui',
 'ac']

## Stop words

In [22]:
## spacy stopwords
nlp_ca = spacy.load("ca_core_news_sm")
nlp_fr = spacy.load("fr_core_news_sm")
nlp_it = spacy.load("it_core_news_sm")
nlp_es = spacy.load("es_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
nlp_pt = spacy.load("pt_core_news_sm")

In [23]:
stop_words_ca = nlp_ca.Defaults.stop_words
stop_words_fr = nlp_fr.Defaults.stop_words
stop_words_it = nlp_it.Defaults.stop_words
stop_words_es = nlp_es.Defaults.stop_words
stop_words_en = nlp_en.Defaults.stop_words
stop_words_pt = nlp_pt.Defaults.stop_words

In [24]:
# create a vocabulary and a stop words sets using all languages
# NOTE(review): english_set is not merged into vocabulary_all, although the English
# (and Portuguese) stop words are merged into stop_words_all - confirm this is intended.
vocabulary_all = french_set.union(catalan_set, italian_set, spanish_set)
stop_words_all = stop_words_ca.union(
    stop_words_fr, stop_words_it, stop_words_es, stop_words_en, stop_words_pt
)

In [25]:
# Lookup table keyed by language code: for each language, the vocabulary set
# used to decide whether a word exists and the stop-word set removed beforehand.
vocabularies = {
    "es": {
        "vocabulary": spanish_set,
        "stop_words": stop_words_es
    },
    "ca": {
        "vocabulary": catalan_set,
        "stop_words": stop_words_ca
    },
    "fr": {
        "vocabulary": french_set,
        "stop_words": stop_words_fr
    },
    "it": {
        "vocabulary": italian_set,
        "stop_words": stop_words_it
    },
    "en": {
        "vocabulary": english_set,
        "stop_words": stop_words_en
    },
    "pt": {
        # No Portuguese vocabulary is loaded, so this entry only carries stop words;
        # looking up vocabularies["pt"]["vocabulary"] would raise a KeyError.
        # "vocabulary": portuguese_set,
        "stop_words": stop_words_pt
    },
    "non_identified": {
        # Merged sets, used for text that was not assigned to a single language.
        "vocabulary": vocabulary_all,
        "stop_words": stop_words_all
    }
}

# Check quality for each txt file 

In [26]:
columns = ["revista", "publicacio", "idiomes", "es_perc", "ca_perc", "fr_perc", "it_perc", "en_perc", 
           "non_identified_perc", "es_existing", "ca_existing", "fr_existing", "it_existing", "en_existing",
           "non_identified_existing", "n_words", "total_existing"]

In [27]:
def read_and_process_file(df, path_publications, revista, revista_words, file):
    """Measure how many words of one OCR text file exist in the language vocabularies.

    Files whose name does not end in ".txt" are ignored. For a text file the
    content is decoded (encoding guessed with chardet, falling back to latin-1),
    lowercased and stripped of a few punctuation marks. Every line is assigned
    with langdetect to one of the tracked languages (or to "non_identified"),
    stop words are removed, and the remaining words are looked up in the
    matching vocabulary of the module-level `vocabularies` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table; one row is appended in place for the processed file.
        The row keys built here match the names in the module-level `columns` list.
    path_publications : str
        Directory that contains one sub-directory per revista.
    revista : str
        Name of the sub-directory the file lives in.
    revista_words : dict
        Mutated in place: gets a new entry keyed by the file name without its
        ".txt" suffix, mapping each language to its "existing" and
        "non_existing" word lists.
    file : str
        File name inside the revista directory.

    Returns
    -------
    None
        Results are reported through `df`, `revista_words` and a printed line.
    """
    if not file.endswith(".txt"):
        return

    doc_key = file[:-4]  # file name without the ".txt" suffix
    revista_words[doc_key] = {}
    file_words = revista_words[doc_key]

    print(revista, file, end=" -> ")

    # Build the path from the function argument, not from a notebook-level global.
    file_path = os.path.join(path_publications, revista, file)

    # read text: guess the encoding from the raw bytes first
    with open(file_path, "rb") as file_opener:
        raw_data = file_opener.read()
    encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(file_path, encoding=encoding) as file_opener:
            data_file = file_opener.read()
    except (UnicodeError, LookupError):
        # wrong or unknown guessed encoding: latin-1 can decode any byte sequence
        with open(file_path, encoding="latin-1") as file_opener:
            data_file = file_opener.read()

    # preprocess the text: lowercase, remove weird char \x0c, dots, commas, "d'" and parentheses
    clean_data_file = data_file.lower().replace("\x0c", "").replace(".", "").replace(",", "") \
        .replace("d'", "").replace("(", "").replace(")", "")

    # language detection of each line
    text_lang = {"es":[], "ca":[], "it":[], "fr":[], "en":[], "non_identified":[]} # "pt":[], "de":[],

    for line in clean_data_file.split("\n"):
        # lines without any letter cannot be classified
        if not any(c.isalpha() for c in line):
            text_lang["non_identified"].append(line)
            continue

        try:
            langs = detect_langs(line)
        except Exception:
            # best effort: a line the detector cannot handle is kept as non identified
            text_lang["non_identified"].append(line)
            continue

        # the line goes to the first detected language that is tracked
        for candidate in langs:
            code = str(candidate)[:2]
            if code in text_lang:
                text_lang[code].append(line)
                break
        else:
            text_lang["non_identified"].append(line)

    count_lang = {}
    count_existing = {}
    for lang, lines in text_lang.items():

        file_words[lang] = {"existing": [], "non_existing": []}

        # split text into words and remove empty strings
        word_list_data_file = [s for s in " ".join(lines).split(" ") if s != ""]

        # remove stop words
        stop_words = vocabularies[lang]["stop_words"]
        word_list_data_file = [w for w in word_list_data_file if w not in stop_words]

        # count the number of words
        count_lang[lang] = len(word_list_data_file)

        # count existing words in the vocabulary
        vocabulary = vocabularies[lang]["vocabulary"]
        count_existing[lang] = 0

        for word in word_list_data_file:
            if word in vocabulary:
                count_existing[lang] += 1
                file_words[lang]["existing"].append(word)
            else:
                file_words[lang]["non_existing"].append(word)

    # compute % of existing words in every language and in overall per document and save
    # results in a dataframe (np.divide turns a zero denominator into nan/inf instead of raising)
    total_words = sum(count_lang.values())
    row = {
        "revista": revista,
        "publicacio": file,
        "idiomes": [],
        "n_words": total_words,
        "total_existing": round(np.divide(sum(count_existing.values()), total_words)*100, 2)
    }

    for lang, count in count_lang.items():
        percentage = round(np.divide(count, total_words)*100, 2)
        row[lang+"_perc"] = percentage
        # a language is listed for the document when it holds more than 5% of the words
        if (percentage > 5) and (lang != "non_identified"):
            row["idiomes"].append(lang)

        row[lang+"_existing"] = round(np.divide(count_existing[lang], count)*100, 2)

    print(row["total_existing"], "%")

    df.loc[len(df.index)] = row

In [28]:
def quality_check(path_publications, revistes=None):
    """Run the quality check over several revistes and collect one results table.

    NOTE(review): a later cell defines another `quality_check`; once that cell
    is executed it replaces this definition.

    Parameters
    ----------
    path_publications : str
        Directory holding one sub-directory per revista. Its last "/"-separated
        component is used in the names of the output files.
    revistes : list of str, optional
        Sub-directories to process. When None, every entry of
        `path_publications` is processed. The list is not modified.

    Returns
    -------
    pandas.DataFrame
        The table filled by `read_and_process_file`, one row per ".txt" file.

    Side effects
    ------------
    After every file, rewrites
    "../data/processed/resultats/results_{name}.csv" and
    "../data/processed/resultats/all_words_{name}.json", so the files on disk
    always reflect the progress so far.
    """
    # initialize results dataframe
    df = pd.DataFrame(columns=columns)

    # Read from the function argument, not from a notebook-level global.
    if revistes is None:
        publicacions_list = os.listdir(path_publications)
    else:
        publicacions_list = revistes

    # sorted() works on a copy, so the caller's list keeps its order
    publicacions_list = sorted(publicacions_list)

    name = path_publications.split("/")[-1]

    all_words = {}
    for revista in publicacions_list:
        all_words[revista] = {}
        revista_words = all_words[revista]

        docs_list = sorted(os.listdir(os.path.join(path_publications, revista)))
        print()

        for file in docs_list:
            read_and_process_file(df, path_publications, revista, revista_words, file)

            df.to_csv(f"../data/processed/resultats/results_{name}.csv", index=False)
            # the context manager flushes and closes the JSON file after every dump
            with open(f"../data/processed/resultats/all_words_{name}.json", "w") as words_file:
                json.dump(all_words, words_file)

    return df

In [30]:
def quality_check(path_publications, revistes=None):
    """Run `quality_check_revista` for every requested revista, in sorted order.

    Parameters
    ----------
    path_publications : str
        Directory holding one sub-directory per revista.
    revistes : list of str, optional
        Sub-directories to process. When None, every entry of
        `path_publications` is processed. The list is not modified.

    Returns
    -------
    None
        The DataFrames returned by `quality_check_revista` are not collected
        here; that function writes its results to disk.
    """
    # Read from the function argument, not from a notebook-level global.
    if revistes is None:
        publicacions_list = os.listdir(path_publications)
    else:
        publicacions_list = revistes

    # sorted() works on a copy, so the caller's list keeps its order
    for revista in sorted(publicacions_list):
        quality_check_revista(path_publications, revista)

In [29]:
def quality_check_revista(path_publications, revista):
    """Run the quality check on every file of a single revista.

    Parameters
    ----------
    path_publications : str
        Directory holding one sub-directory per revista. Its last "/"-separated
        component is used in the names of the output files.
    revista : str
        Sub-directory of `path_publications` to process.

    Returns
    -------
    pandas.DataFrame
        The table filled by `read_and_process_file`, one row per ".txt" file.

    Side effects
    ------------
    After every file, rewrites
    "../data/processed/resultats/results_{name}_{revista}.csv" and
    "../data/processed/resultats/all_words_{name}_{revista}.json".
    """
    # initialize results dataframe
    df = pd.DataFrame(columns=columns)

    name = path_publications.split("/")[-1]

    all_words = {}
    all_words[revista] = {}
    revista_words = all_words[revista]

    # Read from the function argument, not from a notebook-level global.
    docs_list = sorted(os.listdir(os.path.join(path_publications, revista)))

    for file in docs_list:
        read_and_process_file(df, path_publications, revista, revista_words, file)

        df.to_csv(f"../data/processed/resultats/results_{name}_{revista}.csv", index=False)
        # the context manager flushes and closes the JSON file after every dump
        with open(f"../data/processed/resultats/all_words_{name}_{revista}.json", "w") as words_file:
            json.dump(all_words, words_file)

    return df

In [97]:
path_publicacions = "../../Dades_revistes/Publicacions"
# results = quality_check(path_publicacions)



391 19170202.txt -> 62.23 %
391 19170303.txt -> 65.74 %
391 19170605.txt -> 68.22 %
391 19170706.txt -> 54.07 %
391 19200312.txt -> 4.66 %
391 19200713.txt -> 49.33 %
391 19201114.txt -> 47.4 %
391 19240516.txt -> 28.29 %
391 19241019.txt -> 36.63 %

AC 193101.txt -> 62.51 %
AC 193102.txt -> nan %
AC 193103.txt -> nan %
AC 193104.txt -> nan %
AC 193205.txt -> nan %
AC 193206.txt -> nan %
AC 193207.txt -> 58.13 %
AC 193208.txt -> 51.55 %
AC 193309.txt -> 64.38 %
AC 193310.txt -> 62.4 %
AC 193311.txt -> 60.15 %
AC 193312.txt -> 61.28 %
AC 193413.txt -> 73.11 %
AC 193414.txt -> 70.43 %
AC 193415.txt -> 73.24 %
AC 193416.txt -> 76.84 %
AC 193517.txt -> 69.74 %
AC 193518.txt -> 75.91 %

Fulls grocs 19291201.txt -> 69.46 %

Proa Untitled2.txt -> 67.7 %

algol 1947.txt -> 83.53 %

amicarts 19260401.txt -> 69.8 %
amicarts 19260502.txt -> 70.32 %
amicarts 19260603.txt -> 67.25 %
amicarts 19260704.txt -> 69.19 %
amicarts 19260805.txt -> 68.73 %
amicarts 19260906.txt -> 69.98 %
amicarts 19261007

In [41]:
## afegim les 5 publicacions de AC que estaven buides
path_publicacions = "../../Dades_revistes/Publicacions"
revista = "AC"
quality_check_revista(path_publicacions, revista)

AC 193101.txt -> 62.54 %
AC 193102.txt -> 53.11 %
AC 193103.txt -> 45.21 %
AC 193104.txt -> 48.43 %
AC 193205.txt -> 32.22 %
AC 193206.txt -> 52.01 %
AC 193207.txt -> 58.05 %
AC 193208.txt -> 51.54 %
AC 193309.txt -> 64.59 %
AC 193310.txt -> 62.32 %
AC 193311.txt -> 59.69 %
AC 193312.txt -> 61.41 %
AC 193413.txt -> 73.13 %
AC 193414.txt -> 70.5 %
AC 193415.txt -> 73.37 %
AC 193416.txt -> 76.95 %
AC 193517.txt -> 69.59 %
AC 193518.txt -> 75.94 %


Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,AC,193101.txt,"[es, ca]",76.41,12.11,1.05,1.51,1.22,7.7,77.03,8.99,10.91,11.39,25.0,26.05,5235,62.54
1,AC,193102.txt,"[es, ca]",71.58,15.65,0.98,3.45,0.48,7.85,69.12,10.97,7.94,5.88,9.68,20.28,6407,53.11
2,AC,193103.txt,"[es, ca]",60.88,20.93,1.01,3.88,1.34,11.96,67.78,7.38,11.67,9.96,10.0,14.75,5954,45.21
3,AC,193104.txt,"[es, ca]",67.84,17.31,1.48,3.82,1.39,8.17,64.94,11.97,15.38,12.82,12.94,17.17,6131,48.43
4,AC,193205.txt,"[es, ca, it]",43.84,36.4,0.88,5.01,1.17,12.7,63.64,5.83,11.43,8.82,11.83,11.93,7924,32.22
5,AC,193206.txt,"[es, ca]",69.77,15.19,1.14,3.41,1.06,9.42,68.82,6.92,9.78,12.68,15.12,23.75,8089,52.01
6,AC,193207.txt,"[es, ca]",74.85,11.76,0.88,4.1,0.97,7.45,72.22,12.74,20.55,10.91,12.5,23.5,8277,58.05
7,AC,193208.txt,"[es, ca]",63.6,15.28,1.43,3.41,1.16,15.12,72.5,8.62,15.56,9.3,15.07,22.46,6302,51.54
8,AC,193309.txt,"[es, ca]",80.93,8.08,0.58,1.65,0.68,8.07,76.23,10.94,23.64,14.01,13.85,19.3,9501,64.59
9,AC,193310.txt,"[es, ca]",77.85,9.89,1.24,2.11,0.22,8.7,75.1,14.08,27.18,16.48,11.11,20.14,8333,62.32


In [107]:
results = pd.read_csv("resultats/results_Publicacions.csv")
results

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,391,19170202.txt,['fr'],0.0,3.57,84.96,0.11,0.54,10.82,,27.27,70.57,0.0,40.0,10.0,924,62.23
1,391,19170303.txt,['fr'],0.3,0.0,94.14,0.3,0.66,4.6,33.33,,69.19,0.0,38.46,5.49,1979,65.74
2,391,19170605.txt,"['fr', 'en']",0.9,1.55,86.14,0.8,6.31,4.3,66.67,19.35,71.7,12.5,72.22,20.93,1998,68.22
3,391,19170706.txt,['fr'],0.0,1.16,70.35,2.91,4.36,21.22,,25.0,73.97,0.0,26.67,2.74,344,54.07
4,391,19200312.txt,"['ca', 'it', 'fr']",0.0,28.94,36.37,6.03,2.65,26.02,,3.47,6.14,0.0,10.19,4.43,4078,4.66
5,391,19200713.txt,['fr'],0.63,3.94,84.6,2.51,0.54,7.79,42.86,11.36,56.72,7.14,0.0,5.75,1117,49.33
6,391,19201114.txt,"['ca', 'fr']",0.58,5.28,80.71,0.66,2.39,10.39,28.57,14.06,55.06,0.0,48.28,8.73,1213,47.4
7,391,19240516.txt,"['ca', 'fr']",0.0,6.44,43.14,0.0,3.36,47.06,,8.7,59.09,,25.0,2.98,357,28.29
8,391,19241019.txt,['fr'],2.04,3.07,75.47,0.0,0.51,18.91,33.33,5.56,46.95,,0.0,1.8,587,36.63
9,AC,193101.txt,"['es', 'ca']",76.67,12.03,1.15,1.74,1.01,7.41,76.82,8.89,11.67,10.99,30.19,25.77,5239,62.51


In [42]:
results[results["revista"]!="arcvoltaic"]["n_words"].mean()

7118.341404358353

In [90]:
## merge all_words_json of all publications with AC
# Context managers close both JSON files as soon as they have been parsed.
with open("../data/processed/resultats/all_words_bloc1.json", "r") as bloc1_file:
    all_words_bloc1 = json.load(bloc1_file)
with open("../data/processed/resultats/all_words_Publicacions_AC.json", "r") as bloc1_ac_file:
    all_words_bloc1_ac = json.load(bloc1_ac_file)

In [88]:
all_words_bloc1["AC"] = all_words_bloc1_ac["AC"]

In [89]:
#json.dump(all_words_bloc1, open(f"../data/processed/resultats/all_words_bloc1.json", "w"))

In [92]:
results_bloc1 = pd.read_csv("../data/processed/resultats/results_bloc1.csv")
results_bloc1_ac = pd.read_csv("../data/processed/resultats/results_Publicacions_AC.csv")

In [94]:
results_bloc1[results_bloc1["revista"]=="AC"]

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
9,AC,193101.txt,"['es', 'ca']",76.67,12.03,1.15,1.74,1.01,7.41,76.82,8.89,11.67,10.99,30.19,25.77,5239,62.51
10,AC,193102.txt,[],,,,,,,,,,,,,0,
11,AC,193103.txt,[],,,,,,,,,,,,,0,
12,AC,193104.txt,[],,,,,,,,,,,,,0,
13,AC,193205.txt,[],,,,,,,,,,,,,0,
14,AC,193206.txt,[],,,,,,,,,,,,,0,
15,AC,193207.txt,"['es', 'ca']",74.75,12.46,0.94,3.52,0.99,7.34,72.37,12.71,21.79,11.68,12.2,23.39,8274,58.13
16,AC,193208.txt,"['es', 'ca']",64.25,14.89,1.41,3.33,0.94,15.18,72.13,8.74,15.73,9.52,16.95,21.13,6299,51.55
17,AC,193309.txt,"['es', 'ca']",80.73,8.34,0.54,2.19,0.59,7.62,76.19,10.59,25.49,12.02,14.29,19.75,9505,64.38
18,AC,193310.txt,"['es', 'ca']",78.1,9.89,1.27,1.84,0.34,8.56,75.12,13.71,25.47,15.03,10.71,20.34,8328,62.4


In [95]:
results_bloc1_ac

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,AC,193101.txt,"['es', 'ca']",76.41,12.11,1.05,1.51,1.22,7.7,77.03,8.99,10.91,11.39,25.0,26.05,5235,62.54
1,AC,193102.txt,"['es', 'ca']",71.58,15.65,0.98,3.45,0.48,7.85,69.12,10.97,7.94,5.88,9.68,20.28,6407,53.11
2,AC,193103.txt,"['es', 'ca']",60.88,20.93,1.01,3.88,1.34,11.96,67.78,7.38,11.67,9.96,10.0,14.75,5954,45.21
3,AC,193104.txt,"['es', 'ca']",67.84,17.31,1.48,3.82,1.39,8.17,64.94,11.97,15.38,12.82,12.94,17.17,6131,48.43
4,AC,193205.txt,"['es', 'ca', 'it']",43.84,36.4,0.88,5.01,1.17,12.7,63.64,5.83,11.43,8.82,11.83,11.93,7924,32.22
5,AC,193206.txt,"['es', 'ca']",69.77,15.19,1.14,3.41,1.06,9.42,68.82,6.92,9.78,12.68,15.12,23.75,8089,52.01
6,AC,193207.txt,"['es', 'ca']",74.85,11.76,0.88,4.1,0.97,7.45,72.22,12.74,20.55,10.91,12.5,23.5,8277,58.05
7,AC,193208.txt,"['es', 'ca']",63.6,15.28,1.43,3.41,1.16,15.12,72.5,8.62,15.56,9.3,15.07,22.46,6302,51.54
8,AC,193309.txt,"['es', 'ca']",80.93,8.08,0.58,1.65,0.68,8.07,76.23,10.94,23.64,14.01,13.85,19.3,9501,64.59
9,AC,193310.txt,"['es', 'ca']",77.85,9.89,1.24,2.11,0.22,8.7,75.1,14.08,27.18,16.48,11.11,20.14,8333,62.32


In [99]:
results_bloc1 = results_bloc1.set_index(["revista", "publicacio"])
results_bloc1_ac = results_bloc1_ac.set_index(["revista", "publicacio"])

results_bloc1.update(results_bloc1_ac)
results_bloc1 = results_bloc1.reset_index()

In [103]:
results_bloc1[results_bloc1["revista"]=="AC"]

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
9,AC,193101.txt,"['es', 'ca']",76.41,12.11,1.05,1.51,1.22,7.7,77.03,8.99,10.91,11.39,25.0,26.05,5235,62.54
10,AC,193102.txt,"['es', 'ca']",71.58,15.65,0.98,3.45,0.48,7.85,69.12,10.97,7.94,5.88,9.68,20.28,6407,53.11
11,AC,193103.txt,"['es', 'ca']",60.88,20.93,1.01,3.88,1.34,11.96,67.78,7.38,11.67,9.96,10.0,14.75,5954,45.21
12,AC,193104.txt,"['es', 'ca']",67.84,17.31,1.48,3.82,1.39,8.17,64.94,11.97,15.38,12.82,12.94,17.17,6131,48.43
13,AC,193205.txt,"['es', 'ca', 'it']",43.84,36.4,0.88,5.01,1.17,12.7,63.64,5.83,11.43,8.82,11.83,11.93,7924,32.22
14,AC,193206.txt,"['es', 'ca']",69.77,15.19,1.14,3.41,1.06,9.42,68.82,6.92,9.78,12.68,15.12,23.75,8089,52.01
15,AC,193207.txt,"['es', 'ca']",74.85,11.76,0.88,4.1,0.97,7.45,72.22,12.74,20.55,10.91,12.5,23.5,8277,58.05
16,AC,193208.txt,"['es', 'ca']",63.6,15.28,1.43,3.41,1.16,15.12,72.5,8.62,15.56,9.3,15.07,22.46,6302,51.54
17,AC,193309.txt,"['es', 'ca']",80.93,8.08,0.58,1.65,0.68,8.07,76.23,10.94,23.64,14.01,13.85,19.3,9501,64.59
18,AC,193310.txt,"['es', 'ca']",77.85,9.89,1.24,2.11,0.22,8.7,75.1,14.08,27.18,16.48,11.11,20.14,8333,62.32


In [104]:
#results_bloc1.to_csv(f"../data/processed/resultats/results_bloc1.csv", index=False)

### Bloc 2

In [99]:
path_publicacions = "../../Dades_revistes_2"
results = quality_check(path_publicacions)


01 La Publicitat 3944.txt -> 23.52 %

02 La Veu de Catalunya 19130306-25232-11.txt -> 33.3 %

03 Mirador 16.txt -> 41.69 %

04 Gaseta de les Arts 17.txt -> 56.43 %

05 Daci dalla 19180501-5640.txt -> 44.98 %

06 Esquella torratxa 19080515-33442.txt -> 33.46 %

07 Cu cut 19060510-4181.txt -> 37.08 %

08 Papitu 19150310-4462.txt -> 38.34 %

09 Borinot 209.txt -> 70.39 %

10 Vell i nou 447.txt -> 49.76 %

11 Arts i bells oficis 63.txt -> 54.77 %

12 Revista nova 65.txt -> 60.37 %

13 Nova Revista 19270301-2855.txt -> 68.42 %

14 Revista 299.txt -> 62.11 %

15 Picarol 15.txt -> 57.72 %

16 Joventut 19030409-3861.txt -> 38.7 %

17 Hispania 137.txt -> 68.78 %

18 Art publicacio 27.txt -> 55.05 %

19 Art revista 19330101-74.txt -> 36.94 %


In [100]:
results

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,01 La Publicitat,3944.txt,[ca],2.63,85.95,1.67,3.29,1.02,5.44,30.12,25.26,9.46,4.08,9.09,11.55,33563,23.52
1,02 La Veu de Catalunya,19130306-25232-11.txt,[ca],2.22,93.67,0.95,1.05,0.26,1.86,33.93,34.11,7.92,10.94,12.31,20.68,25261,33.3
2,03 Mirador,16.txt,[ca],1.66,91.27,2.45,0.79,0.19,3.64,35.71,44.03,13.53,5.97,37.5,12.7,8441,41.69
3,04 Gaseta de les Arts,17.txt,[ca],2.15,92.69,1.33,0.78,0.31,2.74,41.82,58.77,26.47,10.0,25.0,20.0,2559,56.43
4,05 Daci dalla,19180501-5640.txt,[ca],2.6,92.38,1.15,1.33,0.33,2.21,42.93,46.78,10.5,5.74,15.38,18.1,15757,44.98
5,06 Esquella torratxa,19080515-33442.txt,"[es, ca]",12.43,82.26,1.2,1.18,0.28,2.64,36.44,34.33,14.29,5.45,0.0,17.07,4657,33.46
6,07 Cu cut,19060510-4181.txt,"[es, ca]",6.6,83.28,1.3,3.25,0.36,5.2,27.95,40.87,8.0,3.2,28.57,17.0,3846,37.08
7,08 Papitu,19150310-4462.txt,"[es, ca]",6.23,85.72,1.65,1.74,0.74,3.91,46.63,39.56,17.65,9.26,17.39,23.97,3096,38.34
8,09 Borinot,209.txt,[ca],0.68,94.0,0.8,1.45,0.68,2.38,72.73,73.3,26.92,8.51,4.55,25.97,3232,70.39
9,10 Vell i nou,447.txt,[ca],1.98,87.76,4.0,1.58,1.55,3.13,49.28,53.55,14.39,10.91,22.22,22.02,3479,49.76


In [36]:
## revistes del bloc 2 proporcionades de la BNC el 07/07
path_publicacions = "../../Dades_revistes_2/Publicacions_BC"
results = quality_check(path_publicacions)


Art junta de museus 1933v101.pdf.txt -> 70.91 %
Art junta de museus 1933v102.pdf.txt -> 65.64 %
Art junta de museus 1933v103.pdf.txt -> 72.11 %
Art junta de museus 1934v104.pdf.txt -> 70.37 %
Art junta de museus 1934v105.pdf.txt -> 72.26 %
Art junta de museus 1934v106.pdf.txt -> 72.99 %
Art junta de museus 1934v107.pdf.txt -> 69.82 %
Art junta de museus 1934v108.pdf.txt -> 68.94 %
Art junta de museus 1934v109.pdf.txt -> 74.58 %
Art junta de museus 1934v110.pdf.txt -> 65.35 %
Art junta de museus 1934v201.pdf.txt -> 56.84 %
Art junta de museus 1934v202.pdf.txt -> 51.86 %
Art junta de museus 1934v203.pdf.txt -> 53.92 %
Art junta de museus 1935v204.pdf.txt -> 60.9 %
Art junta de museus 1935v205.pdf.txt -> 62.4 %
Art junta de museus 1935v206.pdf.txt -> 66.99 %
Art junta de museus 1935v207.pdf.txt -> 65.05 %
Art junta de museus 1935v208.pdf.txt -> 58.69 %
Art junta de museus 1935v209.pdf.txt -> 51.05 %
Art junta de museus 1935v210.pdf.txt -> 61.26 %
Art junta de museus 1936v301.pdf.txt -> 6

In [32]:
## revistes del bloc 2 proporcionades de la BNC el 20/07
path_publicacions = "../../Dades_revistes/publicacions_bloc2"
revistes = ['Revista', 'Revista nova']
quality_check(path_publicacions, revistes)

Revista 19150501.pdf.txt -> 64.84 %
Revista 19150502.pdf.txt -> 62.05 %
Revista 19150703.pdf.txt -> 65.5 %
Revista 19150804.pdf.txt -> 61.74 %
Revista 19150905.pdf.txt -> 62.57 %
Revista 19151206.pdf.txt -> 62.38 %
Revista 19160107.pdf.txt -> 59.6 %
Revista 19160108.pdf.txt -> 59.72 %
Revista 19160209.pdf.txt -> 60.53 %
Revista 19160210.pdf.txt -> 60.5 %
Revista 19160311.pdf.txt -> 58.96 %
Revista 19160312.pdf.txt -> 64.51 %
Revista 19160313.pdf.txt -> 62.5 %
Revista 19160414.pdf.txt -> 61.64 %
Revista 19160515.pdf.txt -> 61.2 %
Revista 19160516.pdf.txt -> 61.97 %
Revista 19160617.pdf.txt -> 61.66 %
Revista 19160618.pdf.txt -> 59.94 %
Revista 19160719.pdf.txt -> 62.12 %
Revista 19160720.pdf.txt -> 63.27 %
Revista 19160821.pdf.txt -> 62.51 %
Revista 19160822.pdf.txt -> 63.7 %
Revista 19160923.pdf.txt -> 58.88 %
Revista 19160924.pdf.txt -> 60.05 %
Revista 19161025.pdf.txt -> 62.08 %
Revista 19161026.pdf.txt -> 59.42 %
Revista 19161127.pdf.txt -> 61.17 %
Revista 19161128.pdf.txt -> 60.94 

Revista nova 19140716.pdf.txt -> 55.18 %
Revista nova 19140717.pdf.txt -> 51.26 %
Revista nova 19140818.pdf.txt -> 53.77 %
Revista nova 19140819.pdf.txt -> 52.95 %
Revista nova 19140820.pdf.txt -> 55.35 %
Revista nova 19140821.pdf.txt -> 55.14 %
Revista nova 19140922.pdf.txt -> 55.79 %
Revista nova 19140923.pdf.txt -> 50.97 %
Revista nova 19140924.pdf.txt -> 54.77 %
Revista nova 19140925.pdf.txt -> 56.59 %
Revista nova 19141026.pdf.txt -> 55.1 %
Revista nova 19141027.pdf.txt -> 52.33 %
Revista nova 19141028.pdf.txt -> 53.37 %
Revista nova 19141029.pdf.txt -> 55.36 %
Revista nova 19141030.pdf.txt -> 52.67 %
Revista nova 19141131.pdf.txt -> 44.06 %
Revista nova 19160532.pdf.txt -> 61.48 %
Revista nova 19160533.pdf.txt -> 61.42 %
Revista nova 19160634.pdf.txt -> 60.33 %
Revista nova 19160635.pdf.txt -> 58.43 %
Revista nova 19160736.pdf.txt -> 60.48 %
Revista nova 19160737.pdf.txt -> 56.89 %
Revista nova 19160838.pdf.txt -> 59.39 %
Revista nova 19160839.pdf.txt -> 58.26 %
Revista nova 1916

In [72]:
results = pd.read_csv("../data/processed/resultats/results_Publicacions_BC.csv")

In [75]:
results_no_index = results[~results["publicacio"].str.contains("index")]
results_index = results[results["publicacio"].str.contains("index")]

In [76]:
results_index

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
168,Vell i nou,1915index.pdf.txt,"['es', 'ca', 'it']",11.21,31.03,4.08,8.02,0.65,45.02,54.67,63.88,20.73,10.56,53.85,31.42,2008,42.13
193,Vell i nou,1916index.pdf.txt,"['es', 'ca', 'fr']",9.05,42.27,7.1,4.09,1.18,36.31,51.48,57.9,15.05,9.35,38.71,15.25,2619,36.58
213,Vell i nou,1917index.pdf.txt,"['es', 'ca', 'fr']",10.95,46.78,7.44,5.0,1.12,28.71,54.33,61.18,17.76,14.37,35.9,14.81,3480,41.26
238,Vell i nou,1918index.pdf.txt,"['es', 'ca', 'it']",11.88,38.06,3.34,8.37,1.34,37.02,32.48,28.02,5.19,4.15,9.68,8.43,2307,18.29
261,Vell i nou,1919index.pdf.txt,"['es', 'ca', 'it']",9.47,44.05,2.75,8.47,0.99,34.27,52.42,50.09,13.89,9.01,23.08,9.35,1310,31.6
271,Vell i nou,1920index.pdf.txt,"['es', 'ca', 'it', 'fr']",22.76,22.29,8.79,6.77,0.85,38.54,73.19,60.28,39.57,20.56,29.63,11.89,3163,39.8


In [56]:
def q25(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` is any object exposing a pandas-style ``quantile`` method,
    e.g. the Series handed over by ``groupby(...).agg``.
    """
    first_quartile = x.quantile(q=0.25)
    return first_quartile
def q50(x):
    """Return the 0.5 quantile (median) of ``x``.

    ``x`` is any object exposing a pandas-style ``quantile`` method,
    e.g. the Series handed over by ``groupby(...).agg``.
    """
    median = x.quantile(q=0.5)
    return median
def q75(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` is any object exposing a pandas-style ``quantile`` method,
    e.g. the Series handed over by ``groupby(...).agg``.
    """
    third_quartile = x.quantile(q=0.75)
    return third_quartile

In [74]:
# Distribution of `total_existing` per magazine, index files excluded.
# The built-in reductions are requested by name ("min", "max", "mean") rather
# than as np.min/np.max/np.mean: recent pandas deprecates passing those numpy
# callables to agg, and the resulting column labels ("amin"/"amax" vs
# "min"/"max") otherwise depend on the installed numpy version.
(
    results_no_index[["revista", "total_existing"]]
    .groupby(["revista"])
    .agg(["min", q25, q50, q75, "max", "mean"])
)

Unnamed: 0_level_0,total_existing,total_existing,total_existing,total_existing,total_existing,total_existing
Unnamed: 0_level_1,amin,q25,q50,q75,amax,mean
revista,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Art junta de museus,51.05,60.9,65.64,70.37,74.58,64.737143
Gaseta de les Arts,45.81,56.0975,57.805,59.8475,68.1,58.11837
Nova revista,36.76,58.845,63.09,67.6425,73.24,61.71625
Vell i nou,50.39,55.4725,57.05,59.4775,70.43,57.715308


### Comparació BNC i Adobe

In [101]:
# Run the quality check over the folder holding the BNC vs. Adobe OCR text files.
# NOTE: this rebinds `results`, replacing the table loaded from CSV earlier on.
path_publicacions = "../../comparacio_bnc_adobe"
results = quality_check(path_publicacions)


esportcat 19250404_Adobe.txt -> 61.9 %
esportcat 19250404_BNC.txt -> 71.37 %

monitor 19210404_Adobe.txt -> 58.17 %
monitor 19210404_BNC.txt -> 69.73 %


In [102]:
results

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,esportcat,19250404_Adobe.txt,[ca],0.82,95.69,2.24,0.36,0.21,0.68,36.54,63.12,40.85,8.7,38.46,25.58,6338,61.9
1,esportcat,19250404_BNC.txt,"[es, ca]",5.45,88.46,2.61,0.73,0.88,1.88,60.34,75.11,21.18,8.42,21.74,44.08,13051,71.37
2,monitor,19210404_Adobe.txt,"[ca, fr]",0.79,86.0,11.89,0.55,0.21,0.56,38.1,60.7,45.48,6.82,23.53,31.11,8009,58.17
3,monitor,19210404_BNC.txt,"[es, ca, fr, en]",12.11,68.13,6.39,1.48,10.75,1.13,71.41,73.76,56.02,12.12,62.19,33.9,15595,69.73


### Comparació BNC i Tesseract


In [28]:
# Run the quality check over the folder holding the BNC vs. Tesseract OCR text files.
# NOTE: `path_publicacions` is rebound here; the result is kept under its own name.
path_publicacions = "../../tesseract"
results_tesseract = quality_check(path_publicacions)


esportscat 19205117_bnc.txt -> 64.67 %
esportscat 19205117_tesseract.txt -> 54.1 %

matrencada 19241101_bnc.txt -> 34.48 %
matrencada 19241101_tesseract.txt -> 52.01 %
matrencada 19241102_bnc.txt -> 24.29 %
matrencada 19241102_tesseract.txt -> 49.92 %
matrencada 19241203_bnc.txt -> 24.49 %
matrencada 19241203_tesseract.txt -> 52.12 %
matrencada 19241204_bnc.txt -> 35.22 %
matrencada 19241204_tesseract.txt -> 52.72 %
matrencada 19250105_bnc.txt -> 20.67 %
matrencada 19250105_tesseract.txt -> 47.85 %
matrencada 19250106_bnc.txt -> 26.32 %
matrencada 19250106_tesseract.txt -> 50.76 %


## Results exploration

In [115]:
# Load the nested word lists (indexed below by magazine, issue and language).
# A context manager closes the file handle as soon as parsing is done instead
# of leaving it open until garbage collection.
with open("resultats/all_words_Publicacions.json", "r") as words_file:
    all_words = json.load(words_file)

In [116]:
# "ca" entries of AC issue 193101, split into existing / non-existing words.
ac_193101_ca = all_words["AC"]["193101"]["ca"]
exist, non_exist = ac_193101_ca["existing"], ac_193101_ca["non_existing"]

In [117]:
print(exist)

['pintura', 'llar', 'rambla', 'sauret', 'casa', 'generals', 'fusta', 'barcelona', 'decorador', 'carrer', 'vi', 'societat', 'limitada', 'rambla', 'catalunya', 'balmes', 'os', 'barcelona', 'caso', 'casa', 'casa', 'planta', 'san', 'con', 'toda', 'hotel', 'individual', 'oms', 'gracia', 'torres', 'internacional', 'calle', 'siemens', 'para', 'uralita', 'rigol', 'viuda', 'acero', 'arquitectura', 'barcelona', 'taller', 'pintura', 'para', 'calle', 'carrer', 'construeixen', 'actualment', 'models', 'adequades', 'cases', 'mes', 'refinada', 'las', 'oficina', 'olot', 'barcelona']


In [118]:
print(non_exist)

['y', 'r', 'q', 'u', 'tectura', '*', 'u', 'r', 'b', 'n', 'z', 'c', 'ón', 'ceuta', 't', 'p', 'c', 's', 'c', 'l', 'n', 't', 'd', 'r', 's', 'c', 'u', 'r', 't', 's', 'b', 'ño', 'cataluña', 'núm', '32', 'j', 'm', 'm', 'u', 'b', 'l', 's', 't', '/', 'f', 'u', 'n', 'd', 'd', '1', '8', '54', 'teléf', '72957', 'f', 'perales', 'g', 'u', 'x', 'r', 'valencia', '362', 'b', 'r', 'c', 'l', 'n', 'iaró', 's', 'c', 'd', 'd', 'n', 'ó', 'n', '129', '•', 'teléfon', '73536', 'b', 'r', 'c', 'l', 'n', 'n', "'", '9', 'a^', 'd', 's', 'p', 'c', 'h', ':', 'b', 'r', 'c', 'l', 'n', 'v', 'd', 'r', 'x', 'p', 's', 'c', 'ó', 'n', 'r', 'q', 'u', 't', 'c', 't', 'u', 'r', 'y', 'catalupía', 'v', 'l', 'r', 'ó', 'b', 'r', 'c', 'l', 'n', 'p', 'l', 'n', 't', 'v', 'l', 'r', 'ó', 'b', 'r', 'c', 'l', 'n', 'terrazo', 'v', 'l', 'r', 'ó', 'b', 'r', 'c', 'l', 'n', 'sección', 'x', 'p', 's', 'c', 'ó', 'n', 'r', 'q', 'u', 't', 'c', 't', 'u', 'r', 'y', 'escuelas', 'b', 'r', 'r', 'fachada', 'p', 'r', 'n', 'c', 'p', 'l', 'l', '^', 'elementa

In [119]:
# "es" entries of AC issue 193101, split into existing / non-existing words.
ac_193101_es = all_words["AC"]["193101"]["es"]
exist, non_exist = ac_193101_es["existing"], ac_193101_es["non_existing"]

In [120]:
print(exist)

['documentos', 'actividad', 'contemporánea', 'publicación', 'exposición', 'puertas', 'barcelona', 'san', 'pol', 'mar', 'barcelona', 'habitaciones', 'ciudad', 'verde', 'moscú', 'bibliografía', 'requiere', 'instalacion', 'eléctrica', 'axioma', 'edificio', 'inversión', 'inversión', 'atiende', 'obtener', 'máximo', 'rendimiento', 'esencial', 'adelantos', 'perfeccionamientos', 'contribuir', 'rodearla', 'garantías', 'ponemos', 'disposición', 'darle', 'compromiso', 'detalles', 'completos', 'presupuestos', 'instalaciones', 'eléctricas', 'empotradas', 'fuerza', 'ventilación', 'calefacción', 'iluminación', 'refrigeración', 'electricidad', 'cortes', 'barcelona', 'pelayo', 'casa', 'moderna', 'tos', 'presupuestos', 'instrucciones', 'calle', 'baños', 'núm', 'teléfono', 'núm', 'barcelona', 'muebles', 'decoración', 'alfombras', 'telas', 'jouy', 'cortinajes', 'pasamanería', 'bañeras', 'lavabos', 'casa', 'teléfono', 'barcelona', 'barcelona', 'consejo', 'ciento', 'tel', 'cercados', 'metálicos', 'variedade

In [121]:
print(non_exist)

['g', 't', 'p', 'c', '*', 'modernasde', '*', '"standard"', '*', '*', '*', 'fotografía-cine', '*', '*', 'ediricacion', '/aodedria', '"cae', 'peso"', 'v', 'l', 'r', 'i', 'z', 'r', 'l', 'aluinbrado', '»', '»', '»', 'anglo-españoia', 's', '525', '*', '•', '12', 'proyec', 'estudio:', '5', '-', '20128', '-', 's', 't', 'r', 's', '•', 'central:', '1747ó', '7pelayo7', '329', '-', '14657-', 'barcelon', 'barcelona:', '58', 'madrid:', '4', 'construccions', 'tordera', '40', 'b', 'nzes', '•', 'lampares', 'gamande', '15', 'hotel-restauront', 'rmoles', 'blas!', 'despacho:', '154', '-', '52880', 'fábrica:', '553', '-teléfono', '51032', 'b', 'r', 'c', 'l', 'n', 'cielo-rasos', 'parcerisas', 'c"', ':', ':', '88', ':', '72573', 'gavá', 'despacho:', '18-teléfono', '54271', "bo'io", '\\a', 'cupresus', 'coniferos', 'borduras', ':', 'lllescos', '"', 'g', 't', 'p', 'c', '-', '-', 'redac', 'adm:', 'pl', 'margau', '99', '-', '1"', 'suscripción:', '10', 'región;', '¡guales', 'adoptarlos', '—', '—', 'elementos-tipo

### Tesseract

In [30]:
# Load the word lists produced for the BNC vs. Tesseract comparison.
# A context manager closes the file handle as soon as parsing is done instead
# of leaving it open until garbage collection.
with open("resultats/all_words_tesseract.json", "r") as words_file:
    all_words = json.load(words_file)

In [32]:
# "ca" word lists of matrencada issue 19241101 for both OCR sources.
matrencada_words = all_words["matrencada"]
bnc_ca = matrencada_words["19241101_bnc"]["ca"]
tesseract_ca = matrencada_words["19241101_tesseract"]["ca"]

exist_bnc, non_exist_bnc = bnc_ca["existing"], bnc_ca["non_existing"]
exist_tesseract, non_exist_tesseract = tesseract_ca["existing"], tesseract_ca["non_existing"]

In [36]:
print(exist_bnc)

['ada', 'arts', 'joan', 'ampit', 'manera', 'forma', 'intenció', "se'n", 'mes', 'forma', 'duresa', 'escultura', 'escultura', 'forma', 'trobar', 'insuperable', 'perícies', 'anomenar', 'romàntica', 'paraula', 'arribat', 'moments', 'suggestió', 'emoció', 'tingut', 'mai', 'escultura', 'grega', 'viu', 'avui', 'dia', 'escultor', 'nat', 'escultor', 'obra', 'tres', 'escultor', 'conscientment', 'inconscientment', 'segueix', 'orientació', 'varen', 'ela', 'bona', 'època', 'forma', 'escultura', 'forma', 'bella', 'necessita', 'expressió', 'anima', 'forma', 'grega', 'cercat', 'tipus', 'animal', 'sentit', 'racial', 'mi', 'sembla', 'perfecte', 'sabut', 'traduir', 'pedra', 'tremolor', 'dignitat', 'forma', 'externa', 'prescindint', 'intenció', 'sentit', 'gran', 'clàssic', 'millor', 'elogi', 'tic', 'ca', 'lo', 'gràcia', 'poeta', 'nombre', 'home', 'ell', 'lema', 'camí', 'mes', 'burgès', 'burgesia', 'catalana', 'eu', 'nova', 'feia', 'escrivia', 'poesia', 'immediata', 'ell', 'esverava', 'poble', 'gosadies', 

In [35]:
print(exist_tesseract)

['mà', 'trencam', 'enric', 'segona', 'hagut', 'du', 'menes', 'escultura', 'escultura', 'intenció', 'forma', 'escultura', 'intenció', "se'n", 'enllà', 'forma', 'duresa', 'escultura', 'forma', 'trobar', 'solució', 'perfecta', 'gracia', 'escultura', 'anomenar', 'romàntica', 'paraula', 'arribat', 'moments', 'culminant', 'tingut', 'mai', 'grega', 'via', 'avui', 'dia', 'nat', 'obra', 'gràcia', 'escultor', 'segueix', 'orientació', 'escultòric', 'varen', 'grecs', 'bona', 'època', 'forma', 'escultura', 'forma', 'bel', 'quantitat', 'emotiva', 'expressió', 'anima', 'forma', 'exalta', 'vellesa', 'intensitat', 'mi', 'gràcia', 'ànima', 'cercat', 'bellesa', 'animal', 'sentit', 'racial', 'sembla', 'perfecte', 'sabut', 'pedra', 'tremolor', 'dignitat', 'externa', 'prescindint', 'intenció', 'sentit', 'ela', 'mort', 'joan', 'cloent', 'obra', 'just', 'obra', 'iniciada', 'ai', 'to', 'admirable', 'gràcia', 'um', 'cert', 'nombre', 'paradoxes', 'ell', 'home', 'ell', 'patit', 'moviments', 'tenia', 'camí', 'mes'

In [37]:
print(non_exist_bnc)

['\\l\\', 'tkk\\', 'oooç', 'slsgoc^jl', 'rtvista', 'quinzbnai', 'oe', 'tdtes', 'mielom', 'mtrll', 'barcelma', 'ti', 'dr', 'nowmbrr', '1924', 'anyi', '•', 'númtro', '•aohgoh', 'u', 'jr', 'vcur·', '<|umií^', 'h^gut', 'dugun', 'mcnc«', 'enullura:', 'cutuilura', 'inlcn', 'ia', '>|uc', "l'atura", 'cicullura', 'cniu', 'aifuesta', 'éi', 'alemmvs', 'anumcn-n', 'espressionista', "s'atura", '«olució', 'perfecti', 'í', 'u', 'gr·cia', "l'altra", 'e»cullura', 'é', '[hidriem', 'critlina', 'no-grega', 'eulminanli', 'plenitul', "l'equilibri", 'kntre', 'gricia', 'gràcies;', "l'knric", 'casanovas', "l'únií", 'uquella', '«', "l'ari", 'rscullòric', 'tl»nar-li', 'grr', '»', '<lr', "l'inlcnció", 'dc', 're«cultura', 'caunovai', "t'alura", 'c«', 'anti-eiprcssionisla', 'c«', 'i|ue', 'tc', '|>er', 'mateita', 'lanta', 'quanlilal', 'emuliva', "l'esperit", "l'escultura", "l'escultura", 'casanovas', "s'exalta", 'l·ellesa', "l'animal", "l'intensilat", 'grücia', "l'àninia", 'kn', 'casanovas', 'belles*', 'í', 'purissi

In [38]:
print(non_exist_tesseract)

['—', 'amyl—', 'núm', 't', 'casanovas', 'mn:', 'méva', 'madera', 'vere', 'uentió', "v'atura", 'l', 'x', 'anomenzn', 'expressiota', 'escallura', "s'atura", "i'insuperable", 'pericles', "l'altra", 'eristina', 'sup', 'plenitut', "l'equilibri", 'esculdura', 'esullor', 'casanovas', "l'ànic", 'conselentment', 'ultor', 'àcies:', "l'enric", 'incomcientment', "l'art", 'donar-li', '8', "l'escultura", "l'ialenci", 'casanovas', "s'atura", 'antiexpressioniata:', 'neremita', "l'esperit", "l'escultura", 'gre', "l'escultura", 'carmovas', "l'animal", 'casimovàs', 'ba', 'tipús', 'puríssim', 'miem', 'tradurr', 'lorma', "l'expressió", 'xò', 'inillor', 'salvat-papasseit', 'gnida', 'iniciado', 'ide', 'sinemit', 'plavia', 'daser', 'suburbi:', 'interringat', 'proletaris:', "l'ànima", 'foa', 'atistociàcia', 'unava', 'metodes', 'plicats', 'pocsi:', 'caligrases', 'erivia', 'soviat', 'extravagantsi', 'pocsia', 'aenailla', 'esserava', 'foament', 'papasseit', 'promema', 'salvat-papasseit', 'calligrames', 'actitut',

In [39]:
# "ca" word lists of esportscat issue 19205117 for both OCR sources.
esportscat_words = all_words["esportscat"]
bnc_ca = esportscat_words["19205117_bnc"]["ca"]
tesseract_ca = esportscat_words["19205117_tesseract"]["ca"]

exist_bnc, non_exist_bnc = bnc_ca["existing"], bnc_ca["non_existing"]
exist_tesseract, non_exist_tesseract = tesseract_ca["existing"], tesseract_ca["non_existing"]

In [40]:
print(exist_bnc)

['carrer', 'bàrbara', 'plaça', 'catalunya', 'telèfons', 'pesseta', 'mes', 'barcelona', 'maig', 'número', 'cis', 'selecció', 'partit', 'començat', 'gran', 'tren', 'decaure', 'autor', 'ires', 'perelló', 'varen', 'gran', 'exhibició', 'ces', 'hom', 'posat', 'destarotament', 'presidi', 'aigua', 'freda', 'catalana', 'constituí', 'centre', 'dubtem', 'roig', 'bon', 'barcelona', 'campionat', 'peninsular', 'jugador', 'vist', 'jugar', 'vegades', 'partit', 'ahir', 'seleccions', 'catalunya', 'creiem', 'xic', 'verd', 'acabà', 'interessar', 'encontres', 'públic', 'diem', 'aplaudint', 'iniciativa', 'circumstàncies', 'hauria', 'taren', 'seleccionadors', 'designat', 'portat', 'camp', 'corts', 'gentada', 'ocupar', 'lloc', 'bona', 'lògica', 'grans', 'diades', 'obstant', 'ahir', 'camp', 'ponia', 'entendre', 'barcelona', 'registrà', 'bon', 'tros', 'ple', 'bonic', 'perelló', 'organitzadors', 'vet', 'dos', 'homes', 'foren', 'via', 'certa', 'inquietud', 'por', 'públic', 'millors', 'damunt', 'ac', 'nu', 'prendr

In [41]:
print(exist_tesseract)

['plaça', 'catalunya', 'pessetes', 'mes', 'carrer', 'barbarà', 'pesseta', 'mes', 'número', 'aigua', 'freda', 'catalana', 'constituí', 'campionat', 'partit', 'ahir', 'seleccions', 'acabà', 'interessar', 'públic', 'diem', 'circumstàncies', 'portat', 'camp', 'corts', 'gentada', 'grans', 'diades', 'ahir', 'camp', 'barcelona', 'bon', 'ple', 'bonic', 'organitzadors', 'volgués', 'prendre', 'represàlies', 'juga', 'dors', 'castellans', 'gent', 'ocasió', 'recent', 'partit', 'partit', 'començat', 'gran', 'tren', 'decaure', 'autor', 'tres', 'llorenç', 'perelló', 'varen', 'ces', 'hom', 'posat', 'destarotament', 'presidí', 'jugador', 'vist', 'jugar', 'vegades', 'creiem', 'xic', 'verd', 'aplaudint', 'iniciativa', 'precipitaren', 'designat', 'bona', 'lògica', 'corresponia', 'vet', 'dos', 'foren', 'damunt', 'forniren', 'actuació', 'realment', 'bona', 'diminut', 'porter', 'meravellà', 'degut', 'se', 'jugades', 'roig', 'bon', "se'n", 'dia', 'marquessin', 'gols', 'dia', 'setmana', 'tis', 'temences', 'pass

In [42]:
print(non_exist_bnc)

['11113', 'administració:', '3', 'fr', '1316', '11451', 'subscripció:', 'adreça:', 'telegràfica:', 'v/bepna', 'anv', 'hi—núm', '117', '16', '1927', 'solt:', '15', 'catalunyabat', 'castella', '3', 'prés', 'molt—sastre', 'gols—vambé', 'xlorenç', '>', "l'actuació", "l'equip", "català'", "fou'dè-", 'galledada', 'guda', 'fàlla', 'absolut^', 'dél&éumig', "l'afició", "l'eliminació", '—', "l'hem", '•—•', 'castella', 'alçada;', 'dir:', 'perauè', 'potsen', 'precipi', 'raatch', 'haver-lo', 'corres-', 'pelaó', 'lan', 'llorens', "l'encontre", 'ha-', 'match', 'volgjés', 'juga-', 'terreny;', 'tuació', 'manora', 'blau-gràna', 'nieravellà', "l'equip", 'barce', 'un-', 'notabilíssimes', 'be-', 'letíi-quals;', 'moraleda', '-segon', "l'acrediten", 'lutament', 'eategriria', 'pti', 'ulic', 'ovacionar-lo', '-els', 'equipiers', 'una-', 'ova', "elogiable11'", "l'actitud", 'llorens', "prendre's", 'venjanga', 'fà', "''èxit", 'cacil', 'aqaest', 'niatch', 'coiivencer-m>s', 'collocar-se', "a'", "l'en-perelló", "t's"

In [44]:
print(non_exist_tesseract)

['aúminisiració:', '31£', 'estramnger', '150', 'redacció:', '111', '13', ':', 'subscripció:', 'solt:', '15cts', 'galledada', 'pe:', "l'afició", "l'eliminació", 'del:', 'barcelona:', 'peninsular:', 'catàlunya', 'castella', 'en:', 'mateh', 'aquest:', 'hàuria', 'iobstant', 'règistrà', 'trò8', "l'encontre", 'mànera', 'dè', 'madrid:', 'comportaren:-amb', "l'equip", 'barcejona', 'ambel', 'beú', 'prés', 'molí—sastre', 'gols—', 'cambé', 'v', 'bi', 'havia:', "l'actuació", "l'equip", '"català', 'fóu', 'guda', 'hàitila', 'absòtita', 'delsse', '—', "l'hem", '—', "però'", 'éneontres', 'alçada:', 'diri', 'potsen', 'seleceionadors', 'haver-lo', 'ceupar:', 'hoc', 'llorens', '1', 'perello', '—', 'bomès', 'mateh', 'èls', 'inillors', 'terrènyj', 'una:', ':', 'ò', '"plau:', '-gri', 'sunnl', 'dotabilicatmés', 'v\'equip"', '-catalunyà', 'cació', '99', 'comencarà', 'an', 'lés', 'no:', 'abgo:', 'iutàment', 'iel-', 'barcelóna', 'úna', 'nova:', 'la:', 'els:', 'equipieis:', 'una:', 'vai', 'slogiabies', '8', 'tud

In [None]:
# % existing words

In [7]:
results = pd.read_csv("resultats/results_Publicacions.csv")

In [8]:
results

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,391,19170202.txt,['fr'],0.0,3.57,84.96,0.11,0.54,10.82,,27.27,70.57,0.0,40.0,10.0,924,62.23
1,391,19170303.txt,['fr'],0.3,0.0,94.14,0.3,0.66,4.6,33.33,,69.19,0.0,38.46,5.49,1979,65.74
2,391,19170605.txt,"['fr', 'en']",0.9,1.55,86.14,0.8,6.31,4.3,66.67,19.35,71.7,12.5,72.22,20.93,1998,68.22
3,391,19170706.txt,['fr'],0.0,1.16,70.35,2.91,4.36,21.22,,25.0,73.97,0.0,26.67,2.74,344,54.07
4,391,19200312.txt,"['ca', 'it', 'fr']",0.0,28.94,36.37,6.03,2.65,26.02,,3.47,6.14,0.0,10.19,4.43,4078,4.66
5,391,19200713.txt,['fr'],0.63,3.94,84.6,2.51,0.54,7.79,42.86,11.36,56.72,7.14,0.0,5.75,1117,49.33
6,391,19201114.txt,"['ca', 'fr']",0.58,5.28,80.71,0.66,2.39,10.39,28.57,14.06,55.06,0.0,48.28,8.73,1213,47.4
7,391,19240516.txt,"['ca', 'fr']",0.0,6.44,43.14,0.0,3.36,47.06,,8.7,59.09,,25.0,2.98,357,28.29
8,391,19241019.txt,['fr'],2.04,3.07,75.47,0.0,0.51,18.91,33.33,5.56,46.95,,0.0,1.8,587,36.63
9,AC,193101.txt,"['es', 'ca']",76.67,12.03,1.15,1.74,1.01,7.41,76.82,8.89,11.67,10.99,30.19,25.77,5239,62.51


In [18]:
# Rows belonging to the "matrencada" magazine only.
results.loc[results["revista"].eq("matrencada")]

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
358,matrencada,19241101.txt,['ca'],1.93,84.19,4.09,4.56,0.62,4.61,53.0,37.67,13.21,6.78,18.75,18.41,5181,34.55
359,matrencada,19241102.txt,['ca'],2.56,84.2,2.96,4.19,0.45,5.65,33.33,26.26,11.71,6.55,19.35,13.55,6925,24.43
360,matrencada,19241203.txt,"['ca', 'it']",1.98,82.09,2.48,5.71,0.6,7.13,36.44,26.76,14.86,4.12,5.56,13.41,5958,24.29
361,matrencada,19241204.txt,"['ca', 'it']",2.54,82.29,4.6,5.48,0.27,4.82,34.35,39.45,11.81,4.96,21.43,14.52,5150,34.91
362,matrencada,19250105.txt,"['ca', 'it']",1.74,79.78,2.06,6.4,0.55,9.48,28.95,23.38,9.63,4.76,5.56,11.09,6563,20.74
363,matrencada,19250106.txt,"['ca', 'it']",1.87,80.56,3.08,6.62,0.74,7.13,40.71,29.14,11.29,5.75,15.56,17.87,6044,26.36


In [29]:
# Publications with total_existing below 60, sorted by magazine and then by score.
below_60 = results["total_existing"] < 60
results.loc[below_60, ["revista", "publicacio", "total_existing"]].sort_values(["revista", "total_existing"])

Unnamed: 0,revista,publicacio,total_existing
4,391,19200312.txt,4.66
7,391,19240516.txt,28.29
8,391,19241019.txt,36.63
6,391,19201114.txt,47.4
5,391,19200713.txt,49.33
3,391,19170706.txt,54.07
16,AC,193208.txt,51.55
15,AC,193207.txt,58.13
50,amicarts,19271221.txt,45.92
46,amicarts,19270817.txt,53.18


In [33]:
# Publications with total_existing below 50, sorted by magazine and then by score.
below_50 = results["total_existing"] < 50
results.loc[below_50, ["revista", "publicacio", "total_existing"]].sort_values(["revista", "total_existing"])

Unnamed: 0,revista,publicacio,total_existing
4,391,19200312.txt,4.66
7,391,19240516.txt,28.29
8,391,19241019.txt,36.63
6,391,19201114.txt,47.4
5,391,19200713.txt,49.33
50,amicarts,19271221.txt,45.92
62,anti,19310602.txt,47.41
218,esportcat,192708132.txt,46.66
226,helix,19291006.txt,46.59
221,helix,19290201.txt,47.78


In [53]:
# Export magazine, publication and total_existing only, without the row index.
results.to_csv(
    "bloc1_quality.csv",
    columns=["revista", "publicacio", "total_existing"],
    index=False,
)

In [31]:
# Number of publications whose total_existing exceeds 50.
over_50 = (results["total_existing"] > 50).sum()
over_50

391

In [24]:
len(results)

415

In [27]:
# `over_60` is not assigned by any earlier cell, so this cell only ran thanks to
# leftover kernel state. Compute it here, the same way `over_50` is computed.
over_60 = (results["total_existing"] > 60).sum()
over_60 / len(results)

0.8240963855421687

In [32]:
over_50/len(results)

0.9421686746987952

In [51]:
# Histogram of total_existing over all publications, coloured by magazine.
hist_labels = {"total_existing": "% paraules existents"}
px.histogram(
    results,
    x="total_existing",
    color="revista",
    nbins=100,
    range_x=(0, 100),
    labels=hist_labels,
    title="Histograma paraules existents",
    height=600,
)

In [125]:
# Keep the publications whose `idiomes` entry includes "fr".
french_rows = results["idiomes"].map(lambda langs: "fr" in langs).tolist()
px.histogram(results[french_rows], x="fr_existing", color="revista", nbins=101, range_x=(0, 100))

In [126]:
# Keep the publications whose `idiomes` entry includes "ca".
catalan_rows = results["idiomes"].map(lambda langs: "ca" in langs).tolist()
px.histogram(results[catalan_rows], x="ca_existing", color="revista", nbins=101, range_x=(0, 100))

In [129]:
# Keep the publications whose `idiomes` entry includes "es".
spanish_rows = results["idiomes"].map(lambda langs: "es" in langs).tolist()
px.histogram(results[spanish_rows], x="es_existing", color="revista", nbins=101, range_x=(0, 100))

In [130]:
# Keep the publications whose `idiomes` entry includes "it".
italian_rows = results["idiomes"].map(lambda langs: "it" in langs).tolist()
px.histogram(results[italian_rows], x="it_existing", color="revista", nbins=101, range_x=(0, 100))

In [131]:
# Keep the publications whose `idiomes` entry includes "en".
english_rows = results["idiomes"].map(lambda langs: "en" in langs).tolist()
px.histogram(results[english_rows], x="en_existing", color="revista", nbins=101, range_x=(0, 100))

In [132]:
# Only publications where more than 5 % of the words had no identified language.
high_non_identified = results["non_identified_perc"] > 5
px.histogram(
    results[high_non_identified],
    x="non_identified_existing",
    color="revista",
    nbins=101,
    range_x=(0, 100),
)

In [43]:
results_2 = pd.read_csv("resultats/results_Dades_revistes_2.csv")

In [44]:
results_2.sort_values("total_existing")

Unnamed: 0,revista,publicacio,idiomes,es_perc,ca_perc,fr_perc,it_perc,en_perc,non_identified_perc,es_existing,ca_existing,fr_existing,it_existing,en_existing,non_identified_existing,n_words,total_existing
0,01 La Publicitat,3944.txt,['ca'],2.63,85.95,1.67,3.29,1.02,5.44,30.12,25.26,9.46,4.08,9.09,11.55,33563,23.52
1,02 La Veu de Catalunya,19130306-25232-11.txt,['ca'],2.22,93.67,0.95,1.05,0.26,1.86,33.93,34.11,7.92,10.94,12.31,20.68,25261,33.3
5,06 Esquella torratxa,19080515-33442.txt,"['es', 'ca']",12.43,82.26,1.2,1.18,0.28,2.64,36.44,34.33,14.29,5.45,0.0,17.07,4657,33.46
18,19 Art revista,19330101-74.txt,"['es', 'ca']",10.59,84.87,0.76,0.61,1.34,1.83,19.6,40.29,12.0,15.0,18.18,13.33,3278,36.94
6,07 Cu cut,19060510-4181.txt,"['es', 'ca']",6.6,83.28,1.3,3.25,0.36,5.2,27.95,40.87,8.0,3.2,28.57,17.0,3846,37.08
7,08 Papitu,19150310-4462.txt,"['es', 'ca']",6.23,85.72,1.65,1.74,0.74,3.91,46.63,39.56,17.65,9.26,17.39,23.97,3096,38.34
15,16 Joventut,19030409-3861.txt,"['es', 'ca']",6.84,89.6,1.18,0.91,0.18,1.28,37.37,39.87,9.18,6.58,0.0,19.63,8330,38.7
2,03 Mirador,16.txt,['ca'],1.66,91.27,2.45,0.79,0.19,3.64,35.71,44.03,13.53,5.97,37.5,12.7,8441,41.69
4,05 Daci dalla,19180501-5640.txt,['ca'],2.6,92.38,1.15,1.33,0.33,2.21,42.93,46.78,10.5,5.74,15.38,18.1,15757,44.98
9,10 Vell i nou,447.txt,['ca'],1.98,87.76,4.0,1.58,1.55,3.13,49.28,53.55,14.39,10.91,22.22,22.02,3479,49.76


In [139]:
# Hard-coded per-magazine counts, summed in the next cell.
# NOTE(review): presumably the number of issues/files per magazine — confirm.
(borinot, hispania, nova_revista, revista, revista_nova) = (181, 94, 32, 204, 46)


In [140]:
# Grand total over the five magazine counts.
sum((borinot, hispania, nova_revista, revista, revista_nova))

557

In [46]:
# Export magazine and total_existing only, without the row index.
results_2.to_csv(
    "bloc2_quality.csv",
    columns=["revista", "total_existing"],
    index=False,
)

In [None]:
def test_read_and_process_file(revista, file):
    """Run ``read_and_process_file`` on a single file and dump the words it collected.

    Args:
        revista: Magazine identifier; forwarded to ``read_and_process_file`` and
            used in the output file name.
        file: Name of the file to process; also used in the output file name.

    Returns:
        The results DataFrame that was handed to ``read_and_process_file``.

    Side effects:
        Writes ``all_words_test_{revista}_{file}.json`` to the working directory.
    """
    # initialize results dataframe
    df = pd.DataFrame(columns=columns)
    revista_words = {}
    read_and_process_file(df, revista, revista_words, file)
    # Dump the words gathered by THIS run. The previous version serialized the
    # notebook-level `all_words`, which is unrelated to this call and only
    # exists if an earlier cell happened to load it.
    # NOTE(review): assumes read_and_process_file fills `revista_words` in place — confirm.
    with open(f"all_words_test_{revista}_{file}.json", "w") as out_file:
        json.dump(revista_words, out_file)
    return df

In [None]:
test_read_and_process_file(revista="AC", file="193312.txt")