Token counts and some other stats per genre (broad)
================

We load the database and build dataframes using Pandas and Python:

In [1]:
from collections import defaultdict, Counter
import sqlite3 
import pandas as pd
from polyglot.text import Text
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")
###########################################
def token_counts(lines):
    """Count the word tokens in a list of text lines using polyglot.

    Parameters
    ----------
    lines : iterable of str
        Text lines; each element is tokenized on its own.

    Returns
    -------
    int
        Total number of tokens over all lines. A token made of a single
        non-alphanumeric character (e.g. a lone punctuation mark) is not
        counted.

    Notes
    -----
    Only a running total is kept, never the tokens themselves, because
    storing every word of the database in a list could be very RAM expensive.
    A line that cannot be wrapped in a polyglot ``Text`` object is skipped;
    a line whose polyglot tokenization fails is split on whitespace instead.
    """
    total = 0
    for line in lines:
        try:
            text = Text(line)  # polyglot object
        except Exception:
            # Deliberately `Exception`, not a bare `except:`: a bare clause
            # also swallows KeyboardInterrupt/SystemExit, which would make
            # this long-running loop impossible to interrupt.
            continue

        try:
            words = text.words  # polyglot tokenizer (orthographic word borders)
        except Exception:
            # Fallback when polyglot breaks on a line (e.g. on non-UTF-8
            # input), so that one bad line does not stop the whole pipeline.
            words = line.split()

        for word in words:
            # Skip one-character tokens that are not alphanumeric.
            if len(word) == 1 and not word.isalnum():
                continue
            total += 1
    return total

In [2]:
# NOTE: slow cell -- it may take several seconds and several GB of RAM.

# Step 1: open the SQLite database.
con = sqlite3.connect('../../Database/test.sqlite3')
cursor = con.cursor()

# Step 2: read the language, corpus, file and line tables and merge them into
# one big dataframe. Each table's "id" column is renamed before merging so it
# matches the key column used to join the next table in the chain.
teddi = (
    pd.read_sql('SELECT id, iso639_3 FROM language;', con)
    .rename(columns={"id": "language_id"})
    .merge(
        pd.read_sql('SELECT id, language_id FROM corpus;', con)
        .rename(columns={"id": "corpus_id"}),
        on="language_id",
    )
    .merge(
        pd.read_sql('SELECT id, corpus_id, genre_broad, writing_system FROM file;', con)
        .rename(columns={"id": "file_id"}),
        on="corpus_id",
    )
    .merge(
        pd.read_sql('SELECT id, file_id, text FROM line;', con)
        .rename(columns={"id": "line_id"}),
        on="file_id",
    )
)

# Optional memory optimisation (disabled): store the two low-cardinality
# string columns as categoricals, then check the frame size in megabytes.
#teddi[['genre_broad','writing_system']] = teddi[['genre_broad','writing_system']].astype('category')
#teddi.memory_usage().sum() / (1024**2)

# Overview

In [3]:
# Show the merged frame: one row per text line, together with the language,
# corpus and file columns it was joined with.
teddi

Unnamed: 0,language_id,iso639_3,corpus_id,file_id,genre_broad,writing_system,line_id,text
0,1,abk,1,1,professional,Cyrl,1,Ауаҩытәыҩса изинқәа Зегьеицырзеиҧшу Адекларациа
1,1,abk,1,1,professional,Cyrl,2,Алагалажәа
2,1,abk,1,1,professional,Cyrl,3,"Дызусҭзаалак, ауаатәыҩсатә ҭаацәара иалахәу иҳ..."
3,1,abk,1,1,professional,Cyrl,4,иара убас ауаҩы изинқәа ратәамбареи хырҩаа рым...
4,1,abk,1,1,professional,Cyrl,5,"иара убас, ауаҩы аџьамыҕәеи ахәуреи иаартны ды..."
...,...,...,...,...,...,...,...,...
19884221,100,zul,140,23326,professional,Latn,19884222,Wonke umuntu uneqhaza emphakathini okuwukuphel...
19884222,100,zul,140,23326,professional,Latn,19884223,"Ekusebenziseni amalungelo nenkululeko yakhe, u..."
19884223,100,zul,140,23326,professional,Latn,19884224,Lamalungelo nenkululeko akunakusetshenziswa ng...
19884224,100,zul,140,23326,professional,Latn,19884225,Isigaba 30


In [4]:
# Frequency tables: number of text lines per language, per writing system
# and per broad genre.
iso_types = teddi["iso639_3"].value_counts()
writingsystem_types = teddi["writing_system"].value_counts()
genrebroad_types = teddi["genre_broad"].value_counts()

# The length of each table is the number of distinct values in that column.
print("Number of languages: {}".format(len(iso_types)))
print("Number of writing systems: {}".format(len(writingsystem_types)))
print("Number of genres (broad): {}".format(len(genrebroad_types)))

Number of languages: 89
Number of writing systems: 16
Number of genres (broad): 5



Top and Bottom 5 languages by number of text lines:

In [5]:
# Text-line counts per ISO 639-3 language code (the value_counts() result
# computed above), shown from most to least frequent.
iso_types

fin    1910301
deu    1517159
ell    1473292
spa    1455101
eng    1375318
        ...   
dni         23
kio         14
kut         11
bmi         10
crk         10
Name: iso639_3, Length: 89, dtype: int64

Writing systems by number of text lines:

In [6]:
# Text-line counts per writing system (the value_counts() result computed
# above), shown from most to least frequent.
writingsystem_types

Latn    11627705
Grek     1473292
Arab     1029092
Cyrl     1013397
Hebr      997606
Hans      973814
Jpan      856773
Thai      818355
Kore      768601
Geor      156798
Deva      106536
Mymr       31088
Hang       30895
Hani          93
Hant          92
Knda          89
Name: writing_system, dtype: int64

Genres (broad) by number of text lines:

In [7]:
# Text-line counts per broad genre (the value_counts() result computed
# above), shown from most to least frequent.
genrebroad_types

non-fiction     16113887
fiction          3763509
professional        4078
conversation        2483
grammar              269
Name: genre_broad, dtype: int64

# Statistics per genre

In [8]:
# One sub-frame per broad genre, each selected from `teddi` with a boolean
# mask on the `genre_broad` column.
professional = teddi[teddi["genre_broad"].eq("professional")]
nonfiction = teddi[teddi["genre_broad"].eq("non-fiction")]
grammar = teddi[teddi["genre_broad"].eq("grammar")]
conversation = teddi[teddi["genre_broad"].eq("conversation")]
fiction = teddi[teddi["genre_broad"].eq("fiction")]

Example of dataframe filtered by genre:

In [9]:
# Show the rows of `teddi` whose genre_broad is "fiction".
fiction

Unnamed: 0,language_id,iso639_3,corpus_id,file_id,genre_broad,writing_system,line_id,text
693191,20,fin,29,2079,fiction,Latn,2091371,Kirj.
693192,20,fin,29,2079,fiction,Latn,2091372,Ari Aalto
693193,20,fin,29,2079,fiction,Latn,2091373,"Kuopiossa,"
693194,20,fin,29,2079,fiction,Latn,2091374,"O. W. Backman'in kirjapaino,"
693195,20,fin,29,2079,fiction,Latn,2091375,1900.
...,...,...,...,...,...,...,...,...
18799297,92,pes,100,15141,fiction,Arab,13447131,ــ همیشه…هر وقت آدم بخواد !
18799298,92,pes,100,15141,fiction,Arab,13447132,اراسموس به او زل زد. او رام شده بود ولی اهلیِ ...
18799299,92,pes,100,15141,fiction,Arab,13447133,صورت اراسموس اما همچنان عجیب، پُر شرارت و شکاک...
18799300,92,pes,100,15141,fiction,Arab,13447134,ــــــــــــــــــــــــــــــــــــــــــــــ...


We print a table with number of languages, scripts and token count per genre (broad):  
*(Using polyglot tokenizer)

Warning: This takes a long time, since it tokenizes all the texts in the database.

In [10]:
# Summary table: one row per broad genre with its number of distinct
# languages, its token count (via token_counts) and its number of distinct
# writing systems.
# The header and every row share a single format string so the columns line
# up; the genre column is 13 wide so that the longest labels
# ("conversation"/"professional", 12 characters) still fit.
# NOTE: token_counts() tokenizes every text line of each genre, so this cell
# is slow.
ROW_FORMAT = "{:<13} {:<10} {:<10} {:<10}"

print(ROW_FORMAT.format("Genre", "Langs", "Tokens", "Scripts"))
for genre_label, genre_frame in (
    ("conversation", conversation),
    ("fiction", fiction),
    ("grammar", grammar),
    ("nonfiction", nonfiction),
    ("professional", professional),
):
    print(ROW_FORMAT.format(
        genre_label,
        str(len(genre_frame["iso639_3"].value_counts())),
        str(token_counts(genre_frame["text"].tolist())),
        str(len(genre_frame["writing_system"].value_counts())),
    ))


Genre      Langs      Tokens     Scripts   
conversation 10         15835     1         
fiction    12         36811339  7         
grammar    5          1271      1         
nonfiction 73         101588748 13        
professional 40         80092     15        
