In [None]:
# codigo para leer un archivo del drive
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Autenticando y creando un cliente pydrive
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# 2. Obtener archivo
corpus = drive.CreateFile({'id': '1H-qnVgYLs2L8hSreQeUaRHXTZryHT0rb'})
corpus.GetContentFile('quechua.sdx')

tagger = drive.CreateFile({'id': '1L9SCRh6Kc5sGYmV3t30-jvMJtxtEOL-2'})
tagger.GetContentFile('tagger_quechua.yaml')

In [None]:
import random
import yaml

In [None]:
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import BigramTagger
from nltk.tag import RegexpTagger
from nltk.tag import TrigramTagger
from nltk.tag import UnigramTagger
from nltk.tag import brill
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Training


In [None]:
# Brill tagger parameters
max_rules = 300
min_score = 3

In [None]:
# Training parameters
development_size = 0
train = .85

In [None]:
# View data from quechua.sdx
with open('quechua.sdx', encoding='UTF-8') as f:
  print(f.read())


állqu|NN
.|PUNC

allqukúna|NN
.|PUNC

allqukunápaq|NN 
.|PUNC

allqukunapáqmi|NN 
.|PUNC

wási|NN
.|PUNC

wasícha|NN
.|PUNC

wasichakúna|NN
.|PUNC

wasichakunamánta|NN
.|PUNC

aqu|NN
.|PUNC

allqu|NN
.|PUNC

runa|NN
.|PUNC

warmi|NN
.|PUNC

llaqta|NN
.|PUNC

llaqtacha|NN
.|PUNC

llaqtachayki NN 
.|PUNC

llaqtachaykichik|NN
.|PUNC

llaqtachaykichikkuna|NN 
.|PUNC

llaqtachaykichikkunamanta|NN
.|PUNC

llaqtachaykichikkunamantachá|NN
.|PUNC

yacha|VB
.|PUNC

yachachi|VB 
.|PUNC

yachachinaya|VB
.|PUNC

yachachinayachka|VB
.|PUNC

yachachikuchkan|VB
.|PUNC

yachachikuchkanku|VB
.|PUNC

ñuqa|PRP
aychatam|NN
mikuni|VB
.|PUNC

tanta|NN 
.|PUNC

t’anta|NN
.|PUNC

thanta|ADJ 
.|PUNC

kanka|ADJ
.|PUNC

k’anka|NN
.|PUNC

khanka|ADJ
.|PUNC

Ruranapaq|NN
.|PUNC

kay|VB
.|PUNC

kapuy|VB
.|PUNC

apay|VB
.|PUNC

apapuy|VB
.|PUNC

churay|VB
.|PUNC

churapuy|VB 
.|PUNC

kichay|VB
.|PUNC

kichapuy|VB 
.|PUNC

kutiy|VB
.|PUNC

kutipuy|VB 
.|PUNC

ruray|VB
.|PUNC

 Ñuqa|PRP 
 Carlos|NN
 Choquehuancam|NN 


In [None]:
# Read data from quechua.sdx
data = TaggedCorpusReader('.', r'quechua.sdx', sep='|', 
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='UTF-8')
data

<TaggedCorpusReader in '/content'>

In [None]:
# Get the list of tagged sentences
tagged_data = data.tagged_sents()
tagged_data[60:65]

[[('Taytaypa', 'PRP'), ('sutin', 'NN'), ('Romulom', 'NN'), ('.', 'PUNC')],
 [('Mamaypa', 'PRP'), ('sutin', 'NN'), ('Beatrizmi', 'NN'), ('.', 'PUNC')],
 [('Ñuqapa', 'PRP'),
  ('huk', 'PRP'),
  ('ñañay', 'NN'),
  ('kanmi', 'PRP'),
  ('.', 'PUNC')],
 [('Ñañaypa', 'PRP'), ('sutin', 'NN'), ('Alejandram', 'NN'), ('.', 'PUNC')],
 [('Manam', 'ADV'), ('turay', 'NN'), ('kanchu', 'VB'), ('.', 'PUNC')]]

In [None]:
development_size = len(tagged_data)

In [None]:
# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
tagged_data_list[60:65]

[[('taytaypa', 'PRP'), ('sutin', 'NN'), ('romulom', 'NN'), ('.', 'PUNC')],
 [('mamaypa', 'PRP'), ('sutin', 'NN'), ('beatrizmi', 'NN'), ('.', 'PUNC')],
 [('ñuqapa', 'PRP'),
  ('huk', 'PRP'),
  ('ñañay', 'NN'),
  ('kanmi', 'PRP'),
  ('.', 'PUNC')],
 [('ñañaypa', 'PRP'), ('sutin', 'NN'), ('alejandram', 'NN'), ('.', 'PUNC')],
 [('manam', 'ADV'), ('turay', 'NN'), ('kanchu', 'VB'), ('.', 'PUNC')]]

In [None]:
# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

In [None]:
# Training set
training_data = tagged_data_list[:cutoff]
training_data[:2]

[[('qamkuna', 'PRP'), ('kankichik', 'VB'), ('.', 'PUNC')],
 [('mayqintaq', 'PRP'), ('aswan', 'ADJ'), ('sumaq', 'VB'), ('?', 'PUNC')]]

In [None]:
# Evaluation set
evaluation_data = tagged_data_list[cutoff: development_size]
evaluation_data[2:5]

[[('ñuqanchik', 'PRP'), ('tiyanchik', 'VB'), ('.', 'PUNC')],
 [('apapuy', 'VB'), ('.', 'PUNC')],
 [('llaqta', 'NN'), ('.', 'PUNC')]]

In [None]:
# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'),
                             (r'.*', 'NN')])

In [None]:
# Unigram tagger
unigram_tagger = UnigramTagger(training_data, 
                               backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaluation_data))

Unigram accuracy: 
0.8356164383561644


In [None]:
# Bigram tagger 
bigram_tagger = BigramTagger(training_data, 
                             backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaluation_data))

Bigram accuracy: 
0.8356164383561644


In [None]:
# Trigram tagger 
trigram_tagger = TrigramTagger(training_data, 
                               backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaluation_data))

Trigram accuracy: 
0.8356164383561644


In [None]:
# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    Template(brill.Pos([1, 2])),
    Template(brill.Pos([1, 3])),
    Template(brill.Word([1, 1])),
    Template(brill.Word([2, 2])),
    Template(brill.Word([1, 2])),
    Template(brill.Word([1, 3])),
    Template(brill.Pos([-1, -1]), brill.Pos([1, 1])),
    Template(brill.Word([-1, -1]), brill.Word([1, 1])),
]

In [None]:
# First iteration
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaluation_data))

Initial Brill accuracy:
0.8356164383561644


In [None]:
# 10 Folding
for i in range(1, 5):
    # Random splitting
    random.seed(len(tagged_data_list))
    random.shuffle(tagged_data_list, random.random)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
    evaluation_data = tagged_data_list[cutoff:development_size]

    print("Fold: ")
    print(i)

    # Training
    brill_tagger = trainer.train(training_data, 
                                 max_rules, 
                                 min_score)

    # Evaluation 
    print("Accuracy: ")
    print(brill_tagger.evaluate(evaluation_data))

    i = i + 1

Fold: 
1
Accuracy: 
0.9873417721518988
Fold: 
2
Accuracy: 
0.9753086419753086
Fold: 
3
Accuracy: 
0.96
Fold: 
4
Accuracy: 
0.9367088607594937


In [None]:
# Saving my tagger
with open('tagger_quechua.yaml', 'w') as file_writing:
    yaml.dump(brill_tagger, file_writing)

In [None]:
# Saving in drive
with open('tagger_quechua.yaml', 'r') as f:
   tagger.SetContentString(f.read())
   tagger.Upload()
print('Done!')

Done!


# Tagger

In [None]:
import yaml
import os
from nltk.tag.brill import BrillTagger
from yaml.loader import Loader
from yaml.parser import ParserError

In [None]:
class Tagger:
    def __init__(self, tagger):
        self.myTagger = tagger

    @classmethod
    def load(cls, modelFile):
        if not os.path.exists(modelFile):
            raise FileNotFoundError("The model file: {} not found.".format(modelFile))
        try:
            with open(modelFile) as file:
                myTagger = yaml.load(file, Loader=yaml.Loader)
            if not isinstance(myTagger, BrillTagger):
                raise TypeError("The model file: {} could not be loaded as a nltk.tag.brill.BrillTagger object".format(
                    modelFile
                ))
            return cls(myTagger)
        except ParserError as error:
            print(error)
            raise TypeError("Could not load file {} as yaml file.".format(modelFile))

    # Tagger function
    def tag(self, sentence):
        if not isinstance(sentence, str):
            raise TypeError("Input sentence has to be of type str. Is of type {}"
            .format(type(sentence)))
        # Lower input
        temp = [[t.lower()] for t in sentence.split()]
        return_list = []
        # Find tags
        for token in temp:
            return_list.append(self.myTagger.tag(token))

        return_list = [t for [t] in return_list]
        # Correct tags for printing
        tag_list = [y for (x, y) in return_list]
        tag_list = [(y.lower()).title() for y in tag_list]

        # Zip input and tags
        temp_list = zip([t for t in sentence.split()], tag_list)

        return temp_list

    # Make the initiated class callable in the same way as a function
    def __call__(self, sentence):
        return list(self.tag(sentence))

# Pos Tagger

In [None]:
import yaml
from yaml.loader import Loader

In [None]:
# Open the file where tagger is saved
taggerFileName = 'tagger_quechua.yaml'
myTagger = Tagger.load(taggerFileName)

In [None]:
def tag(sentence):
    return myTagger.tag(sentence)

# Test

In [None]:
list(tag("Ñuqa Guido VanRossum kani ."))

[('Ñuqa', 'Prp'),
 ('Guido', 'Nn'),
 ('VanRossum', 'Nn'),
 ('kani', 'Prp'),
 ('.', 'Punc')]

# Aplicativo


In [7]:
!pip install flask-ngrok
!pip install flask==0.12.2  

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting flask==0.12.2
  Downloading Flask-0.12.2-py2.py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 733 kB/s 
Installing collected packages: flask
  Attempting uninstall: flask
    Found existing installation: Flask 1.1.4
    Uninstalling Flask-1.1.4:
      Successfully uninstalled Flask-1.1.4
Successfully installed flask-0.12.2


In [None]:
# flask_ngrok_example.py
from flask import Flask
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

@app.route("/")
def hello():
    return "Hello World!"

if __name__ == '__main__':
    app.run()  # If address is in use, may need to terminate other sessions:
               # Runtime > Manage Sessions > Terminate Other Sessions

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://f83c0581e6bc.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
