# AllenNLP for Multilingual Text Classification

In [1]:
import pandas as pd
import random
random.seed(30)

# Load the tab-separated corpus (no header row) and coerce every cell to str.
dataset = pd.read_csv('../data/dataset.csv', sep='\t', header=None).applymap(str)
dataset.columns = ["language", "label", "text"]
languagesData = []
languages = dataset["language"].unique()

# Publish one DataFrame per language as a module-level global named
# "<language>Data"; `loc` counts how many languages have been processed.
loc = 0
for i in languages:
    name = i + "Data"
    globals()[name] = pd.DataFrame(dataset[dataset["language"] == i])
    loc += 1

# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
def preprocessing(text: pd.Series) -> pd.Series:
    """Normalize a Series of raw texts for classification.

    Steps, applied in order to every element:
      1. lower-case;
      2. cut the text at the first ``http://`` or ``https://`` URL
         (the URL and everything after it are dropped);
      3. remove digit runs;
      4. remove every character that is neither a word character nor whitespace;
      5. strip leading/trailing whitespace;
      6. collapse internal whitespace runs to a single space.

    Args:
        text: Series of strings to clean.

    Returns:
        A new Series with the cleaned strings.
    """
    # One pattern covers both schemes; keeping only the part before the first
    # match is equivalent to cutting at "https://" and then at "http://".
    url_start = re.compile(r'https?://.*')

    text = text.str.lower()
    text = text.apply(lambda x: url_start.split(str(x))[0])
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently skip these removals.
    text = text.str.replace(r'\d+', '', regex=True)        # remove numbers
    text = text.str.replace(r'[^\w\s]', '', regex=True)    # remove punctuation
    text = text.str.strip()
    text = text.replace(r'\s+', ' ', regex=True)           # collapse whitespace
    return text

In [10]:
#!pip install allennlp

[31mCannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m


In [7]:
from typing import Dict
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer
from typing import Dict
import logging
import csv
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

logger = logging.getLogger(__name__)  
# @DatasetReader.register("data-reader")
class MultilingualDatasetReader(DatasetReader):
    """Dataset reader for comma-delimited files with two columns: label, text.

    Every two-column row becomes an instance with a ``tokens`` TextField and,
    when a label is available, a ``label`` LabelField.

    Args:
        lazy: Forwarded to ``DatasetReader.__init__``.
        tokenizer: Tokenizer used on the text; defaults to a ``WordTokenizer``
            built on ``JustSpacesWordSplitter``.
        token_indexers: Indexers for the ``tokens`` field; defaults to a single
            ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """Yield one instance per two-column row of the file at ``file_path``."""
        logger.info("Reading instances from lines in file at: %s", file_path)
        # Explicit UTF-8 so multilingual text does not depend on the platform's
        # default encoding; newline="" lets the csv module handle line endings
        # inside quoted fields itself, as its documentation requires.
        with open(file_path, "r", encoding="utf-8", newline="") as data_file:
            rows = csv.reader(data_file, delimiter=',')
            for row in rows:
                # Rows that do not have exactly (label, text) are skipped.
                if len(row) == 2:
                    label, article = row
                    yield self.text_to_instance(article=article, label=label)

    @overrides
    def text_to_instance(self,
                         article: str,
                         label: str = None) -> Instance:
        """Build an instance from raw text and an optional label.

        Args:
            article: Text to tokenize into the ``tokens`` field.
            label: Class label; when ``None`` no ``label`` field is added.

        Returns:
            An ``Instance`` holding the constructed fields.
        """
        fields: Dict[str, Field] = {}
        tokenized_article = self._tokenizer.tokenize(article)
        fields["tokens"] = TextField(tokenized_article, self._token_indexers)
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)
    
    
class LstmClassifier(Model):
    """Text classifier: embed tokens, encode them to one vector, project to labels.

    Args:
        word_embeddings: Embedder applied to the ``tokens`` input.
        encoder: Seq2Vec encoder turning the embedded sequence into one vector.
        vocab: Vocabulary; the size of its ``'labels'`` namespace sets the
            number of output classes.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()

    # NOTE: forward must be a method of the class. It was previously nested
    # inside __init__, so its body ran at construction time and failed.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Args:
            tokens: Indexed text field tensors for the batch.
            label: Gold labels; when given, the loss is computed and the
                accuracy metric is updated.

        Returns:
            A dict with ``"logits"`` and, when ``label`` is given, ``"loss"``.
        """
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        output = {"logits": logits}
        if label is not None:
            # Update the metric exactly once per batch; accuracy is reported
            # through get_metrics() rather than stored in the output dict.
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy, passing ``reset`` on to the metric."""
        return {'accuracy': self.accuracy.get_metric(reset)}
    
    
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


# @Predictor.register('text_classifier')
class TextClassifierPredictor(Predictor):
    """
    Predictor that accepts one raw sentence and runs it through the model.

    The sentence travels as JSON under the ``"sentence"`` key and is turned
    into an instance by the dataset reader's ``text_to_instance``.
    """

    def predict(self, sentence: str) -> JsonDict:
        """Wrap ``sentence`` in the expected JSON payload and predict on it."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an unlabeled instance from the ``"sentence"`` entry of ``json_dict``."""
        return self._dataset_reader.text_to_instance(json_dict["sentence"])
    

ModuleNotFoundError: No module named 'allennlp'