**Necessary libraries and paths**

In [0]:
from fastai.text import * 
from fastai.callbacks import *
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle as pkl

In [0]:
# SAMPLE is always left as True in this notebook; it selects './sample' as the
# working directory (the current directory would be used otherwise).
SAMPLE = True
PATH = Path('./sample') if SAMPLE else Path('.')
# Create the working directory if it does not exist yet.
PATH.mkdir(exist_ok=True)

In [0]:
# Sub-directory of PATH named 'models', created if it does not exist yet.
MODELS_PATH = PATH / 'models'
MODELS_PATH.mkdir(exist_ok=True)

**DOWNLOAD THE DATA**
> Now we are going to connect to Google Drive to download the project files.





In [0]:
# Only authenticate against Google Drive when at least one of the two data
# files is missing from PATH; `drive` is therefore only defined in that case.
if not (PATH / 'train.csv').exists() or not (PATH / 'test.csv').exists():
  !pip install PyDrive
  import os
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials
  
  # Authenticate the Colab user and reuse the default application
  # credentials for the PyDrive client.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

Now we are going to download the train and test data.

In [0]:
# Download the compressed training data only when train.csv is not present.
# Relies on `drive` created by the authentication cell.
if not (PATH / 'train.csv').exists():
  # The 'id' of the Drive file, taken from its sharing link.
  TrainData = drive.CreateFile({'id': '1k4N_Ia7Z_5QfZWg16AOmBMk89_Szhskl'})
  # Local destination of the downloaded content inside PATH.
  TrainData.GetContentFile(PATH / 'train.csv.gz')

In [0]:
# Download the test data only when test.csv is not present.
# Relies on `drive` created by the authentication cell.
if not (PATH / 'test.csv').exists():
  TestData = drive.CreateFile({'id': '105vlQHX9-mveztq6wHR9Ekfm7Yfq7T4F'})
  TestData.GetContentFile(PATH / 'test.csv')

The train.csv.gz file is now going to be extracted.

In [0]:
# Decompress train.csv.gz in place (gzip -d replaces the .gz file with
# train.csv) when train.csv is not present yet.
if not (PATH / 'train.csv').exists():
  PATH_TRAIN_COMP = PATH / 'train.csv.gz'
  !gzip -d $PATH_TRAIN_COMP

**Data Visualization and Pre-processing**


> Now we are going to remove accented characters, convert uppercase words to lowercase, etc.



In [0]:
Asuffix = 'A-ret_cat_'
def return_categories(PATH = PATH, filename = 'train.csv', save = True, old = True, suffix = Asuffix):
  """Return the sorted list of distinct values of the 'category' column.

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV and the cache file.
    filename: CSV file name inside PATH; it must have a 'category' column.
    save: when True, write the computed list to the cache file
      PATH / (suffix + filename) as a single space-separated line.
    old: when True and the cache file exists, read the list from the cache
      instead of re-reading the CSV.
    suffix: prefix of the cache file name.

  Returns:
    A list with the category names in ascending order. An input without rows
    yields an empty list.
  """
  cache_path = PATH / (suffix + filename)

  # Reuse the cached list when allowed; `with` guarantees the handle is closed.
  if old and cache_path.exists():
    with open(cache_path, 'r') as data_file:
      return data_file.read().split(' ')

  df = pd.read_csv(PATH / filename)
  list_categories = sorted(df['category'].unique())
  if save:
    with open(cache_path, 'w') as data_file:
      data_file.write(' '.join(list_categories))
  return list_categories



Now we obtain the total number of categories in the whole dataset.

In [0]:
AAsuffix = 'AA-ret_num_cat_' 
def return_number_categories(PATH = PATH, filename = 'train.csv', save = True, old = True, suffix = AAsuffix):
  """Return how many distinct categories `filename` contains.

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV and the cache file.
    filename: CSV file name inside PATH.
    save: when True, write the count to PATH / (suffix + filename); the flag
      is also forwarded to return_categories().
    old: when True and the cache file exists, read the count from it; the
      flag is also forwarded to return_categories().
    suffix: prefix of the cache file name.

  Returns:
    The number of categories as an int.
  """
  cache_path = PATH / (suffix + filename)

  # Reuse the cached count when allowed; `with` guarantees the handle is closed.
  if old and cache_path.exists():
    with open(cache_path, 'r') as file:
      return int(file.read())

  number_categories = len(return_categories(PATH, filename, save, old))
  if save:
    with open(cache_path, 'w') as file:
      file.write(str(number_categories))
  return number_categories

***Parameters***

> *Setting the parameters*



Now we split the data by language (Spanish and Portuguese).

In [None]:
Bsuffix = 'B-spl_lan_'
def split_language(PATH, filename, language, save = True, old = True, suffix = Bsuffix): # language = 'spanish' or 'portuguese'
  """Return the rows of `filename` whose 'language' column equals `language`.

  The result is cached as PATH / (suffix + language[:3] + '_' + filename).
  With old=True an existing cache file is read back instead of filtering
  again; with save=True a freshly filtered frame is written to the cache.
  """
  cached_csv = PATH / (suffix + language[:3] + '_' + filename)

  # Fast path: a previous run already produced the per-language file.
  if old and cached_csv.exists():
    return pd.read_csv(cached_csv)

  all_rows = pd.read_csv(PATH / filename)
  selected = all_rows.loc[all_rows['language'] == language]
  if save:
    selected.to_csv(cached_csv, index=False)
  return selected


In [None]:
def return_quantities(PATH, filename, number_categories):
  """Count the rows of `filename` per category.

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV files.
    filename: CSV file name inside PATH; it must have 'category' and 'title'
      columns.
    number_categories: length of the returned array.

  Returns:
    A float array of length `number_categories`. The slot of each category is
    its position in the reference category list; it holds the number of
    non-null titles of that category in `filename`. Categories absent from
    `filename` keep 0.
  """
  # NOTE(review): the original body indexed into a `list_categories` name that
  # is not defined in this function. It is resolved here from the default
  # training file through return_categories(); confirm this is the intended
  # reference list.
  list_categories = return_categories(PATH, save = False)
  quantity_categories = np.zeros(number_categories)
  df = pd.read_csv(PATH / filename)
  counts = df.groupby(['category']).count().title
  # Iterate label/value pairs so the lookup does not depend on positional
  # indexing of a label-indexed Series.
  for category, count in counts.items():
    quantity_categories[list_categories.index(category)] = count
  return quantity_categories


In [0]:
Csuffix = 'C-spl_set_'
def split_set(PATH, filename, pi, seed, return_train = True, old = True, suffix = Csuffix):
  """Split `filename` into two stratified parts and return one of them.

  The part returned for return_train=True has int(pi * len(df)) rows; the
  complementary part is returned for return_train=False. Both parts are
  stored as PATH / (suffix + 'train_' + filename) and
  PATH / (suffix + 'test_' + filename); with old=True and both files present
  they are read back instead of splitting again.
  """
  train_csv = PATH / (suffix + 'train_' + filename)
  test_csv = PATH / (suffix + 'test_' + filename)

  if old and train_csv.exists() and test_csv.exists():
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)
  else:
    full_frame = pd.read_csv(PATH / filename)
    # train_test_split returns (complement, test_size part): the part sized
    # by `pi` is the one this notebook calls "train".
    df_test, df_train = train_test_split(
        full_frame,
        test_size=int(pi*len(full_frame)),
        random_state=seed,
        stratify=full_frame.category)
    df_train.to_csv(train_csv, index=False)
    df_test.to_csv(test_csv, index=False)

  if return_train == True:
    return df_train
  if return_train == False:
    return df_test


This function returns the total number of elements with the quality 'reliable' for each category.

In [0]:
def return_qualities(PATH, filename, number_categories, quantity_categories):
  """Return per-category totals split into two complementary counts.

  The per-category counts of `filename` come from return_quantities(); they
  are returned as the second array, and `quantity_categories` minus those
  counts as the third. The first array is a float copy of the first
  `number_categories` entries of `quantity_categories`.
  """
  counts_in_file = return_quantities(PATH, filename, number_categories)
  totals = np.zeros(number_categories)
  reliable = np.zeros(number_categories)
  unreliable = np.zeros(number_categories)
  for idx in range(number_categories):
    total_here = quantity_categories[idx]
    reliable_here = counts_in_file[idx]
    totals[idx] = total_here
    reliable[idx] = reliable_here
    unreliable[idx] = total_here - reliable_here
  return totals, reliable, unreliable


This function saves a new dataset with the language chosen by the user.

In [None]:


Dsuffix = 'D-spl_lab_qua_'
def split_label_quality(PATH, filename, label_quality, save = False, old = True, suffix = Dsuffix): # label_quality = 'reliable' or 'unreliable'
  """Return the rows of `filename` whose 'label_quality' equals `label_quality`.

  The cache file is PATH / (suffix + label_quality[:3] + '_' + filename).
  With old=True an existing cache file is read back; with save=True a freshly
  filtered frame is written to it (saving is off by default).
  """
  cached_csv = PATH / (suffix + label_quality[:3] + '_' + filename)

  # Fast path: reuse the previously stored subset.
  if old and cached_csv.exists():
    return pd.read_csv(cached_csv)

  all_rows = pd.read_csv(PATH / filename)
  selected = all_rows.loc[all_rows['label_quality'] == label_quality]
  if save:
    selected.to_csv(cached_csv, index=False)
  return selected


This is a constant used to find the number of elements needed to extract the amount of 'reliable' elements that the user chose.

In [0]:
def return_gamma(num_rel, num_D, alpha, pi, tetha = 0.0):
  """Return the scaling constant gamma for the requested reliable proportion.

  `alpha` and the observed proportion num_rel / num_D are compared after
  rounding both to two decimals; the formula used depends on whether the
  requested proportion is above, equal to, or below the observed one.
  """
  observed = num_rel / num_D
  requested_2d = float(format(alpha, '.2f'))
  observed_2d = float(format(observed, '.2f'))

  if requested_2d > observed_2d:
    gamma = (alpha / observed)*(1 + tetha) - tetha
  elif requested_2d == observed_2d:
    gamma = 1 + (1 / (pi*num_D) )
  elif requested_2d < observed_2d:
    gamma = ((1 - alpha) / (1 - observed))*(1 + tetha) - tetha
  return gamma

This is the final function to split the data for the language that the user wants to extract.

In [None]:
Esuffix = 'E-spl_dat_'
#(!)# filename = 'train.csv'
def split_data(PATH, filename, language, pi, alpha, seed, save = True, old = True, suffix = Esuffix):
  """Build the modelling subset for one language and return it.

  The result is cached as PATH / (suffix + language[:3] + '_' + filename);
  with old=True and the cache present it is simply read back. Otherwise the
  language subset is split with split_set() using a proportion gamma*pi, the
  'reliable' rows of that split are kept, and 'unreliable' rows are sampled
  to complete it. The remaining rows (the other side of the split plus the
  unsampled 'unreliable' rows) are written, when save=True, to the companion
  file whose name has '~' before `filename`.
  """
  tetha = 0.00 # Adjustment parameter for gamma; note the call below passes the literal 0.00 rather than this variable.
  if old == True and (PATH / (suffix + language[:3] + '_' + filename)).exists():
    df = pd.read_csv(PATH / (suffix + language[:3] + '_' + filename))
  if old == False or not(PATH / (suffix + language[:3] + '_' + filename)).exists():
    # The per-language file is always taken from the cache when available.
    df = split_language(PATH, filename, language, old = True) #old = old)
    num_D = len(df)
    # Count the 'reliable' rows of the whole language subset to derive gamma.
    df_rel = split_label_quality(PATH, Bsuffix + language[:3] + '_' + filename, 'reliable', save = False, old = old)
    num_rel = len(df_rel)
    gamma = return_gamma(num_rel, num_D, alpha, pi, tetha = 0.00)
    df = split_set(PATH, (Bsuffix + language[:3] + '_' + filename), gamma*pi, seed, return_train = True, old = old)
    num_D_train = len(df)
    # Separate the stored split (written by split_set) by label quality.
    df_rel = split_label_quality(PATH, Csuffix + 'train_' + Bsuffix + language[:3] + '_' + filename, 'reliable', save = False, old = old)
    num_rel_train = len(df_rel)
    df_unr = split_label_quality(PATH, Csuffix + 'train_' + Bsuffix + language[:3] + '_' + filename, 'unreliable', save = False, old = old)
    num_unr_train = len(df_unr)
    # Sample int(num_D*pi - num_rel_train) 'unreliable' rows; the rest becomes df_residual.
    df_residual, df_unr = train_test_split(df_unr, test_size=int(num_D*pi-num_rel_train), random_state = seed) # Stratification by category was removed here because it raised an error.
    df = pd.concat([df_rel,df_unr])
    if save == True:
      df.to_csv(PATH / (suffix + language[:3] + '_' + filename), index=False)
    # Complement: the other side of the stored split plus the unsampled 'unreliable' rows.
    df_conj = split_set(PATH, (Bsuffix + language[:3] + '_' + filename), gamma*pi, seed, return_train = False, old = True)
    df_conj = pd.concat([df_conj, df_residual])
    if save == True:
      df_conj.to_csv(PATH / (suffix + language[:3] + '_' + '~' + filename), index=False)
  return df

This parameter is only activated when the proportion of 'reliable' elements chosen by the user is equal to the total proportion of 'reliable' elements that exists in the whole dataset. It corrects the infinite result that the constant gamma would otherwise return.

In [None]:
#(!)# filename = 'spa_train.csv' or 'por_train.csv'
def return_zetta(num_D_train, pi, alpha, filename):
  """Return the correction factor zetta.

  The factor is 1 / (1 - 1 / num_D_train) when `alpha`, rounded to two
  decimals, equals the rounded proportion of 'reliable' rows computed from
  the per-language file Bsuffix + filename; otherwise it is 1.0.
  """
  reliable_rows = split_label_quality(PATH, Bsuffix + filename, 'reliable', save = False, old = True)
  observed = len(reliable_rows) * (1 - pi) / num_D_train
  requested_2d = float(format(alpha, '.2f'))
  observed_2d = float(format(observed, '.2f'))
  if requested_2d == observed_2d:
    return 1 / (1 - 1 / num_D_train)
  return 1.0

This parameter returns the proportion of elements used to evaluate the model predictions.

In [0]:
def return_epsilon(pi, beta):
  """Return ((1/beta) - 1) / ((1/pi) - 1)."""
  numerator = (1/beta) - 1
  denominator = (1/pi) - 1
  return numerator/denominator

In [0]:
Ksuffix = 'K-spl_test_'
# For the total data used in obtaining the model, alpha is the percentage of the data used to train the model, and 1-alpha is the data used to evaluate it.
#(!)# filename = Esuffix + language[:3] + '_~' + 'train.csv'
def split_test(PATH, filename, language, pi, alpha, beta, seed, save = True, old = True, suffix = Ksuffix): #Filename = 'train.csv' or 'test.csv'
  """Sample the evaluation subset from `filename` and return it.

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV files.
    filename: CSV to sample from; any Esuffix in it is stripped to build the
      cache name.
    language: not used by the body; kept for interface compatibility.
    pi, beta: combined by return_epsilon() into the sampled proportion.
    alpha: forwarded to return_zetta().
    seed: random_state of the stratified split.
    save: when True, write the sample to PATH / (suffix + filename_root).
    old: when True and that cache file exists, read it back instead of
      sampling again.
    suffix: prefix of the cache file name.

  Returns:
    The sampled DataFrame.
  """
  #(!)# filename_root = 'spa_~train.csv' or 'por_~train.csv'
  filename_root = filename.replace(Esuffix,"")
  # The same path is used to look up and to write the cache. The original
  # rebuild test looked for suffix + filename, a file that is never written,
  # so the sample was recomputed on every call.
  cache_path = PATH / (suffix + filename_root)
  if old and cache_path.exists():
    return pd.read_csv(cache_path)

  epsilon = return_epsilon(pi, beta)
  df = pd.read_csv(PATH / (filename))
  num_D_train = len(df) + 1
  zetta = return_zetta(num_D_train, pi, alpha, filename_root.replace("~",""))
  # Remove `stratify` if it raises an error for very small categories.
  _, df = train_test_split(df, test_size=int(zetta*epsilon*len(df)), random_state=seed, stratify=df.category)
  if save:
    df.to_csv(cache_path, index=False)
  return df


In [0]:
import unicodedata
import string
def normalize_title(title, digits_number, chars_number, test = False):
    """Normalize a title: lower-case, strip accents and punctuation, filter tokens.

    Parameters:
      title: the raw title (str).
      digits_number: 0 keeps only purely alphabetic tokens; a positive value
        drops tokens that are all digits and at most that many characters
        long; a negative value applies no digit filter.
      chars_number: minimum length a token needs to be kept.
      test: when True, a title that ends up with no tokens is replaced by the
        placeholder "<UNK> <UNK> <UNK>" so the row is never empty.

    Returns:
      The remaining tokens joined by single spaces.
    """
    # Lower-case, decompose accented characters and drop everything non-ASCII.
    title = unicodedata.normalize('NFKD', title.lower()).encode('ASCII', 'ignore').decode('utf8')
    # Hyphens (single or repeated) act as word separators.
    title = title.replace('-', ' ')
    tokens = title.split()

    # Remove punctuation characters from each token.
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    if digits_number == 0:
        tokens = [word for word in tokens if word.isalpha()]
    elif digits_number > 0:
        tokens = [word for word in tokens if not(word.isdigit() and len(word)<=digits_number)]
    # Drop tokens shorter than the minimum length.
    tokens = [word for word in tokens if len(word)>=chars_number]

    # The placeholder is applied after filtering, so titles made only of
    # punctuation or dropped tokens are covered as well as empty input.
    # NOTE(review): the tokenizer's special token is spelled '<UKN>' elsewhere
    # in this file; confirm whether '<UNK>' here is intentional.
    if test == True and len(tokens) == 0:
        tokens = ["<UNK>", "<UNK>", "<UNK>"]

    return ' '.join(tokens)

We split the data into two parts: one is used to train the model, and the other to evaluate it.

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
Fsuffix = 'F-nor_tit_lab_'
pd.options.mode.chained_assignment = None

def normalize_title_label(PATH, filename, digits_number, chars_number, save = True, old = True, test = False, suffix = Fsuffix): #Filename = 'train.csv' or 'test.csv'
  """Normalize the 'title' column of `filename` with normalize_title().

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV files.
    filename: CSV file name inside PATH; it must have a 'title' column.
    digits_number, chars_number, test: forwarded to normalize_title().
    save: when True, write the result to PATH / (suffix + filename).
    old: when True and that cache file exists, read it back instead of
      normalizing again.
    suffix: prefix of the cache file name.

  Returns:
    The normalized DataFrame. Rows whose title is missing, 'nan' or empty are
    dropped unless test is True.
  """
  cache_path = PATH / (suffix + filename)
  if old and cache_path.exists():
    return pd.read_csv(cache_path)

  df = pd.read_csv(PATH / filename)
  # Assign the whole column at once: element-wise chained assignment
  # (df['title'][i] = ...) is not guaranteed to write into `df`.
  df['title'] = [normalize_title(title, digits_number, chars_number, test = test) for title in df['title']]
  if test == False: # Empty titles are only dropped for training data; test data keeps every row.
    df = df[~df.title.isna() & (df.title != 'nan') & (df.title != '')]
  if save:
    df.to_csv(cache_path, index=False)
  return df


In [0]:
import unicodedata
import string
def trim_title(title, max_len): #This function trims the title to at most 'max_len' chars.
    """Keep the longest prefix of whole words that fits in `max_len` characters.

    Words are separated by whitespace and re-joined with single spaces. A
    word is kept while the zero-based index of its last character in the
    re-joined text is below `max_len`. An empty or whitespace-only title, or
    a first word that does not fit, yields ''.
    """
    kept = []
    used = 0  # length of the text formed by the kept words
    for word in title.split():
        candidate = len(word) if not kept else used + 1 + len(word)
        # Compare the index of the last character (candidate - 1) so the
        # result is unchanged for fractional max_len values.
        if candidate - 1 >= max_len:
            break
        kept.append(word)
        used = candidate
    return ' '.join(kept)

Now we trim each title to the maximum length in characters used by the model.

In [0]:
FCsuffix = ''
pd.options.mode.chained_assignment = None
def trim_title_label(PATH, filename, max_len, save = True, suffix = FCsuffix): #Filename = 'train.csv' or 'test.csv'
  """Trim every title of `filename` with trim_title() and drop empty ones.

  Parameters:
    PATH: directory (pathlib.Path) that holds the CSV file.
    filename: CSV file name inside PATH; it must have a 'title' column.
    max_len: forwarded to trim_title().
    save: when True, write the result to PATH / (suffix + filename). With the
      default empty suffix this overwrites the input file.
    suffix: prefix of the output file name.

  Returns:
    The trimmed DataFrame without rows whose title is missing, 'nan' or ''.
  """
  df = pd.read_csv(PATH /  filename)
  # Assign the whole column at once: element-wise chained assignment
  # (df['title'][i] = ...) is not guaranteed to write into `df`.
  df['title'] = [trim_title(title, max_len) for title in df['title']]
  df = df[~df.title.isna() & (df.title != 'nan') & (df.title != '')]
  if save:
    df.to_csv(PATH / (suffix + filename), index=False)
  return df

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
Gsuffix = 'G-sor_dat_'
def sort_data(PATH, filename, save = True, old = True, suffix = Gsuffix):
  """Return `filename` sorted by 'category', 'label_quality' and 'title'.

  The sorted frame is cached as PATH / (suffix + filename): it is read back
  when old=True and the file exists, and written when save=True after a
  fresh sort.
  """
  sorted_csv = PATH / (suffix + filename)

  # Fast path: reuse the previously sorted file.
  if old and sorted_csv.exists():
    return pd.read_csv(sorted_csv)

  ordering = ['category', 'label_quality', 'title']
  ordered = pd.read_csv(PATH / filename).sort_values(by = ordering)
  if save:
    ordered.to_csv(sorted_csv, index=False)
  return ordered


In [0]:
from numpy import array
from pickle import dump
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from random import randint

# save tokens to file, one dialog per line
def save_segments(segments, filename):
  """Write `segments` to `filename`, one per line (no trailing newline).

  The file is opened in text mode and overwritten; the `with` block closes
  it even if the write fails.
  """
  data = '\n'.join(segments)
  with open(filename, 'w') as file:
    file.write(data)
 
def load_file(filename):
  """Return the whole content of the text file `filename` as one string.

  The `with` block closes the file even if reading fails.
  """
  with open(filename,'r') as file:
    return file.read()

In [0]:
def return_maxchars(language, save = True, old = True): #Maximum number of chars of the Mercado Libre titles
  """Return the longest line length found in the two normalized data files.

  Parameters:
    language: language name; its first three letters select the files
      Fsuffix + Esuffix + <lan> + '_train.csv' and
      Fsuffix + Ksuffix + <lan> + '_~train.csv' inside the global PATH.
    save: when True, store the value in PATH / ('FA-ret_maxchars_' + <lan> + '.dat').
    old: when True and that '.dat' file exists, read the value from it.

  Returns:
    The maximum length as a NumPy integer.

  NOTE(review): lengths are measured on the raw lines of the CSV files
  (header and all columns included), not on the 'title' column alone --
  confirm this is intended.
  """
  import numpy as np
  FAsuffix = 'FA-ret_maxchars_'
  # One path for both lookup and storage. The original rebuild guard checked
  # a '.txt' file that is never written, so the '.dat' cache was never used.
  cache_path = PATH / (FAsuffix + language[:3] +'.dat')
  if old and cache_path.exists():
    # ndmin=1 keeps a one-element array so that [0] works for a single value.
    return np.loadtxt(cache_path, dtype = int, ndmin = 1)[0]

  filenames = [
      Fsuffix + Esuffix + language[:3] + '_' + 'train.csv',
      Fsuffix + Ksuffix + language[:3] + '_' + '~train.csv',
  ]
  maxchars = []
  for name in filenames:
    lines = load_file(PATH / name).split('\n')
    maxchars.append(max(len(line) for line in lines))
  maxchars_model = np.array([max(maxchars)])
  if save:
    np.savetxt(cache_path, maxchars_model, fmt='%i')
  return maxchars_model[0]

In [0]:
#The parameter zigma scales the average length so different lengths can be tried across model iterations to find the best score.
def return_avgchars(language, zigma = 1.0, save = True, old = True): #Mean or average number of chars of the Mercado Libre titles
  """Return zigma times the larger of the two per-file average line lengths.

  Parameters:
    language: language name; its first three letters select the files
      Fsuffix + Esuffix + <lan> + '_train.csv' and
      Fsuffix + Ksuffix + <lan> + '_~train.csv' inside the global PATH.
    zigma: multiplier applied to the stored average before returning it.
    save: when True, store the unscaled value in
      PATH / ('FB-ret_avgchars_' + <lan> + '.dat').
    old: when True and that '.dat' file exists, read the value from it.

  Returns:
    zigma * average, where each file's average is truncated to an integer.

  NOTE(review): lengths are measured on the raw lines of the CSV files
  (header and all columns included), not on the 'title' column alone --
  confirm this is intended.
  """
  import numpy as np
  FBsuffix = 'FB-ret_avgchars_'
  # One path for both lookup and storage. The original rebuild guard checked
  # a '.txt' file that is never written, so the '.dat' cache was never used.
  cache_path = PATH / (FBsuffix + language[:3] +'.dat')
  if old and cache_path.exists():
    # ndmin=1 keeps a one-element array so that [0] works for a single value.
    avgchars_model = np.loadtxt(cache_path, dtype = int, ndmin = 1)
    return zigma*avgchars_model[0]

  filenames = [
      Fsuffix + Esuffix + language[:3] + '_' + 'train.csv',
      Fsuffix + Ksuffix + language[:3] + '_' + '~train.csv',
  ]
  avgchars = []
  for name in filenames:
    lines = load_file(PATH / name).split('\n')
    len_lines = [len(line) for line in lines]
    avgchars.append(int(sum(len_lines)/len(len_lines)))
  avgchars_model = np.array([max(avgchars)])
  if save:
    np.savetxt(cache_path, avgchars_model, fmt='%i')
  return zigma*avgchars_model[0]

In [0]:
def return_maxdimX(chars_number, maxchars):
    """Return the maximum number of words that fit in `maxchars` characters.

    Each word is assumed to take at least `chars_number` characters plus one
    separator; the quotient is truncated to an int.
    """
    slot_width = chars_number + 1
    return int(maxchars / slot_width)

In [0]:
Hsuffix = 'H-gen_seg_'
filename_root = 'train.csv'
#(!)# filename = Fsuffix + Esuffix + language[:3] + '_' + filename_root
def generate_segments(PATH, filename, chars_number, maxchars, type_data = 'train', save = True, old = True, suffix = Hsuffix):
  """Turn each row of `filename` into a padded text segment.

  Every title is padded with ' <NULL>' tokens up to the word count given by
  return_maxdimX(chars_number, maxchars). For type_data == 'train' the row's
  category is appended as the last token; for 'test' only the padded title
  is kept. Segments are cached, one per line, in PATH / (suffix + filename)
  and read back when old=True and that file exists.
  """
  segments_path = PATH / (suffix + filename)

  # Fast path: reuse the segments written by a previous run.
  if old and segments_path.exists():
    return load_file(segments_path).split('\n')

  segments = list()
  df = pd.read_csv(PATH / (filename))
  array_titles = array(df['title'])
  if type_data == 'train':
    array_categories = array(df['category'])
  maxdimX = return_maxdimX(chars_number, maxchars)

  for row in range(len(df)):
    padded = str(array_titles[row])
    missing = maxdimX - (padded.count(' ') + 1)
    if missing > 0:
      padded = padded + ' <NULL>' * missing
    if type_data == 'train':
      segments.append( padded + ' ' + str(array_categories[row]) )
    if type_data == 'test':
      segments.append( padded )

  if save:
    save_segments(segments, segments_path)
  return segments


**Encode Segments**

Now we set the word_index to our preference: the special tokens come first in the dictionary, then the list of categories, and finally our vocabulary words.

In [0]:
Isuffix = 'I-gen_tok_'
special_tokens_label = ['<NULL>','<UKN>'] #'<NULL>' and '<UKN>' are special tokens reserved for the analysis.
special_tokens = len(special_tokens_label)
#(!) filename = Fsuffix + Esuffix + language[:3] + '_' + 'train.csv',
def generate_tokenizer(PATH, filename, num_words, chars_number, maxchars, type_data = 'train', save = True, 
                       old = True, return_parameter = 'tokenizer, sequences', 
                       special_tokens_label = special_tokens_label, suffix = Isuffix):
  """Build (or load) the Keras tokenizer for `filename` and encode its segments.

  Parameters:
    PATH: directory (pathlib.Path) that holds the data and cache files.
    filename: CSV whose segments come from generate_segments().
    num_words: vocabulary budget added on top of the special tokens and the
      categories when creating the Tokenizer.
    chars_number, maxchars, type_data: forwarded to generate_segments().
    save: when True, pickle a freshly built tokenizer to
      PATH / (suffix + filename).
    old: when True and that pickle exists, load it instead of rebuilding.
    return_parameter: 'tokenizer' returns the tokenizer, 'vocabulary_len'
      returns len(word_index) + 1, 'tokenizer, sequences' returns both the
      tokenizer and the encoded segments; any other value returns None.
    special_tokens_label: tokens placed first in the word index.
    suffix: prefix of the pickle file name.

  The rebuilt word index is laid out as: special tokens (ids from 1), then
  the categories from return_categories(), then the remaining words found by
  the tokenizer. Id 0 stays reserved by the tokenizer.
  """
  # The pickle is written to and read from the same path. The original wrote
  # it under Jsuffix but looked it up under Isuffix, so it was rebuilt on
  # every call.
  tokenizer_path = PATH / (suffix + filename)
  segments = generate_segments(PATH, filename, chars_number, maxchars, type_data = type_data, save = True, old = True)

  if old and tokenizer_path.exists():
    # NOTE: unpickling executes code from the file; only load pickles written
    # by this notebook.
    with open(tokenizer_path, 'rb') as handle:
      tokenizer = load(handle)
  else:
    special_tokens = len (special_tokens_label)
    list_categories = return_categories()
    number_categories = return_number_categories()
    tokenizer = Tokenizer(num_words = special_tokens + number_categories + num_words, filters = '', lower = False, split = ' ', char_level = False,
                          oov_token = '<UKN>', document_count = 0)
    tokenizer.fit_on_texts(segments)
    word_keys = list(tokenizer.word_index)
    word_index = {} #Notice that id 0 is reserved by the tokenizer.
    for i in range(special_tokens):
      word_index[special_tokens_label[i]] = i + 1
    for i in range(number_categories):
      word_index[list_categories[i]] = i + 1 + special_tokens
    # The first `special_tokens` keys of the fitted index are skipped; words
    # already placed above (e.g. categories) keep their reserved id.
    for i in range(len(word_keys)-special_tokens):
      if word_keys[i+special_tokens] not in word_index:
        word_index[word_keys[i+special_tokens]] = i + 1 + special_tokens + number_categories
    tokenizer.word_index = word_index
    if save:
      with open(tokenizer_path, 'wb') as handle:
        dump(tokenizer, handle)

  sequences = tokenizer.texts_to_sequences(segments)

  if return_parameter == 'tokenizer':
    return tokenizer
  if return_parameter == 'vocabulary_len':
    return len(tokenizer.word_index) + 1
  if return_parameter == 'tokenizer, sequences':
    return tokenizer, sequences


In [0]:
import numpy as np
Jsuffix = 'J-gen_seq_'
#(!) filename = Fsuffix + Esuffix + 'spa_' + 'train.csv'
def generate_sequences(PATH, filename, language, num_words, digits_number, chars_number, type_data = 'train', save = True, old = True,
                       return_parameter = 'Xy', suffix = Jsuffix):
  """Encode `filename` into a 2-D integer array of token ids.

  The segment width comes from return_avgchars(language) through
  return_maxdimX(). For type_data == 'test' each row has maxdimX ids; for
  'train' each row has maxdimX + 1 ids (the last one is the category).
  return_parameter selects the result: 'Xy' the full array, 'X' all columns
  but the last, 'y' the last column (the last two only for 'train'), and
  'vocabulary_len' the vocabulary size reported by generate_tokenizer().
  """
  maxchars = return_avgchars(language) #maxchars = return_maxchars(language)
  tokenizer, sequences = generate_tokenizer(PATH, filename, num_words, chars_number, maxchars, type_data = type_data, save = save, old = old, suffix = Isuffix)

  # Flatten all encoded segments into one list before reshaping.
  flat_ids = [token_id for sequence in sequences for token_id in sequence]
  maxdimX = return_maxdimX(chars_number,maxchars)

  if type_data == 'test':
    array_sequences = np.array(flat_ids).reshape(-1, maxdimX)
  if type_data == 'train':
    array_sequences = np.array(flat_ids).reshape(-1, maxdimX+1)
    if return_parameter == 'X': # Keep only the input columns.
      array_sequences = array_sequences[:,:-1]
    if return_parameter == 'y': # Keep only the target column.
      array_sequences = array_sequences[:,-1]

  if return_parameter == 'vocabulary_len':
    # old = True so the tokenizer built at the top of this function is reused
    # instead of being generated again.
    return generate_tokenizer(PATH, filename, num_words, chars_number, maxchars, save = save, old = True, return_parameter = 'vocabulary_len', suffix = Isuffix)
  if return_parameter in ('Xy', 'X', 'y'):
    return array_sequences

In [0]:
def return_num_D(filename_root, language):
  """Return the number of rows of the per-language file for `filename_root`.

  The file Bsuffix + language[:3] + '_' + filename_root is read from the
  global PATH when it exists; otherwise it is produced by split_language().
  """
  language_csv = PATH / (Bsuffix + language [:3] + '_' + filename_root)
  if language_csv.exists():
    rows = pd.read_csv(language_csv)
  else:
    rows = split_language(PATH, filename_root, language, old = True)
  return len(rows)


In [0]:
#(!)# filename_root = 'train.csv'
def generate_data_model(PATH, filename_root, language, psi, beta, alpha, zigma, chars_number, digits_number, seed, save = True, old = True):
  """Run the data-preparation pipeline for one language.

  Steps: split_data(), normalize_title_label() on its output, split_test()
  on the complementary file, normalize_title_label() on that sample, and
  finally trim_title_label() on both normalized files using
  return_avgchars(language, zigma) as the maximum length. The proportion
  passed to the split functions is pi = psi * beta.

  With old=True the pipeline is skipped when both normalized files
  (Fsuffix + Esuffix + <lan> + '_' + filename_root and
  Fsuffix + Ksuffix + <lan> + '_' + filename_root) already exist in PATH.
  Progress and a five-row preview of every step are printed. Returns None.
  """
  normalized_train = PATH / (Fsuffix + Esuffix + language[:3] + '_' + filename_root)
  normalized_test = PATH / (Fsuffix + Ksuffix + language[:3] + '_' + filename_root)
  if old and normalized_train.exists() and normalized_test.exists():
    print("\t\t\t\tgenerate_data_model() previously executed")
    return

  pi = psi * beta

  df = split_data(PATH, filename_root, language, pi, alpha, seed, save = save, old = old)
  print("\t\t\t\tsplit_data() finished\n",df.head(n = 5))
  # `old` is forwarded as-is; the original passed `old = save` here, which
  # reused a stale cache whenever a rebuild was requested with save=True.
  df = normalize_title_label(PATH , Esuffix + language[:3] + '_' + filename_root, digits_number, chars_number, save = save, old = old)
  print("\t\t\t\tnormalize_title_label() data finished\n",df.head(n = 5))
  df = split_test(PATH, Esuffix + language[:3] + '_~' + filename_root, language,  pi, alpha, beta, seed, save = save, old = old) 
  print("\t\t\t\tsplit_test() finished\n",df.head(n = 5))
  df = normalize_title_label(PATH , Ksuffix + language[:3] + '_~' + filename_root, digits_number, chars_number, save = save, old = old) 
  print("\t\t\t\tnormalize_title_label() test finished\n",df.head(n = 5))

  # The average length is always recomputed from the files just written.
  avgchars = return_avgchars(language, zigma = zigma, save = True, old = False)
  df = trim_title_label(PATH, Fsuffix + Esuffix + language[:3] + '_' + filename_root, max_len = avgchars, save = True)
  print("\t\t\t\ttrim_title_label() data finished\n",df.head(n = 5))
  df = trim_title_label(PATH, Fsuffix + Ksuffix + language[:3] + '_~' + filename_root, max_len = avgchars, save = True)
  print("\t\t\t\ttrim_title_label() test finished\n",df.head(n = 5))

  # sort_data() is intentionally not called: sorting is not needed for
  # training, nor for the evaluation file '~train.csv'.


In [0]:
#(!)#  filename_root = 'train.csv'
def return_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, return_parameter = 'train', save = True, old = True):
  """Return the encoded sequences of the training or evaluation file.

  For return_parameter == 'train' the normalized training file is encoded
  and returned as (X, y), where y is the last column. For 'test' the
  normalized evaluation file ('_~' variant) is encoded and returned as X.
  Any other value returns None.
  """
  if return_parameter == 'train':
    # If accuracy drops, try the sorted file instead
    # (Gsuffix + Fsuffix + Esuffix + language[:3] + '_' + filename_root).
    train_file = Fsuffix + Esuffix + language[:3] + '_' + filename_root
    encoded = generate_sequences(PATH, train_file, language,
                                 num_words, digits_number, chars_number, type_data = 'train',
                                 return_parameter = 'Xy', save = save, old = old)
    return encoded[:,:-1], encoded[:,-1]

  if return_parameter == 'test':
    test_file = Fsuffix + Ksuffix + language[:3] + '_~' + filename_root
    return generate_sequences(PATH, test_file, language,
                              num_words, digits_number, chars_number, type_data = 'test',
                              return_parameter = 'Xy', save = save, old = old)

**Obtaining the model**

In [0]:
#(!)# filename_root = 'train.csv'

# Default hyper-parameters of generate_model().
kappa = 2
hidden_size = 50
activation = ['relu', 'softmax']
loss = 'categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']
batch_size = 128
epochs = 100

def generate_model(PATH, filename_root, language, num_words, digits_number, 
                   chars_number, kappa = kappa, hidden_size = hidden_size, activation = activation, loss = loss, optimizer = optimizer, 
                   metrics = metrics, batch_size = batch_size, epochs = epochs, old = True):
  """Train the LSTM classifier for one language and save it as an .h5 file.

  The model file is PATH / ('model_' + language[:3] + '_' +
  filename_root[:-4] + '.h5'). With old=True and the file present nothing is
  trained. Otherwise the training sequences are loaded with
  return_sequences(), the target is one-hot encoded over
  special_tokens + number_categories + num_words + 1 classes, and a
  Sequential model (Embedding, two LSTM layers of kappa*hidden_size units,
  one Dense layer with activation[0], output Dense with activation[1]) is
  compiled, fitted and saved. Returns None.
  """
  model_path = PATH / ('model_' + language[:3] + '_' + filename_root[:-4] + '.h5')

  if model_path.exists() and old:
    print("\t\t\t\t\t\t\t\tgenerate_model() previously executed")
    return

  X, y = return_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, 
                          return_parameter = 'train', save = True, old = True)

  number_categories = return_number_categories()
  vocabulary_len = special_tokens + number_categories + num_words + 1
  y = to_categorical(y, num_classes = vocabulary_len)
  seq_length = X.shape[1]

  # define model
  model = Sequential()
  model.add(Embedding(vocabulary_len, hidden_size, input_length = seq_length))
  model.add(LSTM(int(kappa*hidden_size), return_sequences = True))
  model.add(LSTM(int(kappa*hidden_size)))
  model.add(Dense(int(kappa*hidden_size), activation = activation[0]))
  model.add(Dense(vocabulary_len, activation = activation[1]))
  # summary() prints the table itself and returns None, so it is not wrapped
  # in print().
  model.summary()

  # Compile model
  model.compile(loss = loss, optimizer = optimizer, metrics = metrics)
  # fit model
  model.fit(X, y, batch_size = batch_size, epochs = epochs)

  # save the model; the path is passed as str because some Keras/h5py
  # releases do not accept pathlib.Path objects (TODO confirm for the
  # installed version).
  model.save(str(model_path))
  print("\t\t\t\t\t\t\t\tgenerate_model() executed")


Now we create the functions that return a range of numbers for each of our parameters in the iterations of the model.

In [0]:
import numpy as np

def F(n): #Fibonacci function
    """Return the n-th Fibonacci number (F(0) = 0, F(1) = 1).

    Computed iteratively in O(n) steps instead of by double recursion.

    Raises:
      ValueError: if n is negative.
    """
    if n < 0:
        raise ValueError("F(n) is only defined for n >= 0")
    previous, current = 0, 1
    for _ in range(int(n)):
        previous, current = current, previous + current
    return previous

def fj(j): #Proportion Fibonacci
  """Return the ratio F(j+2) / F(j+3) of consecutive Fibonacci numbers."""
  numerator = F(j+2)
  denominator = F(j+3)
  return numerator/denominator

def return_fibo(a, b, N, return_int = False): #Return a range in fibo numbers.
  """Return up to N values between a and b spaced by Fibonacci ratios.

  Starting from b, each step moves the current value towards a by the
  factor (1 - fj(i)). The endpoints a and b are always included, duplicates
  are removed, and the result is ascending when b >= a and descending
  otherwise. With return_int=True the endpoints and every intermediate
  value are truncated with int().
  """
  # Bj is filled from b (index 0) towards a (index N-1).
  Bj = np.zeros(N)
  if return_int == True:
    Bj[0] = int(b)
    Bj[N-1] = int(a)
  if return_int == False:
    Bj[0] = b
    Bj[N-1] = a
  # Number of intermediate points between the two endpoints.
  n = N-2
  bj = b
  for i in range(n):
    # Shrink the distance to `a` by the Fibonacci proportion fj(i).
    bj = a + (bj - a)*(1 - fj(i))
    if return_int == True:
      bj = int(bj)
    Bj[i+1] = bj
  # Bi is Bj reversed, so it starts at a and ends at b.
  Bi = np.zeros(N)
  for i in range(N):
    Bi[i] = Bj[N-1-i]
  if N == 1:
    Bi[0] = a
  # np.unique removes duplicates and sorts ascending.
  Bi = np.unique(Bi)
  # Restore descending order when the range goes from a larger a to a smaller b.
  if b - a < 0:
    Bi = np.flip(Bi)
  return Bi

def return_range(a, b, N, return_int = False): #Return a range in uniform distribution
  """Build up to N evenly spaced values from a to b, both bounds included.

  Args:
    a: first bound; it is the first element of the result.
    b: second bound; it is the last element of the result when N > 1.
    N: number of points requested (at least 1). N == 1 yields just a,
      matching return_fibo.
    return_int: when true, every value is truncated toward zero. The
      returned array still has a float dtype.

  Returns:
    1-D float numpy array without duplicates, ascending when a <= b and
    descending otherwise. Truncation can merge neighbours, so fewer than N
    values may be returned.

  Raises:
    ValueError: if N is smaller than 1.
  """
  if N < 1:
    raise ValueError("N must be at least 1, got %r" % (N,))
  # linspace includes both endpoints exactly and has no division by (N-1)
  # to fail when N == 1 or a == b.
  values = np.linspace(a, b, N)
  if return_int:
    values = np.trunc(values)
  # np.unique sorts ascending and drops duplicates.
  values = np.unique(values)
  if b - a < 0:
    values = np.flip(values)
  return values

def return_range_parameter(value, range_value, return_int = False): #Returns a range around a recommended value, given relative proportions.
  """Build candidate values around `value` from relative proportions.

  Args:
    value: the recommended (reference) value.
    range_value: indexable (low_ratio, high_ratio, steps). The bounds are
      int(value*low_ratio) and int(value*high_ratio), and the stride is the
      integer-truncated span divided by (steps - 1).
    return_int: when True, every element is passed through int() before
      the two-element special case below.

  Returns:
    Sorted 1-D float numpy array without duplicates.
  """
  steps = range_value[2]
  lower = int(value*range_value[0])
  upper = int(value*range_value[1])
  stride = int(value*(range_value[1]-range_value[0])/(steps-1))
  grid = np.arange(lower, upper, stride)
  candidates = np.zeros(steps)
  # The first steps-1 grid points are kept; the last slot is the truncated upper bound.
  candidates[:-1] = [grid[k] for k in range(steps-1)]
  candidates[-1] = upper
  if return_int == True:
    candidates = np.array([int(v) for v in candidates], dtype = float)
  if len(candidates) == 2:
    # With exactly two points the upper bound is stored untruncated, even when
    # return_int is True. NOTE(review): confirm this is intended.
    candidates[1] = value*range_value[1]
  return np.unique(candidates)



**Obtaining the Best Accuracy Model**

In [0]:
# THIS AREA HELPS US TO FIND THE BEST PARAMETER THAT RETURNS THE BEST BACC ACCURACY
import time
def predict_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, return_y = False, return_parameter = 'train'):
  """Load the saved model for `language` and predict classes for its sequences.

  Args:
    PATH: directory holding the sequence files and the saved model.
    filename_root: data file name; its last four characters are stripped
      to build the model file name ('model_<lan>_<root>.h5').
    language: language name; its first three letters select the model.
    num_words, digits_number, chars_number: forwarded to return_sequences.
    return_y: when True, also return the labels (only available for the
      'train' sequences).
    return_parameter: 'train' (return_sequences yields X and y) or 'test'
      (return_sequences yields X only).

  Returns:
    yhat, or the tuple (y, yhat) when return_y is True.

  Raises:
    ValueError: if return_parameter is not 'train' or 'test', or if labels
      are requested for the 'test' sequences. These combinations previously
      failed later with an unbound local variable.
  """
  if return_parameter not in ('train', 'test'):
    raise ValueError("return_parameter must be 'train' or 'test', got %r" % (return_parameter,))
  if return_y == True and return_parameter == 'test':
    raise ValueError("return_y = True requires return_parameter = 'train': the test sequences have no labels")
  y = None
  if return_parameter == 'train':
    X, y = return_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, return_parameter = 'train', save = True, old = True)
  else:
    X = return_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, return_parameter = 'test', save = True, old = True)
  model = load_model(PATH / ('model_' + language[:3] + '_' + filename_root[:-4] + '.h5'))
  yhat = model.predict_classes(X, verbose=0)
  if return_y == True:
    return y, yhat
  return yhat

from sklearn.metrics import balanced_accuracy_score
def return_accuracy(y, yhat, type = 'BACC'):
  """Score predictions against the true labels.

  Args:
    y: true labels.
    yhat: predicted labels.
    type: metric name. Only 'BACC' (sklearn's balanced_accuracy_score) is
      implemented.

  Returns:
    The metric value.

  Raises:
    ValueError: if `type` is not a supported metric name (previously this
      failed with an unbound local variable).
  """
  if type != 'BACC':
    raise ValueError("Unsupported accuracy type %r; only 'BACC' is implemented" % (type,))
  return balanced_accuracy_score(y, yhat)

# Prefix of the per-language CSV file in which the search loop below records
# every evaluated model (its parameters, accuracy and elapsed time).
Osuffix = 'models_accuracy_'
filename_root = 'train.csv'
languages = ['portuguese', 'spanish']
seed = 8 # Seed used for the training model.
psi = 1/200 # Proportion of the total data used in the analysis of the model, for each language.
beta = 0.30 # Proportion of all the data used in this model; 1-beta is the proportion of the data used to evaluate the model.

# Candidate values for each searched parameter. Every range below is built with
# N = 1, in which case return_fibo returns only its first argument, so each
# parameter currently has a single candidate value.
range_alpha = return_fibo(0.25,0.3,1) # Proportion of 'reliable' elements that the user wants to include in the model.
range_zigma = return_fibo(0.75,1.5,1) # Proportion of the average title length (in chars) of Mercado Libre titles; zigma = 1.0 makes the max chars length used in the model equal to the average chars in the data.
range_chars_number = return_fibo(5,8,1, return_int = True) # The minimum number of chars that a word can contain in the learning model.
range_digits_number = return_fibo(0,8,1, return_int = True) # The minimum number of digits that a word can contain in the learning model.

range_num_words = return_fibo(13000,21000,1, return_int = True) # The maximum number of words that the model learns.
range_kappa = return_fibo(1.5,3,1) # Multiplier applied to hidden_size to size the layers of the network (passed to generate_model as kappa).
range_hidden_size = return_fibo(125,150,1, return_int = True) # Passed to generate_model as hidden_size.
range_batch_size = return_fibo(64, 128, 1, return_int = True) # Passed to generate_model as batch_size.
range_epochs = return_fibo(250,300,1, return_int = True) # Passed to generate_model as epochs.

# Grid search over every combination of the range_* values, one language at a time.
# Each evaluated combination is appended as one row to the per-language results file
# PATH / (Osuffix + language[:3] + '_' + filename_root). Combinations already present
# in that file are skipped, so the search can be resumed after an interruption.
print("<--------------------------------------MODELS PARAMETERS-------------------------------------->")
print("FILENAME OUT:", Osuffix + filename_root)
print("LANGUAGES:",languages)
print("SEED:",seed)
print("PSI:",psi)
print("BETA:",beta)
for language in languages:
  print("LANGUAGE:", language)
  # The range_* arrays hold numpy floats; each value is cast to a plain Python
  # int/float before it is used or compared.
  for alpha in range_alpha:
    alpha = float(alpha)
    print("\tALPHA:",alpha)
    for zigma in range_zigma:
      zigma = float(zigma)
      print("\t\tZIGMA:",zigma)
      for chars_number in range_chars_number:
        chars_number = int(chars_number)
        print("\t\t\tCHARS_NUMBER:",chars_number)
        for digits_number in range_digits_number:
          digits_number = int(digits_number)
          print("\t\t\t\tDIGITS_NUMBER:",digits_number)
          # repeat_data: rows already recorded for this data configuration.
          # repeat_model: rows already recorded for the model configurations checked so far.
          # iterbreak: set as soon as one model configuration is missing from the results file.
          repeat_data = 0
          repeat_model = 0
          iterbreak = False
          if (PATH / (Osuffix + language[:3] + '_' + filename_root)).exists():
            df = pd.read_csv(PATH / (Osuffix + language[:3] + '_' + filename_root))
            # NOTE(original author): this filter was reported to raise errors when the
            # digits_number condition is included; the cause was not identified.
            # NOTE(review): the float columns are compared with exact equality after a
            # CSV round-trip, which may be related -- confirm.
            df_data = df.loc[(df['seed'] == seed) & (df['psi'] == psi) & (df['beta'] == beta) & (df['alpha'] == alpha) & (df['zigma'] == zigma) & (df['chars_number'] == chars_number) & (df['digits_number'] == digits_number)]
            repeat_data = len(df_data)
            if repeat_data > 0:
              # Look for the first model configuration without a recorded row; the
              # iterbreak flag unwinds all five nested loops once one is found.
              for i in range(len(range_num_words)):
                for h in range(len(range_kappa)):
                  for j in range(len(range_hidden_size)):
                    for k in range(len(range_batch_size)):
                      for l in range(len(range_epochs)):
                        df_model = df.loc[(df['seed'] == seed) & (df['psi'] == psi) & (df['beta'] == beta) & (df['alpha'] == alpha) & (df['zigma'] == zigma) & (df['chars_number'] == chars_number) & (df['digits_number'] == digits_number) & (df['num_words'] == int(range_num_words[i])) & (df['kappa'] == float(range_kappa[h])) & (df['hidden_size'] == int(range_hidden_size[j])) & (df['batch_size'] == int(range_batch_size[k])) & (df['epochs'] == int(range_epochs[l]))]
                        repeat_model += len(df_model)
                        if (len(df_model) == 0):
                          iterbreak = True
                          break
                      if iterbreak == True:
                        break
                    if iterbreak == True:
                      break
                  if iterbreak == True:
                    break
                if iterbreak == True:
                  break
          # Every model configuration for this data configuration is already recorded.
          if iterbreak == False and repeat_model > 0:
            print("\t\t\t\t\tgenerate_data_model() and generate_model() previously executed")
          # `and` binds tighter than `or`: this runs when at least one model configuration
          # is missing (iterbreak), or when nothing at all was found for this data
          # configuration (no results file, or no matching row).
          if iterbreak == True or iterbreak == False and repeat_model == 0:
            # The data is regenerated (old = False) and timed once; time_data is then
            # shared by every model trained on it. The file name is hard-coded here
            # rather than taken from filename_root.
            clocktime_data = time.time()
            generate_data_model(PATH, 'train.csv', language, psi, beta, alpha, zigma, chars_number, digits_number, seed, save = True, old = False)
            time_data = time.time() - clocktime_data
            # Fixed (not searched) training settings, recorded alongside each result.
            activation = ['relu', 'softmax']
            activation0 = activation[0]
            activation1 = activation[1]
            loss = 'categorical_crossentropy'
            optimizer = 'adam'
            metrics = ['accuracy']
            for num_words in range_num_words:
              num_words = int(num_words)
              print("\t\t\t\t\tNUM_WORDS:",num_words)
              for kappa in range_kappa:
                kappa = float(kappa)
                print("\t\t\t\t\tKAPPA:",kappa)
                for hidden_size in range_hidden_size:
                  hidden_size = int(hidden_size)
                  print("\t\t\t\t\t\tHIDDEN_SIZE:",hidden_size)
                  for batch_size in range_batch_size:
                    batch_size = int(batch_size)
                    print("\t\t\t\t\t\t\tBATCH_SIZE:", batch_size)
                    for epochs in range_epochs:
                      epochs = int(epochs)
                      print("\t\t\t\t\t\t\t\tEPOCHS:", epochs)
                      # The results file is re-read on every iteration because rows are
                      # appended to it inside this loop.
                      repeat_model = 0
                      if (PATH / (Osuffix + language[:3] + '_' + filename_root)).exists():
                        df = pd.read_csv(PATH / (Osuffix + language[:3] + '_' + filename_root))
                        df_model = df.loc[(df['seed'] == seed) & (df['psi'] == psi) & (df['beta'] == beta) & (df['alpha'] == alpha) & (df['zigma'] == zigma) & (df['chars_number'] == chars_number) & (df['digits_number'] == digits_number) & (df['num_words'] == num_words) & (df['kappa'] == kappa) & (df['hidden_size'] == hidden_size) & (df['batch_size'] == batch_size) & (df['epochs'] == epochs)]
                        repeat_model = len(df_model)
                      if (repeat_data > 0 and repeat_model > 0):
                        print("\t\t\t\t\t\t\t\t\t generate_model() previously executed")
                      
                      if not (repeat_data > 0 and repeat_model > 0):
                        # Train from scratch (old = False) and time the training.
                        clocktime_model = time.time()
                        generate_model(PATH, filename_root, language, num_words, digits_number, chars_number, kappa = kappa, hidden_size = hidden_size, activation = activation, loss = loss, optimizer = optimizer, metrics = metrics, batch_size = batch_size, epochs = epochs, old = False)
                        
                        time_model = time.time() - clocktime_model
                        
                        # The score is computed on the sequences returned for
                        # return_parameter = 'train'.
                        y, yhat = predict_sequences(PATH, filename_root, language, num_words, digits_number, chars_number, return_y = True, return_parameter = 'train')
                        
                        accuracy = return_accuracy(y, yhat, type = 'BACC')
                        
                        # One-row record of this run; "time" is data generation plus
                        # training, converted from seconds to minutes.
                        dictionary_model_parameter = {"accuracy":[accuracy], "time":[(time_data + time_model) / 60], "seed":[seed], "psi":[psi], 
                                                      "beta":[beta], "alpha":[alpha], "zigma":[zigma], "chars_number":[chars_number], 
                                                      "digits_number":[digits_number], "num_words":[num_words], "kappa":[kappa], "hidden_size":[hidden_size], 
                                                      "activation0":[activation0], "activation1":[activation1], "loss":[loss], "optimizer":[optimizer], "metrics":[metrics[0]], 
                                                      "batch_size":[batch_size], "epochs":[epochs]}
                        
                        model_parameter = pd.DataFrame(dictionary_model_parameter) 

                        # Append to the existing results file ...
                        if (PATH / (Osuffix + language[:3] + '_' + filename_root)).exists():
                          models_parameters = pd.read_csv(PATH / (Osuffix + language[:3] + '_' + filename_root))
                          models_parameters = pd.concat([models_parameters, model_parameter])
                          models_parameters.to_csv(PATH / (Osuffix + language[:3] + '_' + filename_root), index = False)

                        # ... or create it with this first row. After the branch above the
                        # file always exists, so only one of the two branches writes.
                        if not(PATH / (Osuffix + language[:3] + '_' + filename_root)).exists():
                          model_parameter.to_csv(PATH / (Osuffix + language[:3] + '_' + filename_root), index = False)

                        print(model_parameter.head(n = 1))
                    
print("<----------------------------------MODELS PARAMETERS FINISHED--------------------------------->")
# Show every recorded model per language, best accuracy first (ties: fastest first).
for language in languages:
  models_parameters = (
      pd.read_csv(PATH / (Osuffix + language[:3] + '_' + filename_root))
        .sort_values(by = ['accuracy', 'time'], ascending = [False, True])
  )
  print("\nlanguage:",language)
  print(models_parameters.head(n = len(models_parameters)))


In [0]:
# Re-create, for each language, the model with the best recorded accuracy.
filename_root = 'train.csv'
languages = ['portuguese', 'spanish']

print("<----------------------------------BEST PARAMETERS MODEL FOUNDED----------------------------------->")
for language in languages:
  len_models_parameters = 0
  results_path = PATH / (Osuffix + language[:3] + '_' + filename_root)
  # exists must be CALLED: the bare bound method is always truthy, which sent a
  # missing results file straight into pd.read_csv.
  if results_path.exists():
    models_parameters = pd.read_csv(results_path)
    len_models_parameters = len(models_parameters)
  if len_models_parameters > 0:
    # Rank by psi (largest first), then accuracy (highest first), then time (fastest first).
    # The index is reset so that position 0 is the top-ranked row: sort_values keeps
    # the original labels, so label-based ['col'][0] returned the first row of the
    # file rather than the best one.
    models_parameters = models_parameters.sort_values(by = ['psi', 'accuracy', 'time'], ascending = [False, False, True]).reset_index(drop = True)
    best = models_parameters.iloc[0]
    # Values are cast to plain Python types, as the search loop does before calling
    # generate_data_model / generate_model.
    seed = int(best['seed']) # Seed used for the training model.
    psi = float(best['psi']) # Proportion of the total data used in the analysis of the model.
    beta = float(best['beta']) # Proportion of all the data used in this model; 1-beta is used to evaluate it.
    alpha = float(best['alpha']) # Proportion of the total data with label 'reliable' used for training.
    zigma = float(best['zigma']) # Proportion of the average title length used as max chars (zigma = 1.0 -> maxchars = avgchars).
    chars_number = int(best['chars_number']) # Minimum length, in chars, of the words kept.
    digits_number = int(best['digits_number']) # Only numbers greater than digits_number are kept.
    num_words = int(best['num_words']) # Number of words in the vocabulary.
    kappa = float(best['kappa'])
    hidden_size = int(best['hidden_size'])
    activation0 = best['activation0'] # e.g. 'relu'
    activation1 = best['activation1'] # e.g. 'softmax'
    activation = [activation0, activation1]
    loss = best['loss'] # e.g. 'categorical_crossentropy'
    optimizer = best['optimizer'] # e.g. 'adam'
    metrics = [best['metrics']] # e.g. ['accuracy']
    batch_size = int(best['batch_size'])
    epochs = int(best['epochs'])
    print("language:",language)
    print(models_parameters.head(n = 1))
    generate_data_model(PATH, filename_root, language = language, psi = psi, beta = beta, alpha = alpha, zigma = zigma, chars_number = chars_number, digits_number = digits_number, seed = seed, save = True, old = False)
    generate_model(PATH, filename_root, language = language, num_words = num_words, digits_number = digits_number, chars_number = chars_number, kappa = kappa, hidden_size = hidden_size, activation = activation, loss = loss, optimizer = optimizer, metrics = metrics, batch_size = batch_size, epochs = epochs, old = False)
  else:
    print("language:", language, "- no evaluated models found, skipping")


In [0]:
# Splits a final 'test.csv'-style file by language so that each part can be indexed.
Lsuffix = 'L-tra_lan_spl_'
filename_test_root = 'test.csv'
languages = ['portuguese', 'spanish']
def transf_language_split(PATH = PATH, filename_test_root = filename_test_root , languages = languages, return_dataframe = languages[0], save = True, old = True, Lsuffix = Lsuffix):
  """Split the test file into one CSV per language and optionally return one of them.

  For each language the rows whose 'language' column equals it are written to
  PATH / (Lsuffix + language[:3] + '_' + filename_test_root).

  Args:
    PATH: directory holding the input and output files.
    filename_test_root: name of the file to split.
    languages: languages to extract.
    return_dataframe: language whose frame is returned, or '' to return
      nothing and print a confirmation instead.
    save: write the per-language files when True.
    old: when True, an existing per-language file is reused instead of
      being rebuilt.
    Lsuffix: prefix of the per-language file names.

  Returns:
    The DataFrame of the requested language, or None when
    return_dataframe is ''.
  """
  df_all = None   # the full test file, loaded lazily and at most once
  unsaved = {}    # frames built with save != True, which therefore exist only in memory
  for language in languages:
    split_path = PATH / (Lsuffix + language[:3] + '_' + filename_test_root)
    if old == True and split_path.exists():
      continue  # the cached split is reused as is
    if df_all is None:
      df_all = pd.read_csv(PATH / filename_test_root)
    df_lang = df_all[df_all['language'] == language]
    if save == True:
      df_lang.to_csv(split_path, index=False)
    else:
      unsaved[language] = df_lang.reset_index(drop = True)
  if return_dataframe != '':
    if return_dataframe in unsaved:
      # Nothing was written for this language in this call, so reading the file
      # would fail or return stale data.
      return unsaved[return_dataframe]
    return pd.read_csv(PATH / (Lsuffix + return_dataframe[:3] + '_' + filename_test_root))
  print("trans_language_split() executed")



In [0]:
# Split the evaluation data by language and preview the first language.
# old = False forces the per-language files to be rebuilt from the test file.
filename_test_root = 'test.csv'
languages = ['portuguese', 'spanish']
df = transf_language_split(PATH = PATH, filename_test_root = filename_test_root, languages = languages, return_dataframe = languages[0], save = True,
                           old = False)
df.head(25)

In [0]:
# Preview the evaluation data for the second language.
# old = True reuses the per-language files when they already exist on disk.
df = transf_language_split(PATH = PATH, filename_test_root = filename_test_root, languages = languages, return_dataframe = languages[1], save = True,
                           old = True)
df.head(25)

In [0]:
#(!)# ID E0 Converts a prediction vector into categories
def generate_categories(yhat, special_tokens = special_tokens):
  """Map each predicted class id in `yhat` to its category.

  The category for a prediction p is
  return_categories()[p - special_tokens - 1].

  Args:
    yhat: indexable sequence of predicted class ids.
    special_tokens: offset subtracted (together with 1) from each id.

  Returns:
    List with one category per element of yhat.
  """
  categories = return_categories()
  return [categories[yhat[i] - special_tokens -1] for i in range(len(yhat))]

In [0]:
#(!)# filename_test in = Nsuffix + Fsuffix + Lsuffix + language[:3] + '_' + filename_test
#(!)# filename_test out = Msuffix + filename_test
Msuffix = 'M-tra_lan_con_'
filename_test = filename_test_root
languages = ['portuguese', 'spanish']
def transf_language_concat(PATH = PATH, filename_test = filename_test, languages = languages, return_dataframe = True, save = True, old = True, suffix = Msuffix): 
  """Concatenate the per-language prediction files into one frame sorted by 'id'.

  Args:
    PATH: directory holding the input and output files.
    filename_test: base file name shared by the inputs and the output.
    languages: languages whose files are concatenated, in this order.
    return_dataframe: True returns the frame; False prints a confirmation.
    save: write the merged frame to PATH / (suffix + filename_test) when True.
    old: when True, an existing merged file is loaded instead of being rebuilt.
    suffix: prefix of the merged file name.

  Returns:
    The merged DataFrame when return_dataframe is True, otherwise None.
  """
  merged_path = PATH / (suffix + filename_test)
  has_cache = merged_path.exists()
  if old == True and has_cache:
    df = pd.read_csv(merged_path)
  if old == False or not has_cache:
    parts = [pd.read_csv(PATH / (Nsuffix + Fsuffix + Lsuffix + languages[0][:3] + '_' + filename_test))]
    for language in languages[1:]:
      parts.append(pd.read_csv(PATH / (Nsuffix + Fsuffix + Lsuffix + language[:3] + '_' + filename_test)))
    df = pd.concat(parts).sort_values(by = 'id')
    if save == True:
      df.to_csv(merged_path, index=False)
  if return_dataframe == False:
    print("transf_language_concat() executed")
  if return_dataframe == True:
    return df

In [0]:
#(!)# ID F0
#(!)# filename in = Lsuffix + language[:3] + '_' + 'test.csv'
def predict_test(PATH, filename, language, num_words, digits_number, chars_number):
  """Preprocess one per-language test file and predict its classes.

  Args:
    PATH: directory holding the data files and the saved model.
    filename: per-language test file name.
    language: language name; its first three letters select the model
      file 'model_<lan>_train.h5'.
    num_words, digits_number, chars_number: forwarded to the preprocessing
      helpers.

  Returns:
    The class predictions returned by the model for the generated sequences.
  """
  # The frames returned by the two preprocessing steps are not used; the calls are
  # presumably kept for the files they write (save = True) -- TODO confirm.
  normalize_title_label(PATH, filename, digits_number, chars_number, save = True, old = True, test = True)
  trim_title_label(PATH, filename, max_len = return_avgchars(language), save = True)
  sequences = generate_sequences(PATH, Fsuffix + filename, language, num_words, digits_number, chars_number, type_data = 'test', save = True, old = True)
  classifier = load_model(PATH / ('model_' + language[:3] + '_' + 'train.h5'))
  return classifier.predict_classes(sequences, verbose=0)

filename_test = 'test.csv'
languages = ['portuguese', 'spanish']
def predict_test_lang(PATH, filename_test, languages, num_words, digits_number, chars_number):
  """Predict the categories of the test data for every language.

  The same num_words, digits_number and chars_number are used for all
  languages.

  Returns:
    A list with one list of predicted categories per language, in the
    order of `languages`.
  """
  return [
      generate_categories(
          predict_test(PATH, Lsuffix + language[:3] + '_' + filename_test, language, num_words, digits_number, chars_number)
      )
      for language in languages
  ]

Now we build a list holding the predicted categories for each language present in the data (one list of predictions per language).

In [0]:
# Predict the categories of the test data, one list per language (in the order of `languages`).
# NOTE: num_words, digits_number and chars_number are read from the notebook's global
# state (they are assigned by the model-search / best-model cells above), and the same
# values are used for every language.
filename_test = 'test.csv'
languages = ['portuguese', 'spanish']
y_hat_categories_lan = predict_test_lang(PATH, filename_test, languages, num_words, digits_number, chars_number)

In [0]:
#(!)# filename_test in = filename_test (e.g. Fsuffix + Lsuffix + language[:3] + '_' + 'test.csv')
#(!)# filename_test out = Nsuffix + filename_test
filename_test = 'test.csv'
columns = 'id'
Nsuffix = 'att_col_'
def attach_column(y_hat_categories, PATH = PATH, filename_test = filename_test, columns = columns, 
                  save = True, old = True, suffix = Nsuffix):
  """Attach the predicted categories to the key column of a test file.

  Args:
    y_hat_categories: predicted categories, one per row of the input file.
    PATH: directory holding the input and output files.
    filename_test: input file name.
    columns: name of the single column kept from the input (the row key).
    save: write the result to PATH / (suffix + filename_test) when True.
    old: when True, an existing output file is loaded and returned instead
      of being rebuilt.
    suffix: prefix of the output file name.

  Returns:
    DataFrame with the `columns` column and a 'category' column.
  """
  output_path = PATH / (suffix + filename_test)
  if old == True and output_path.exists():
    # Return the cached OUTPUT; the input file has no 'category' column.
    return pd.read_csv(output_path)
  # .copy() so that the new column is added to an independent frame, not to a slice.
  df = pd.read_csv(PATH / filename_test)[[columns]].copy()
  df['category'] = y_hat_categories
  if save == True:
    df.to_csv(output_path, index=False)
  return df

#(!)# filename_test_root in = 'test.csv'
filename_test_root = 'test.csv'
columns = 'id'
# NOTE(review): this default order is ['spanish','portuguese'], whereas the cells that build
# and consume the predictions use ['portuguese', 'spanish']. Predictions are paired with
# languages by position, so always pass `languages` in the order used to build them.
languages = ['spanish','portuguese']
# Attaches, for every language, the previously generated predictions to its key column.
def attach_column_lang(y_hat_categories_lan, PATH = PATH, filename_test_root = filename_test_root, columns = columns, languages = languages, 
                       return_dataframe = languages[0], save = True, old = True):
  """Run attach_column for every language and optionally load one result.

  Args:
    y_hat_categories_lan: list of predictions; element i belongs to
      languages[i].
    PATH: directory holding the input and output files.
    filename_test_root: base test file name.
    columns: key column forwarded to attach_column.
    languages: languages to process, in the same order as the predictions.
    return_dataframe: language whose output file is loaded and returned,
      or '' to print a confirmation instead.
    save, old: forwarded to attach_column.

  Returns:
    The DataFrame read from the output file of `return_dataframe` when it
    is one of `languages`, otherwise None.
  """
  for position, language in enumerate(languages):
    filename_test = Fsuffix + Lsuffix + language[:3] + '_' + filename_test_root
    attach_column(y_hat_categories_lan[position], PATH = PATH, filename_test = filename_test, columns = columns, save = save, old = old)
  if return_dataframe == '':
    print("attach_column_lang() executed")
  if return_dataframe in languages:
    return pd.read_csv(PATH / (Nsuffix + Fsuffix + Lsuffix + return_dataframe[:3] + '_' + filename_test_root))

In [0]:
filename_test_root = 'test.csv'
columns = 'id'
languages = ['portuguese', 'spanish']
# Attach the predictions for every language, then load the Spanish result.
# The language is named explicitly: languages[0] is 'portuguese', so selecting it by
# position stored the Portuguese frame under the name df_spa.
df_spa = attach_column_lang(y_hat_categories_lan, PATH, filename_test_root, columns, languages, 
                        return_dataframe = 'spanish', save = True, old = True)
df_spa.head(25)

In [0]:
# Load the Portuguese result. The language is named explicitly: with
# languages = ['portuguese', 'spanish'], languages[1] is 'spanish'.
df_por = attach_column_lang(y_hat_categories_lan, PATH, filename_test_root, columns, languages, 
                        return_dataframe = 'portuguese', save = True, old = True)
df_por.head(25)

Now we generate the submission, which contains all the predicted data ordered by its original id. Processing each language separately and concatenating at the end is a good way to use our limited processing resources.

In [0]:
# Concatenate the per-language prediction files into a single frame sorted by id.
# NOTE: old = True reuses an existing merged file if one is already on disk, so
# predictions regenerated since then are only picked up if that file is removed
# or old = False is passed.
filename_test_root = 'test.csv'
df = transf_language_concat(PATH = PATH, filename_test = filename_test_root, languages = languages, 
                            return_dataframe = True, save = True, old = True) 
df.head(25)