In [1]:
import pandas as pd
import numpy as np
import re
import collections

In [2]:
# code
df = pd.read_csv (r"data.csv", on_bad_lines='skip', encoding= 'unicode_escape')
print(df.head())

                                             English  \
0  This weeks 2023 Spring Meetings of the World B...   
1  However many communities face social environme...   
2  How can policymakers promote the conditions th...   
3  In this episode of Voices in Development Kevin...   
4  Donovan has used macro and microeconomic model...   

                                             Spanish  
0  Las Reuniones de Primavera de 2023 del Grupo d...  
1  Sin embargo muchas comunidades enfrentan barre...  
2  Â¿CÃ³mo pueden los formuladores de polÃ­ticas ...  
3  En este episodio de Voices in Development Kevi...  
4  Donovan ha utilizado modelos macro y microecon...  


In [3]:
df.shape

(130, 2)

In [37]:
import spacy

In [5]:
spacy_english = spacy.load('en_core_web_sm')
spacy_spanish = spacy.load('es_core_news_sm')

In [6]:

df['English_tokens'] = df['English'].apply(lambda x: spacy_english(x))

In [None]:
def lemmatizer(token):
    return [lm.lemma_ for lm in token]

df['Eng_lemmats'] = df['English_tokens'].apply( lemmatizer )
df['Eng_lemmats'].head()

In [8]:
# split into words
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords


In [9]:

def clean_word (text):
    #print (text)
    tokens = word_tokenize(text)

    #tokens = spacy_english.tokenizer(text)
    #print (tokens)
    tokens = [w.lower() for w in tokens]

    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    
    words = [word for word in stripped if word.isalpha()]
    #print (words)
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    return words
    #print(words[:100])

In [10]:
df['cleaned_english'] = df['English'].apply( clean_word )
df['cleaned_english'].head()

0    [weeks, spring, meetings, world, bank, group, ...
1    [however, many, communities, face, social, env...
2    [policymakers, promote, conditions, overcome, ...
3    [episode, voices, development, kevin, donovan,...
4    [donovan, used, macro, microeconomic, models, ...
Name: cleaned_english, dtype: object

In [11]:
stop_words = set(stopwords.words('spanish'))
punctuation = set(string.punctuation)
def clean_word_spanish (text):

    # Tokenize the text
    doc = spacy_spanish.tokenizer(text)
    tokens = [token.text for token in doc]

    # Clean the text

    clean_tokens = []
    for token in tokens:
        if token.lower() not in stop_words and token not in punctuation:
            clean_tokens.append(token)

    return clean_tokens

In [12]:
df['cleaned_spanish'] = df['Spanish'].apply( clean_word )
df['cleaned_spanish'].head()

0    [las, reuniones, de, primavera, de, del, grupo...
1    [sin, embargo, muchas, comunidades, enfrentan,...
2    [pueden, los, formuladores, de, promover, las,...
3    [en, este, episodio, de, voices, development, ...
4    [donovan, ha, utilizado, modelos, macro, para,...
Name: cleaned_spanish, dtype: object

In [13]:
all_words_english = df["cleaned_english"].tolist()
all_words_spanish = df["cleaned_spanish"].tolist()


In [14]:
from collections import Counter

def find_vocabulary_count (my_corpus):
    my_list = []
    total_words = []
    for c, row in enumerate (my_corpus):
       #print ("row : \n ", row)
       for w in row:
            total_words.append (w)
            if w not in my_list:
                my_list.append (w)
    #print (my_list)
    counts = Counter(total_words)
    return my_list, counts

In [15]:


english_vocabulary, counts = find_vocabulary_count (all_words_english)
print (english_vocabulary)

['weeks', 'spring', 'meetings', 'world', 'bank', 'group', 'imf', 'focus', 'development', 'new', 'era', 'important', 'reminder', 'systemic', 'change', 'happen', 'overnight', 'strong', 'economic', 'requires', 'adequate', 'infrastructure', 'individual', 'access', 'markets', 'opportunities', 'however', 'many', 'communities', 'face', 'social', 'environmental', 'barriers', 'hobble', 'harsh', 'landscapes', 'remote', 'settings', 'constrain', 'crises', 'pandemic', 'food', 'insecurity', 'climate', 'set', 'back', 'hardearned', 'gains', 'policymakers', 'promote', 'conditions', 'overcome', 'constraints', 'take', 'advantage', 'role', 'transforming', 'labor', 'catalyzing', 'episode', 'voices', 'kevin', 'donovan', 'assistant', 'professor', 'economics', 'global', 'affairs', 'egc', 'affiliate', 'discusses', 'diverse', 'body', 'recent', 'work', 'unpacking', 'tools', 'needed', 'countries', 'make', 'marketdriven', 'transitions', 'overlapping', 'used', 'macro', 'microeconomic', 'models', 'explore', 'knowled

In [16]:


spanish_vocabulary, counts = find_vocabulary_count (all_words_spanish)
print (spanish_vocabulary)

['las', 'reuniones', 'de', 'primavera', 'del', 'grupo', 'banco', 'mundial', 'el', 'fmi', 'esta', 'semana', 'centradas', 'en', 'desarrollo', 'para', 'una', 'nueva', 'son', 'un', 'recordatorio', 'importante', 'que', 'cambio', 'ocurre', 'la', 'noche', 'fuerte', 'requiere', 'infraestructura', 'adecuada', 'acceso', 'individual', 'mercados', 'oportunidades', 'sin', 'embargo', 'muchas', 'comunidades', 'enfrentan', 'barreras', 'sociales', 'ambientales', 'obstaculizan', 'los', 'paisajes', 'hostiles', 'entornos', 'remotos', 'pueden', 'restringir', 'crisis', 'como', 'pandemia', 'inseguridad', 'alimentaria', 'retrasar', 'avances', 'obtenidos', 'con', 'tanto', 'esfuerzo', 'formuladores', 'promover', 'condiciones', 'superen', 'estas', 'limitaciones', 'aprovechar', 'papel', 'laborales', 'este', 'episodio', 'voices', 'development', 'kevin', 'donovan', 'profesor', 'asistente', 'asuntos', 'globales', 'afiliado', 'egc', 'analiza', 'su', 'diverso', 'cuerpo', 'trabajo', 'reciente', 'revela', 'herramientas'

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset

import random
import math

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
vectorizer = CountVectorizer()

In [19]:
# code

def converter_en (row):
    bow = []
    for w in english_vocabulary:
        #print (w)
        if w in row:
            bow.append (1.0)
        else:
            bow.append (0.0)
    return np.array (bow)


def converter_es (row):
    bow = []
    for w in spanish_vocabulary:
        #print (w)
        if w in row:
            bow.append (1.0)
        else:
            bow.append (0.0)
    return np.array (bow)

In [20]:
df['converter_en'] = df['cleaned_english'].apply( converter_en)
print (df['converter_en'].head())

0    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: converter_en, dtype: object


In [21]:
df['converter_es'] = df['cleaned_spanish'].apply( converter_es )
print (df['converter_es'].head())

0    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
1    [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
2    [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...
3    [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
4    [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: converter_es, dtype: object


In [22]:
df.head()

Unnamed: 0,English,Spanish,English_tokens,Eng_lemmats,cleaned_english,cleaned_spanish,converter_en,converter_es
0,This weeks 2023 Spring Meetings of the World B...,Las Reuniones de Primavera de 2023 del Grupo d...,"(This, weeks, 2023, Spring, Meetings, of, the,...","[this, week, 2023, Spring, Meetings, of, the, ...","[weeks, spring, meetings, world, bank, group, ...","[las, reuniones, de, primavera, de, del, grupo...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,However many communities face social environme...,Sin embargo muchas comunidades enfrentan barre...,"(However, many, communities, face, social, env...","[however, many, community, face, social, envir...","[however, many, communities, face, social, env...","[sin, embargo, muchas, comunidades, enfrentan,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,How can policymakers promote the conditions th...,Â¿CÃ³mo pueden los formuladores de polÃ­ticas ...,"(How, can, policymakers, promote, the, conditi...","[how, can, policymaker, promote, the, conditio...","[policymakers, promote, conditions, overcome, ...","[pueden, los, formuladores, de, promover, las,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
3,In this episode of Voices in Development Kevin...,En este episodio de Voices in Development Kevi...,"(In, this, episode, of, Voices, in, Developmen...","[in, this, episode, of, Voices, in, Developmen...","[episode, voices, development, kevin, donovan,...","[en, este, episodio, de, voices, development, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4,Donovan has used macro and microeconomic model...,Donovan ha utilizado modelos macro y microecon...,"(Donovan, has, used, macro, and, microeconomic...","[Donovan, have, use, macro, and, microeconomic...","[donovan, used, macro, microeconomic, models, ...","[donovan, ha, utilizado, modelos, macro, para,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


class TranslationDataset(Dataset):
    def __init__(self, src_df, trg_df):
        self.src_df = src_df
        self.trg_df = trg_df
        
    def __len__(self):
        return len(self.src_df)
    
    
    def __getitem__(self, idx):
        
        src_sentence = self.src_df.iat[idx]
        trg_sentence = self.trg_df.iat[idx]
        

        
        return torch.tensor(src_sentence), torch.tensor(trg_sentence)


In [24]:
dataset = TranslationDataset(df['converter_en'], df['converter_es'])

In [25]:
train_ratio = 0.8
test_ratio = 0.1
valid_ratio = 0.1
batch_size = 8

In [26]:

dataset_size = len(dataset)
train_size = math.floor(dataset_size * train_ratio)
test_size = math.floor(dataset_size * test_ratio)
valid_size = dataset_size - train_size - test_size

print ("num of train samples : ", train_size)
print ("num of test samples : ", test_size)
print ("num of valid samples : ", valid_size)

num of train samples :  104
num of test samples :  13
num of valid samples :  13


In [27]:
indices = list(range(dataset_size))
random.shuffle(indices)

In [28]:

train_indices = indices[:train_size]
test_indices = indices[train_size:train_size+test_size]
valid_indices = indices[train_size+test_size:]
print (train_indices)
print (test_indices)
print (valid_indices)

[29, 68, 20, 3, 50, 15, 16, 127, 121, 74, 109, 89, 67, 11, 102, 34, 4, 114, 100, 104, 80, 55, 14, 21, 105, 115, 37, 1, 57, 126, 22, 62, 59, 103, 27, 17, 75, 2, 65, 101, 93, 123, 82, 36, 24, 129, 63, 99, 76, 26, 0, 41, 32, 56, 8, 120, 45, 47, 88, 60, 6, 110, 28, 128, 116, 25, 31, 18, 125, 30, 83, 9, 40, 23, 7, 44, 85, 51, 48, 91, 118, 49, 70, 77, 92, 64, 117, 46, 52, 39, 35, 98, 97, 58, 19, 112, 54, 87, 96, 124, 106, 111, 71, 53]
[66, 72, 5, 86, 122, 79, 10, 43, 94, 107, 108, 42, 81]
[69, 38, 13, 12, 33, 73, 78, 84, 61, 95, 119, 90, 113]


In [29]:
train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)
valid_dataset = Subset(dataset, valid_indices)

In [30]:

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [34]:
for batch_idx, (inputs, targets) in enumerate(train_loader):
    # Process the batch
    print ("English inputs : ", inputs)
    print ("Spanish outputs : ", targets)


English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Spanish outputs :  tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]], dtype=torch.float64)
English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 1., 1.]], dtype=torch.float64)
Spanish outputs :  tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1., 

In [35]:
for batch_idx, (inputs, targets) in enumerate(test_loader):
    # Process the batch
    print ("English inputs : ", inputs)
    print ("Spanish outputs : ", targets)


English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Spanish outputs :  tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.]], dtype=torch.float64)
English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Spanish outputs :  tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0.,

In [36]:
for batch_idx, (inputs, targets) in enumerate(valid_loader):
    # Process the batch
    print ("English inputs : ", inputs)
    print ("Spanish outputs : ", targets)

English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Spanish outputs :  tensor([[1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.]], dtype=torch.float64)
English inputs :  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Spanish outputs :  tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0.,