In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import dask.dataframe as dd
import random
import os
import tensorflow_datasets as tfds
import re

In [2]:
#dask dataframe
df = dd.read_csv('en-fr.csv')
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [3]:
#random shuffle
df = df.sample(frac=1,random_state=49)
df.head()

Unnamed: 0,en,fr
198611,The unique element about this work is the very...,L’élément unique de l’ouvrage est la nature mê...
52372,This summary highlights agri-food news trends ...,Le présent sommaire fait ressortir les tendanc...
118054,"In the same period there were 680,000 Loansome...",On rapporte 680 000 demandes Loansome Doc au c...
18034,• Coffee bars and shops increased in numbers t...,• Le nombre de cafés-bars et de cafés-restaura...
210207,An Overview of Canadian Insolvency Statistics ...,Un survol des statistiques sur l'insolvabilité...


In [4]:
#tot_len = len(df)
#22520376

- Lower
- english words in french col and viceversa
- just single dot "."

In [5]:
df['en_len'] = df['en'].str.len()
df['fr_len'] = df['fr'].str.len()

In [6]:
df.head()

Unnamed: 0,en,fr,en_len,fr_len
198611,The unique element about this work is the very...,L’élément unique de l’ouvrage est la nature mê...,73.0,67
52372,This summary highlights agri-food news trends ...,Le présent sommaire fait ressortir les tendanc...,170.0,211
118054,"In the same period there were 680,000 Loansome...",On rapporte 680 000 demandes Loansome Doc au c...,60.0,70
18034,• Coffee bars and shops increased in numbers t...,• Le nombre de cafés-bars et de cafés-restaura...,154.0,212
210207,An Overview of Canadian Insolvency Statistics ...,Un survol des statistiques sur l'insolvabilité...,76.0,91


In [7]:
#df.dropna().compute()

In [8]:
df['en'] = df['en'].str.lower()
df['fr'] = df['fr'].str.lower()
df.head()

Unnamed: 0,en,fr,en_len,fr_len
198611,the unique element about this work is the very...,l’élément unique de l’ouvrage est la nature mê...,73.0,67
52372,this summary highlights agri-food news trends ...,le présent sommaire fait ressortir les tendanc...,170.0,211
118054,"in the same period there were 680,000 loansome...",on rapporte 680 000 demandes loansome doc au c...,60.0,70
18034,• coffee bars and shops increased in numbers t...,• le nombre de cafés-bars et de cafés-restaura...,154.0,212
210207,an overview of canadian insolvency statistics ...,un survol des statistiques sur l'insolvabilité...,76.0,91


In [9]:
df = df[df['en_len']>1]
df = df[df['fr_len']>1]

In [10]:
#https://www.rocketlanguages.com/french/lessons/french-accents

def check_fr_en(sent):
    if re.findall("[Çéâêîôûàèùëïü]",sent):
        return 1
    return 0

df['fr_in_en'] = df['en'].map(check_fr_en)

In [11]:
df.head()

Unnamed: 0,en,fr,en_len,fr_len,fr_in_en
198611,the unique element about this work is the very...,l’élément unique de l’ouvrage est la nature mê...,73.0,67,0
52372,this summary highlights agri-food news trends ...,le présent sommaire fait ressortir les tendanc...,170.0,211,0
118054,"in the same period there were 680,000 loansome...",on rapporte 680 000 demandes loansome doc au c...,60.0,70,0
18034,• coffee bars and shops increased in numbers t...,• le nombre de cafés-bars et de cafés-restaura...,154.0,212,0
210207,an overview of canadian insolvency statistics ...,un survol des statistiques sur l'insolvabilité...,76.0,91,0


In [12]:
df['fr_in_en'].value_counts().compute()

0    22077406
1      428460
Name: fr_in_en, dtype: int64

In [13]:
sample_df = df[df['fr_in_en']==1]
sample_df.head()

Unnamed: 0,en,fr,en_len,fr_len,fr_in_en
132517,the us made a comeback at the top résa tradesh...,les usa reviennt en force au salon top résa de...,82.0,75,1
101430,file # title organization 3340-r32 (300944) cr...,no de dossier titre nom de l'organisation 3340...,284.0,328,1
192999,the russian government appreciated the februar...,le gouvernement russe a apprécié le discours d...,202.0,230,1
131691,- tourisme montréal is doing a small sales mis...,quatre partenaires régionaux y ont participé e...,76.0,83,1
48809,"rubén acevedo trejo, general manager","rubén acevedo trejo, directeur général",36.0,38,1


In [14]:
df = df[df['fr_in_en']==0]
df.head()

Unnamed: 0,en,fr,en_len,fr_len,fr_in_en
198611,the unique element about this work is the very...,l’élément unique de l’ouvrage est la nature mê...,73.0,67,0
52372,this summary highlights agri-food news trends ...,le présent sommaire fait ressortir les tendanc...,170.0,211,0
118054,"in the same period there were 680,000 loansome...",on rapporte 680 000 demandes loansome doc au c...,60.0,70,0
18034,• coffee bars and shops increased in numbers t...,• le nombre de cafés-bars et de cafés-restaura...,154.0,212,0
210207,an overview of canadian insolvency statistics ...,un survol des statistiques sur l'insolvabilité...,76.0,91,0


In [15]:
#final_len = len(df)
#22077406

In [16]:
df = df.drop(columns=['en_len','fr_len','fr_in_en'])
df.head()

Unnamed: 0,en,fr
198611,the unique element about this work is the very...,l’élément unique de l’ouvrage est la nature mê...
52372,this summary highlights agri-food news trends ...,le présent sommaire fait ressortir les tendanc...
118054,"in the same period there were 680,000 loansome...",on rapporte 680 000 demandes loansome doc au c...
18034,• coffee bars and shops increased in numbers t...,• le nombre de cafés-bars et de cafés-restaura...
210207,an overview of canadian insolvency statistics ...,un survol des statistiques sur l'insolvabilité...


In [17]:
train_df,test_df = df.random_split([0.8,0.2])

In [18]:
#len(train_df),len(test_df)
#(17659445, 4417961)

In [19]:
train_df.head()

Unnamed: 0,en,fr
198611,the unique element about this work is the very...,l’élément unique de l’ouvrage est la nature mê...
52372,this summary highlights agri-food news trends ...,le présent sommaire fait ressortir les tendanc...
118054,"in the same period there were 680,000 loansome...",on rapporte 680 000 demandes loansome doc au c...
18034,• coffee bars and shops increased in numbers t...,• le nombre de cafés-bars et de cafés-restaura...
210207,an overview of canadian insolvency statistics ...,un survol des statistiques sur l'insolvabilité...


In [21]:
test_df.head()

Unnamed: 0,en,fr
206431,• to undertake r&d that either provides direct...,• réaliser des travaux de r-d qui appuient dir...
135558,ytd 2003 vs. 2002 occupied room nights -5.1% o...,"chambres-nuits, aacj 2003 par rapport à 2002 c..."
42262,• briefing session:,• session d'information:
201471,the senate banking committee's study of bank m...,"par exemple, l’étude sur les fusions des banqu..."
135542,booking pace (reserved room nights 000's),cadence des réservations (chambres-nuits réser...


In [22]:
reduced_train_df,left_out_df = train_df.random_split([0.01,0.99]) #for easy computation; 1764707 for 1%

In [23]:
len(reduced_train_df)

177014

In [24]:
reduced_train_df.head()

Unnamed: 0,en,fr
62537,"companies such as northwest kinetics (tacoma, ...",des entreprises tells que northwest kinetics (...
35132,accessed from www.infoexport.gc.ca/vtc/display...,"the eu's food and drink industry 2005 », 2005."
98330,"subsequently, in february 2002, the civil and ...","subséquemment, en février 2002, la section du ..."
192850,iraq the minister hoped that the current crisi...,irak le ministre a dit souhaiter le règlement ...
181396,it was attracted in part by the risk mitigatio...,"si nous avons pu recevoir cet argent c'est, en..."


In [27]:
def convert_to_byte_str(sent):
    return sent.encode('utf-8')

reduced_train_df['en'] = reduced_train_df['en'].map(convert_to_byte_str)

In [28]:
reduced_train_df['fr'] = reduced_train_df['fr'].map(convert_to_byte_str)

In [29]:
reduced_train_df.head()

Unnamed: 0,en,fr
62537,b'companies such as northwest kinetics (tacoma...,"b""des entreprises tells que northwest kinetics..."
35132,b'accessed from www.infoexport.gc.ca/vtc/displ...,"b""the eu's food and drink industry 2005 \xc2\x..."
98330,"b'subsequently, in february 2002, the civil an...","b'subs\xc3\xa9quemment, en f\xc3\xa9vrier 2002..."
192850,b'iraq the minister hoped that the current cri...,b'irak le ministre a dit souhaiter le r\xc3\xa...
181396,b'it was attracted in part by the risk mitigat...,"b""si nous avons pu recevoir cet argent c'est, ..."


In [30]:
eng_sents = reduced_train_df['en'].to_dask_array(lengths=True)

In [31]:
eng_sents = eng_sents.compute()

In [32]:
eng_sents[2]

b'subsequently, in february 2002, the civil and comparative law section became part of the newly created bijuralism and drafting support services group in the legislative services directorate.'

In [33]:
fr_sents = reduced_train_df['fr'].to_dask_array(lengths=True)

In [34]:
fr_sents = fr_sents.compute()

In [35]:
fr_sents[2]

b'subs\xc3\xa9quemment, en f\xc3\xa9vrier 2002, la section du droit civil et du droit compar\xc3\xa9 s\xe2\x80\x99est int\xc3\xa9gr\xc3\xa9e au groupe du bijuridisme et des services d\xe2\x80\x99appui \xc3\xa0 la r\xc3\xa9daction nouvellement cr\xc3\xa9\xc3\xa9 au sein de la direction des services l\xc3\xa9gislatifs.'

In [37]:
type(fr_sents[2]),type(eng_sents[2])

(bytes, bytes)

In [41]:
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        (en_sent.decode().numpy() for en_sent in eng_sents),target_vocab_size=2**13)


tokenizer_fr = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        (fr_sent.numpy() for fr_sent in fr_sents),target_vocab_size=2**13)


AttributeError: 'str' object has no attribute 'numpy'

In [None]:
sample_string = 'just a sample'
tokenized_string = tokenizer_en.encode(sample_string)
print(f'Tokenized str is {tokenized_string}')

original_string = tokenizer_en.decode(tokenized_string)
print(f'Original string is {original_string}')

assert original_string == sample_string

In [None]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [None]:
def encode(lang1,lang2):
    #lang1 = en, lang2 = fr
    lang1 = [tokenizer_en.vocab_size]+tokenizer_en.encode(lang1.numpy())+[tokenizer_en.vocab_size+1]
    lang2 = [tokenizer_fr.vocab_size]+tokenizer_fr.encode(lang2.numpy())+[tokenizer_fr.vocab_size+1]
    
    return lang1,lang2

In [None]:
def tf_encode(en,fr):
    result_en,result_fr = tf.py_function(encode,[en,fr],[tf.int64,tf.int64])
    result_en.set_shape([None])
    result_fr.set_shape([None])
    
    return result_en,result_fr

In [41]:
test['tlen'] = test['transcript'].str.len()
test.head()

Unnamed: 0,wav_filename,wav_filesize,transcript,tlen
0,audios/4da6b70e-0108-4f75-80ae-3d71f1dd2c2b.wav,219064,y aquí en dos palotadas hemos encontrado robus...,107
1,audios/8c2ab30b-0fd4-41c3-9724-3b15f2ee2c27.wav,271910,cuando los consejeros escucharon aquello queda...,127
2,audios/ca73c951-c62a-41fe-a953-9871514151f2.wav,64520,su mujer con la cara entre las manos,36
3,audios/067c4606-777b-4fb2-bc6f-8185fbec9016.wav,84222,y otros que se podían echar a la oreja de un toro,49
4,audios/49a08f90-3fc0-43ad-bd5e-a6b671cafdd4.wav,77316,al oír mis pasos alzó la cabeza,31


In [43]:
tarr = test['transcript'].to_dask_array(lengths=True)

In [30]:
tarr[1]

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Count,7 Tasks,1 Chunks
Type,object,numpy.ndarray
Array Chunk Bytes 8 B 8 B Shape () () Count 7 Tasks 1 Chunks Type object numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Count,7 Tasks,1 Chunks
Type,object,numpy.ndarray


In [32]:
tarr[1].compute().split(' ')

['cuando',
 'los',
 'consejeros',
 'escucharon',
 'aquello',
 'quedaron',
 'estremecidos',
 'y',
 'se',
 'dijeron',
 'dios',
 'ha',
 'prohibido',
 'que',
 'padres',
 'se',
 'casen',
 'con',
 'sus',
 'hijas']

In [42]:
def check_en(sent):
    if 'en' in sent:
        return 1
    return 0

test['if_en'] = test['transcript'].map(check_en)
test.head()

Unnamed: 0,wav_filename,wav_filesize,transcript,tlen,if_en
0,audios/4da6b70e-0108-4f75-80ae-3d71f1dd2c2b.wav,219064,y aquí en dos palotadas hemos encontrado robus...,107,1
1,audios/8c2ab30b-0fd4-41c3-9724-3b15f2ee2c27.wav,271910,cuando los consejeros escucharon aquello queda...,127,1
2,audios/ca73c951-c62a-41fe-a953-9871514151f2.wav,64520,su mujer con la cara entre las manos,36,1
3,audios/067c4606-777b-4fb2-bc6f-8185fbec9016.wav,84222,y otros que se podían echar a la oreja de un toro,49,0
4,audios/49a08f90-3fc0-43ad-bd5e-a6b671cafdd4.wav,77316,al oír mis pasos alzó la cabeza,31,0


In [52]:
fta = tarr.compute()

In [45]:
len(tarr)

112845

In [47]:
test['if_en'].value_counts().compute()

1    67898
0    44947
Name: if_en, dtype: int64

In [26]:
df['en_len'] = df['en_len'].astype('int32')

In [27]:
df.head()

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [9]:
df_list = df.to_dask_array(lengths=True)

In [11]:
df_list[:2].compute()

array([['The unique element about this work is the very nature of its composition.',
        'L’élément unique de l’ouvrage est la nature même de sa composition.'],
       ['This summary highlights agri-food news trends on a monthly basis, providing an analysis and overview of agri-food issues that were of particular interest to the industry.',
        'Le présent sommaire fait ressortir les tendances dans les nouvelles sur l’agroalimentaire du mois et fournit une analyse et un survol des dossiers agroalimentaires qui ont particulièrement intéressé l’industrie.']],
      dtype=object)

In [14]:
df_list[1][0].compute(),df_list[1][1].compute()

('This summary highlights agri-food news trends on a monthly basis, providing an analysis and overview of agri-food issues that were of particular interest to the industry.',
 'Le présent sommaire fait ressortir les tendances dans les nouvelles sur l’agroalimentaire du mois et fournit une analyse et un survol des dossiers agroalimentaires qui ont particulièrement intéressé l’industrie.')

KeyboardInterrupt: 

In [20]:
def clean_t

'In the same period there were 680,000 Loansome Doc requests.'