In [16]:
%time
# !curl -L -s --compressed https://www.statmt.org/europarl/v7/fr-en.tgz
!curl -L -s --compressed https://www.statmt.org/europarl/v7/fr-en.tgz | tar -xzvf -


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs
europarl-v7.fr-en.en
europarl-v7.fr-en.fr


In [18]:
import os
files = os.listdir('/content')
files

['.config', 'europarl-v7.fr-en.en', 'europarl-v7.fr-en.fr', 'sample_data']

In [19]:
import pickle
from pickle import dump

In [20]:
def load(file_name):
  file = open(file_name, mode='rt',encoding='utf-8')
  t = file.read()
  file.close()
  return t

In [21]:
def sentencize(doc):
  return doc.strip().split('\n')

In [22]:
def shortest_longest_length(sentences):
  lengths = [len(s.split()) for s in sentences]
  return min(lengths),max(lengths)

In [23]:
import re
import string
import unicodedata

def clean_lines(lines):
  cleaned_list= list()
  re_print = re.compile('[^%s]'% re.escape(string.printable))

  table = str.maketrans('','',string.punctuation)

  for line in lines:
    line = unicodedata.normalize('NFD',line).encode('ascii','ignore')
    line = line.decode('UTF-8').split()
    line = [word.lower() for word in line]
    line = [word.translate(table) for word in line]
    line= [re_print.sub('',w) for w in line]

    line = [word for word in line if word.isalpha()]
    cleaned_list.append(' '.join(line))
  return cleaned_list

In [36]:
def process(lang,file_name):
  doc = load(file_name)
  sents = sentencize(doc)
  minlen, maxlen = shortest_longest_length(sents)
  print(lang,": sentences = %d, min=%d, max=%d"% (len(sents),minlen,maxlen))
  clean = clean_lines(sents)
  saving_file = lang+'.pkl'
  out_file = open(saving_file,'wb')
  pickle.dump(clean,out_file)
  out_file.close()
  print(saving_file, " saved")

In [37]:
process('English','europarl-v7.fr-en.en')

English : sentences = 2007723, min=0, max=668
English.pkl  saved


In [38]:
process('French','europarl-v7.fr-en.fr')

French : sentences = 2007723, min=0, max=693
French.pkl  saved


In [41]:
from pickle import load
from pickle import dump
from collections import Counter

def load_clean_sents(file_name):
  return load(open(file_name,'rb'))

def save_clean_sents(sents,file_name):
  dump(sents, open(file_name,'wb'))
  print('Saved: %s' % (file_name))

def vocab_table(lines):
  vocab = Counter()
  for line in lines:
    tokens = line.split()
    vocab.update(tokens)
  return vocab

def trim_vocab(vocab, min_occurence): #Threshold
  tokens =[ k for k, c in vocab.items() if c>= min_occurence]
  return set(tokens)

def out_vocab(lines, vocab):
  new_lines = list()
  for line in lines:
    new_tokens = list()
    for token in line.split():
      if token in vocab:
        new_tokens.append(token)
      else:
        new_tokens.append('unk')
    new_line = ' '.join(new_tokens)
    new_lines.append(new_line)
  return new_lines

In [44]:
file_name = 'English.pkl'
lines = load_clean_sents(file_name)
vocab = vocab_table(lines)
print("EN Vocab: %d"%len(vocab))
vocab = trim_vocab(vocab,5)
print("Trimmed EN Vocab: %d"%len(vocab))
lines = out_vocab(lines,vocab)

file_name = 'en_vocab.pkl'
save_clean_sents(lines,file_name)

for i in range(20):
  print("line",i,":",lines[i])

EN Vocab: 105357
Trimmed EN Vocab: 41746
Saved: en_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observed a minute s silence
line 7 : madam president on a point of order
li

In [46]:
file_name = 'French.pkl'
lines = load_clean_sents(file_name)
vocab = vocab_table(lines)
print("FR Vocab: %d"%len(vocab))
vocab = trim_vocab(vocab,5)
print("Trimmed FR Vocab: %d"%len(vocab))
lines = out_vocab(lines,vocab)

file_name = 'fr_vocab.pkl'
save_clean_sents(lines,file_name)

for i in range(20):
  print("line",i,":",lines[i])

FR Vocab: 141642
Trimmed FR Vocab: 58800
Saved: fr_vocab.pkl
line 0 : reprise de la session
line 1 : je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances
line 2 : comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles
line 3 : vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session
line 4 : en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches
line 5 : je vous invite a vous lever pour cette minute de silence
line 6 : le parlement debout observe une minute de silence
line 7 : m