# Machine Translation



# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [0]:
import string 
from string import digits
from collections import Counter
import re 
from sklearn.utils import shuffle
import pandas as pd 
from pickle import dump
from pickle import load
from unicodedata import normalize
import matplotlib.pyplot as plt 
% matplotlib inline 
pd.set_option('display.max_colwidth', 200)

# 2)- Loading data

We have data from 2009 to 2016.

In [0]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [0]:
# split a loaded document into sentences
def to_sentences(doc):
	return doc.strip().split('\n')

In [0]:
# shortest and longest sentence lengths
def sentence_lengths(sentences):
	lengths = [len(s.split()) for s in sentences]
	return min(lengths), max(lengths)

### 2.1)- For year 2009

In [6]:
# load English data
filename = 'newstest2009.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2009.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2525, min=1, max=108
German data: sentences=2525, min=1, max=110


It is important to notice that sentence length is same. So, we have a balanced data.

We shall check this on all years

### 2.2)-For 2010

In [7]:
# load English data
filename = 'newstest2010.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2010.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2489, min=1, max=74
German data: sentences=2489, min=1, max=86


### 2.3)-For year 2011

In [8]:
# load English data
filename = 'newstest2011.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2011.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=93
German data: sentences=3003, min=1, max=92


### 2.4)-For year 2012

In [9]:
# load English data
filename = 'newstest2012.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2012.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=114
German data: sentences=3003, min=1, max=101


### 2.5)-For year 2013

In [10]:
# load English data
filename = 'newstest2013.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2013.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3000, min=1, max=82
German data: sentences=3000, min=1, max=85


### 2.6)-For year 2014

In [11]:
# load English data
filename = 'newstest2014.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2014.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=68
German data: sentences=3003, min=1, max=64


### 2.7)- For year 2015

In [12]:
# load English data
filename = 'newstest2015.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2015.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2169, min=1, max=71
German data: sentences=2169, min=1, max=72


### For year 2016

In [13]:
# load English data
filename = 'newstest2016.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load French data
filename = 'newstest2016.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2999, min=1, max=83
German data: sentences=2999, min=1, max=88


**All datasets have balanced sentences for English and German version**

# 3)- Data Cleaning

- Tokenizing text by white space.
- Normalizing case to lowercase.
- Removing punctuation from each word.
- Removing non-printable characters.
- Removing words that contain non-alphabetic characters.


**We shall use only one file(2015) for quick processing.**

In [0]:
# create cleaning function

def clean_lines(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for line in lines:
		# normalize unicode characters
		line = normalize('NFD', line).encode('ascii', 'ignore')
		line = line.decode('UTF-8')
		# tokenize on white space
		line = line.split()
		# convert to lower case
		line = [word.lower() for word in line]
		# remove punctuation from each token
		line = [word.translate(table) for word in line]
		# remove non-printable chars form each token
		line = [re_print.sub('', w) for w in line]
		# remove tokens with numbers in them
		line = [word for word in line if word.isalpha()]
		# store as string
		cleaned.append(' '.join(line))
	return cleaned

In [0]:
# save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [16]:
# load English data
filename = 'newstest2015.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'english2015.pkl')

Saved: english2015.pkl


In [17]:
# spot check for english version
for i in range(10):
	print(sentences[i])

india and japan prime ministers meet in tokyo
indias new prime minister narendra modi is meeting his japanese counterpart shinzo abe in tokyo to discuss economic and security ties on his first major foreign visit since winning mays election
mr modi is on a fiveday trip to japan to strengthen economic ties with the third largest economy in the world
high on the agenda are plans for greater nuclear cooperation
india is also reportedly hoping for a deal on defence collaboration between the two nations
karratha police arrest after high speed motorcycle chase
a motorcycle has been seized after it was ridden at in a zone and through bushland to escape police in the pilbara
traffic police on patrol in karratha this morning tried to pull over a blue motorcycle when they spotted it reaching as it pulled out of a service station on bathgate road
police say the rider then failed to stop and continued on to burgess road before turning into bushland causing the officers to lose sight of it
the moto

In [18]:
# load and check German data

filename = 'newstest2015.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
sentences = clean_lines(sentences)
save_clean_sentences(sentences, 'german2015.pkl')
# spot check for german version
for i in range(10):
	print(sentences[i])

Saved: german2015.pkl
die premierminister indiens und japans trafen sich in tokio
indiens neuer premierminister narendra modi trifft bei seinem ersten wichtigen auslandsbesuch seit seinem wahlsieg im mai seinen japanischen amtskollegen shinzo abe in toko um wirtschaftliche und sicherheitspolitische beziehungen zu besprechen
herr modi befindet sich auf einer funftagigen reise nach japan um die wirtschaftlichen beziehungen mit der drittgroten wirtschaftsnation der welt zu festigen
plane fur eine starkere kerntechnische zusammenarbeit stehen ganz oben auf der tagesordnung
berichten zufolge hofft indien daruber hinaus auf einen vertrag zur verteidigungszusammenarbeit zwischen den beiden nationen
polizei von karratha verhaftet nach schneller motorradjagd
ein motorrad wurde beschlagnahmt nachdem der fahrer es mit kmh in einer kmhzone und durch buschland gefahren hatte um der polizei in bilbara zu entkommen
verkehrspolizisten in karratha versuchten heute morgen ein blaues motorrad zu stoppen 

# 4)- Checking Vocabulary

In [0]:
# function to load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [0]:
# function to save a list of clean sentences to file
def save_clean_sentences(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

### 4.1)-frequency table
create a frequency table for all words

In [0]:
def to_vocab(lines):
	vocab = Counter()
	for line in lines:
		tokens = line.split()
		vocab.update(tokens)
	return vocab

### 4.2)- Trim Vocab

Process the created vocabulary and remove all words from the Counter that have an occurrence below a specific threshold using *out of vocabulary (OOV)* method.


https://medium.com/@shabeelkandi/handling-out-of-vocabulary-words-in-natural-language-processing-based-on-context-4bbba16214d5

In [0]:
def trim_vocab(vocab, min_occurance):
	tokens = [k for k,c in vocab.items() if c >= min_occurance]
	return set(tokens)

### 4.3)- Update dataset

Update the sentences, remove all words not in the trimmed vocabulary and mark their removal with a special token, in this case, the string “unk“.

In [0]:
def update_dataset(lines, vocab):
	new_lines = list()
	for line in lines:
		new_tokens = list()
		for token in line.split():
			if token in vocab:
				new_tokens.append(token)
			else:
				new_tokens.append('unk')
		new_line = ' '.join(new_tokens)
		new_lines.append(new_line)
	return new_lines

### 4.4)- Check English version

In [25]:
# load English dataset i.e pickled earlier
filename = 'english2015.pkl'
lines = load_clean_sentences(filename)

# calculate vocabulary
vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))

# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))

# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'english_vocab2015.pkl'
save_clean_sentences(lines, filename)

English Vocabulary: 7227
New English Vocabulary: 1210
Saved: english_vocab2015.pkl


### 4.5)-Check German version

In [26]:
# load English dataset i.e pickled earlier
filename = 'german2015.pkl'
lines = load_clean_sentences(filename)

# calculate vocabulary
vocab = to_vocab(lines)
print('German Vocabulary: %d' % len(vocab))

# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New German Vocabulary: %d' % len(vocab))

# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'german_vocab2015.pkl'
save_clean_sentences(lines, filename)

German Vocabulary: 9280
New German Vocabulary: 988
Saved: german_vocab2015.pkl


These cleaning and vocab reduction methods have given us a more normalized and potent data for model training

# 5)- Combine Files

- We can do this whole process on each file and then use one dataset pair for modeling i.e German 2015, English 2015.

- Or we can combine all these files and then apply these above discussed preprocessing methods on combined data. 

-  We may end up with a huge data so, we need to use random sample of data to train our model.

- Finally, I personally like to work with dataframes and like to see files as form of matrix or vector. So, I will convert these files into dataframes then combine both English and German translation parts in rows and columns. So, they look agreeable to me. 

In [0]:
df_en_2009= pd.read_table('newstest2009.en', sep="\t", names=['eng'])

In [0]:
df_ger_2009= pd.read_table('newstest2009.de', sep="\t", names=['ger'])

In [39]:
df_en_2009.head()

Unnamed: 0,eng
0,Prague Stock Market falls to minus by the end of the trading day
1,"After a sharp drop in the morning, the Prague Stock Market corrected its losses."
2,Transactions with stocks from the Czech Energy Enterprise (ČEZ) reached nearly half of the regular daily trading.
3,"The Prague Stock Market immediately continued its fall from Monday at the beginning of Tuesday's trading, when it dropped by nearly six percent."
4,This time the fall in stocks on Wall Street is responsible for the drop.


In [40]:
df_ger_2009.head()

Unnamed: 0,ger
0,Die Prager Börse stürzt gegen Geschäftsschluss ins Minus.
1,Nach dem steilen Abfall am Morgen konnte die Prager Börse die Verluste korrigieren.
2,Die Transaktionen mit den Aktien von ČEZ erreichten fast die Hälfte des normalen Tagesgeschäfts.
3,"Die Prager Börse knüpfte gleich zu Beginn der Dienstagsgeschäfte an den Einbruch vom Montag an, als sie um weitere sechs Prozentpunkte sank."
4,Diesmal lag der Grund für den Einbruch an der Wall Street.


In [43]:
df_combine_2009=pd.concat([df_en_2009, df_ger_2009], axis=1, join='inner') # to adjust automatically row labels i.e index
df_combine_2009.head()

Unnamed: 0,eng,ger
0,Prague Stock Market falls to minus by the end of the trading day,Die Prager Börse stürzt gegen Geschäftsschluss ins Minus.
1,"After a sharp drop in the morning, the Prague Stock Market corrected its losses.",Nach dem steilen Abfall am Morgen konnte die Prager Börse die Verluste korrigieren.
2,Transactions with stocks from the Czech Energy Enterprise (ČEZ) reached nearly half of the regular daily trading.,Die Transaktionen mit den Aktien von ČEZ erreichten fast die Hälfte des normalen Tagesgeschäfts.
3,"The Prague Stock Market immediately continued its fall from Monday at the beginning of Tuesday's trading, when it dropped by nearly six percent.","Die Prager Börse knüpfte gleich zu Beginn der Dienstagsgeschäfte an den Einbruch vom Montag an, als sie um weitere sechs Prozentpunkte sank."
4,This time the fall in stocks on Wall Street is responsible for the drop.,Diesmal lag der Grund für den Einbruch an der Wall Street.


**Here we have combined one file. Let's us combine all of dataset in form of dataframes with columns (showing german and english lang text) and rows.**

### 5a)- Combine all english version

In [0]:
df_en_2010= pd.read_table('newstest2010.en', sep="\t", names=['eng'])
df_en_2011= pd.read_table('newstest2011.en', sep="\t", names=['eng'])
df_en_2012= pd.read_table('newstest2012.en', sep="\t", names=['eng'])
df_en_2013= pd.read_table('newstest2013.en', sep="\t", names=['eng'])
df_en_2014= pd.read_table('newstest2014.en', sep="\t", names=['eng'])
df_en_2015= pd.read_table('newstest2015.en', sep="\t", names=['eng'])
df_en_2016= pd.read_table('newstest2016.en', sep="\t", names=['eng'])

In [0]:
df_eng_all=pd.concat([df_en_2009,df_en_2010,df_en_2011,df_en_2012,df_en_2013,df_en_2014,df_en_2015,df_en_2016], ignore_index=True)

In [56]:
df_eng_all.head()

Unnamed: 0,eng
0,Prague Stock Market falls to minus by the end of the trading day
1,"After a sharp drop in the morning, the Prague Stock Market corrected its losses."
2,Transactions with stocks from the Czech Energy Enterprise (ČEZ) reached nearly half of the regular daily trading.
3,"The Prague Stock Market immediately continued its fall from Monday at the beginning of Tuesday's trading, when it dropped by nearly six percent."
4,This time the fall in stocks on Wall Street is responsible for the drop.


In [57]:
df_eng_all.shape

(21967, 1)

### 5b)- Combine all German version

In [0]:
df_ger_2010= pd.read_table('newstest2010.de', sep="\t", names=['ger'])
df_ger_2011= pd.read_table('newstest2011.de', sep="\t", names=['ger'])
df_ger_2012= pd.read_table('newstest2012.de', sep="\t", names=['ger'])
df_ger_2013= pd.read_table('newstest2013.de', sep="\t", names=['ger'])
df_ger_2014= pd.read_table('newstest2014.de', sep="\t", names=['ger'])
df_ger_2015= pd.read_table('newstest2015.de', sep="\t", names=['ger'])
df_ger_2016= pd.read_table('newstest2016.de', sep="\t", names=['ger'])

In [0]:
df_ger_all=pd.concat([df_ger_2009,df_ger_2010,df_ger_2011,df_ger_2012,df_ger_2013,df_ger_2014,df_ger_2015,df_ger_2016], ignore_index=True)

In [61]:
df_ger_all.head()

Unnamed: 0,ger
0,Die Prager Börse stürzt gegen Geschäftsschluss ins Minus.
1,Nach dem steilen Abfall am Morgen konnte die Prager Börse die Verluste korrigieren.
2,Die Transaktionen mit den Aktien von ČEZ erreichten fast die Hälfte des normalen Tagesgeschäfts.
3,"Die Prager Börse knüpfte gleich zu Beginn der Dienstagsgeschäfte an den Einbruch vom Montag an, als sie um weitere sechs Prozentpunkte sank."
4,Diesmal lag der Grund für den Einbruch an der Wall Street.


In [60]:
df_ger_all.shape

(21906, 1)

In [64]:
# combine into rows and columns dataframe

df_combine=pd.concat([df_eng_all,df_ger_all], axis=1, join='inner') # to adjust automatically row labels i.e index
df_combine.head(15)

Unnamed: 0,eng,ger
0,Prague Stock Market falls to minus by the end of the trading day,Die Prager Börse stürzt gegen Geschäftsschluss ins Minus.
1,"After a sharp drop in the morning, the Prague Stock Market corrected its losses.",Nach dem steilen Abfall am Morgen konnte die Prager Börse die Verluste korrigieren.
2,Transactions with stocks from the Czech Energy Enterprise (ČEZ) reached nearly half of the regular daily trading.,Die Transaktionen mit den Aktien von ČEZ erreichten fast die Hälfte des normalen Tagesgeschäfts.
3,"The Prague Stock Market immediately continued its fall from Monday at the beginning of Tuesday's trading, when it dropped by nearly six percent.","Die Prager Börse knüpfte gleich zu Beginn der Dienstagsgeschäfte an den Einbruch vom Montag an, als sie um weitere sechs Prozentpunkte sank."
4,This time the fall in stocks on Wall Street is responsible for the drop.,Diesmal lag der Grund für den Einbruch an der Wall Street.
5,"The reaction of the market to the results of the vote in the American House of Representatives, which refused to support the plan for the stabilization of the financial sector there, has manifeste...","Auch in Tschechien zeigt sich so die Reaktion des Marktes auf das Ergebnis der Abstimmung des amerikanischen Repräsentantenhauses, das es abgelehnt hatte, einen Stabilisierungsplan für den dortige..."
6,Stocks fall in Asia,Fall der Aktien in Asien
7,"Stocks in the Asian markets experienced a dramatic drop on Tuesday, even though the indexes ultimately erased a part of the losses during the day.","Einen dramatischen Fall erlebten am Dienstag die Aktien an den Börsen in Asien, wenngleich die Indices im Laufe des Tages einen Teil der Verluste ausgleichen konnten."
8,"The Hang Seng Index of the Hong Kong Stock Exchange wrote off nearly four percent during the day, but later it erased a part of the losses and reduced the decrease to roughly 2.5 percent.","Der Index der Hang Seng Börse in Hongkong verlor im Laufe des Geschäftstages fast vier Prozentpunkte, konnte aber später einen Teil der Verluste wettmachen, somit verringerte sich der Rückgang auf..."
9,"The Hang Seng China Enterprises Index, which follows the movement of Chinese stocks on the stock market in Hong Kong, dropped by 3.8 percent, in Shanghai the markets were closed.","Der Index Hang Seng China Enterprises, der die Bewegung der chinesischen Aktien an der Börse in Hongkong beobachtet, sank um 3,8 Prozent, in Shanghai waren die Märkte geschlossen."
