# Machine Translation



# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [0]:
import string 
from string import digits
import re 
from sklearn.utils import shuffle
import pandas as pd 
import matplotlib.pyplot as plt 
% matplotlib inline 
pd.set_option('display.max_colwidth', 200)

# 2)- Loading data

We have data from 2009 to 2016.

In [0]:
# load doc into memory
def load_doc(filename):
	"""Read an entire UTF-8 text file into memory and return it as one string.

	Args:
		filename: Path of the text file to read.

	Returns:
		The full decoded contents of the file.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the contents are not valid UTF-8.
	"""
	# The context manager closes the handle even when read() raises (for
	# example on a decoding error); the earlier manual close() was skipped
	# in that case and leaked the file descriptor.
	with open(filename, mode='rt', encoding='utf-8') as file:
		return file.read()

In [0]:
# split a loaded document into sentences
def to_sentences(doc):
	"""Break a document string into a list of lines, one sentence per line.

	Leading and trailing whitespace of the whole document is removed first,
	then the text is cut at every '\\n'. An empty document yields [''].
	"""
	trimmed = doc.strip()
	lines = trimmed.split('\n')
	return lines

In [0]:
# shortest and longest sentence lengths
def sentence_lengths(sentences):
	"""Return (shortest, longest) sentence length measured in whitespace-separated tokens.

	Raises ValueError when `sentences` is empty, because min()/max() receive
	an empty sequence.
	"""
	word_counts = []
	for sentence in sentences:
		tokens = sentence.split()
		word_counts.append(len(tokens))
	shortest = min(word_counts)
	longest = max(word_counts)
	return shortest, longest

### 2.1)- For year 2009

In [7]:
# load English data for 2009 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2009.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2009.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2525, min=1, max=108
German data: sentences=2525, min=1, max=110


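It is important to notice that the number of sentences is the same in both files (2525), even though the longest sentences differ slightly in length (108 vs. 110 words). So, the English and German data are balanced.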

We shall check this for all years.

### 2.2)- For year 2010

In [8]:
# load English data for 2010 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2010.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2010.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2489, min=1, max=74
German data: sentences=2489, min=1, max=86


### 2.3)-For year 2011

In [9]:
# load English data for 2011 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2011.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2011.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=93
German data: sentences=3003, min=1, max=92


### 2.4)-For year 2012

In [10]:
# load English data for 2012 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2012.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2012.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=114
German data: sentences=3003, min=1, max=101


### 2.5)-For year 2013

In [11]:
# load English data for 2013 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2013.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2013.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3000, min=1, max=82
German data: sentences=3000, min=1, max=85


### 2.6)-For year 2014

In [12]:
# load English data for 2014 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2014.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2014.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=3003, min=1, max=68
German data: sentences=3003, min=1, max=64


### 2.7)- For year 2015

In [13]:
# load English data for 2015 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2015.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2015.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2169, min=1, max=71
German data: sentences=2169, min=1, max=72


### 2.8)- For year 2016

In [14]:
# load English data for 2016 and report the sentence count plus the
# shortest/longest sentence (in whitespace-separated tokens)
filename = 'newstest2016.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

# load German data (the .de file); this rebinds filename/doc/sentences/minlen/maxlen,
# so the English values are no longer available after this point
filename = 'newstest2016.de'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('German data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

English data: sentences=2999, min=1, max=83
German data: sentences=2999, min=1, max=88


**For every year, the English and German files contain the same number of sentences, so all datasets are balanced.**

# 3)- Data Cleaning

- Tokenizing text by white space.
- Normalizing case to lowercase.
- Removing punctuation from each word.
- Removing non-printable characters.
- Removing words that contain non-alphabetic characters.