# Language Translation Using Recurrent Neural Networks

### Data Loading 

In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Recursively list every file below the data directory, one full path per line.
data_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/Capstone/Data')
    for name in names
)
for data_file_path in data_file_paths:
    print(data_file_path)

/Capstone/Data\EN-DE\EN-DE.txt
/Capstone/Data\EN-ES\EN-ES.txt


In [3]:
# Root directory of the corpus files. The trailing slash is required because
# the file paths below are built by plain string concatenation.
CD = "/Capstone/Data/"

In [4]:
# Source-language code; used both in the corpus file paths and as the source column name.
SL = 'EN'

In [5]:
# Target-language code for the German corpus (file path component and column name).
TL_DE = 'DE'

In [6]:
# Target-language code for the Spanish corpus (file path component and column name).
TL_ES = 'ES'

In [7]:
# Read the tab-separated, header-less EN-DE corpus, keep its first two fields,
# and label them with the source / target language codes.
EN_DE = (
    pd.read_csv(f"{CD}{SL}-{TL_DE}/{SL}-{TL_DE}.txt", sep='\t', header=None)
    .loc[:, [0, 1]]
    .rename(columns={0: SL, 1: TL_DE})
)

In [8]:
# Read the tab-separated, header-less EN-ES corpus, keep its first two fields,
# and label them with the source / target language codes.
EN_ES = (
    pd.read_csv(f"{CD}{SL}-{TL_ES}/{SL}-{TL_ES}.txt", sep='\t', header=None)
    .loc[:, [0, 1]]
    .rename(columns={0: SL, 1: TL_ES})
)

In [9]:
# Preview the first EN-DE sentence pairs.
EN_DE.head()

Unnamed: 0,EN,DE
0,Commission Regulation (EC) No 1788/2004,Verordnung (EG) Nr. 1788/2004 der Kommission
1,of 15 October 2004,vom 15. Oktober 2004
2,fixing the minimum selling prices for butter f...,zur Festsetzung der Mindestverkaufspreise für ...
3,"THE COMMISSION OF THE EUROPEAN COMMUNITIES,",DIE KOMMISSION DER EUROPÄISCHEN GEMEINSCHAFTEN —
4,Having regard to the Treaty establishing the E...,gestützt auf den Vertrag zur Gründung der Euro...


In [10]:
# Preview the first EN-ES sentence pairs.
EN_ES.head()

Unnamed: 0,EN,ES
0,Commission Regulation (EC) No 1788/2004,Reglamento (CE) no 1788/2004 de la Comisión
1,of 15 October 2004,de 15 de octubre de 2004
2,fixing the minimum selling prices for butter f...,por el que se fijan los precios mínimos de ven...
3,"THE COMMISSION OF THE EUROPEAN COMMUNITIES,","LA COMISIÓN DE LAS COMUNIDADES EUROPEAS,"
4,Having regard to the Treaty establishing the E...,Visto el Tratado constitutivo de la Comunidad ...


In [11]:
# (rows, columns) of the raw EN-DE frame.
EN_DE.shape

(5693624, 2)

In [12]:
# (rows, columns) of the raw EN-ES frame.
EN_ES.shape

(5696850, 2)

In [13]:
# Index range, column dtypes and memory footprint of the EN-DE frame.
EN_DE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5693624 entries, 0 to 5693623
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   EN      object
 1   DE      object
dtypes: object(2)
memory usage: 86.9+ MB


In [14]:
# Index range, column dtypes and memory footprint of the EN-ES frame.
EN_ES.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5696850 entries, 0 to 5696849
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   EN      object
 1   ES      object
dtypes: object(2)
memory usage: 86.9+ MB


### Data Cleaning

In [15]:
# Missing-value summary for the EN-DE corpus: per column, the number of null
# entries and the percentage of rows they represent, largest first.
missing_DE = pd.DataFrame({
    'count': EN_DE.isnull().sum(),
    '%': EN_DE.isnull().mean() * 100,
})
missing_DE.sort_values(by=['count', '%'], ascending=False)

Unnamed: 0,count,%
EN,134,0.002354
DE,4,7e-05


In [16]:
# Missing-value summary for the EN-ES corpus: per column, the number of null
# entries and the percentage of rows they represent, largest first.
missing_ES = pd.DataFrame({
    'count': EN_ES.isnull().sum(),
    '%': EN_ES.isnull().mean() * 100,
})
missing_ES.sort_values(by=['count', '%'], ascending=False)

Unnamed: 0,count,%
EN,130,0.002282
ES,2,3.5e-05


In [17]:
# Discard every EN-DE pair in which either side is missing.
# The original row labels are kept, so the index has gaps afterwards.
EN_DE = EN_DE.dropna()

In [18]:
# Confirm that no missing values remain in either EN-DE column.
EN_DE.isna().sum()

EN    0
DE    0
dtype: int64

In [19]:
# (rows, columns) of the EN-DE frame after dropping incomplete pairs.
EN_DE.shape

(5693487, 2)

In [20]:
# Discard every EN-ES pair in which either side is missing.
# The original row labels are kept, so the index has gaps afterwards.
EN_ES = EN_ES.dropna()

In [21]:
# Confirm that no missing values remain in either EN-ES column.
EN_ES.isna().sum()

EN    0
ES    0
dtype: int64

In [33]:
# Working sample: the first SAMPLE_SIZE rows of each cleaned corpus.
#
# Rows are taken by position (.iloc) rather than by label. After dropna() the
# index has gaps, and a label slice such as .loc[0:50000] includes both
# endpoints, so it returns up to 50001 rows and the exact count depends on
# which labels were dropped.
#
# .copy() makes each sample an independent frame, so the column assignments in
# the following cells cannot affect (or be affected by) EN_DE / EN_ES.
SAMPLE_SIZE = 50000
EN_DE_sample = EN_DE.iloc[:SAMPLE_SIZE].copy()
EN_ES_sample = EN_ES.iloc[:SAMPLE_SIZE].copy()

In [26]:
# Inspect the column dtypes of the EN-DE sample.
EN_DE_sample.dtypes

EN    object
DE    object
dtype: object

In [34]:
# Tokenise the EN-DE sample: lower-case each sentence, then split it on
# whitespace into a list of tokens (punctuation stays attached to the words).
# NOTE(review): not re-runnable — a second run applies .str.lower() to lists.
langs = ['EN', 'DE']

for lang_col in langs:
    lowered = EN_DE_sample[lang_col].str.lower()
    EN_DE_sample[lang_col] = lowered.str.split()

In [35]:
# Tokenise the EN-ES sample: lower-case each sentence, then split it on
# whitespace into a list of tokens (punctuation stays attached to the words).
# NOTE(review): not re-runnable — a second run applies .str.lower() to lists.
langs = ['EN', 'ES']

for lang_col in langs:
    lowered = EN_ES_sample[lang_col].str.lower()
    EN_ES_sample[lang_col] = lowered.str.split()

### Data Enrichment

In [30]:
from collections import Counter

In [36]:
# Per-row vocabulary size for the EN-DE sample: ENCounter / DECounter hold the
# number of DISTINCT tokens in each sentence (repeated words count once), and
# CountDiff is the absolute difference between the two.
EN_DE_sample['ENCounter'] = EN_DE_sample['EN'].map(lambda tokens: len(set(tokens)))
EN_DE_sample['DECounter'] = EN_DE_sample['DE'].map(lambda tokens: len(set(tokens)))
EN_DE_sample['CountDiff'] = (EN_DE_sample['ENCounter'] - EN_DE_sample['DECounter']).abs()

In [37]:
# Preview the tokenised EN-DE sample together with its count columns.
EN_DE_sample.head()

Unnamed: 0,EN,DE,ENCounter,DECounter,CountDiff
0,"[commission, regulation, (ec), no, 1788/2004]","[verordnung, (eg), nr., 1788/2004, der, kommis...",5,6,1
1,"[of, 15, october, 2004]","[vom, 15., oktober, 2004]",4,4,0
2,"[fixing, the, minimum, selling, prices, for, b...","[zur, festsetzung, der, mindestverkaufspreise,...",20,17,3
3,"[the, commission, of, the, european, communiti...","[die, kommission, der, europäischen, gemeinsch...",5,6,1
4,"[having, regard, to, the, treaty, establishing...","[gestützt, auf, den, vertrag, zur, gründung, d...",8,9,1


In [38]:
# Per-row vocabulary size for the EN-ES sample: ENCounter / ESCounter hold the
# number of DISTINCT tokens in each sentence (repeated words count once), and
# CountDiff is the absolute difference between the two.
EN_ES_sample['ENCounter'] = EN_ES_sample['EN'].map(lambda tokens: len(set(tokens)))
EN_ES_sample['ESCounter'] = EN_ES_sample['ES'].map(lambda tokens: len(set(tokens)))
EN_ES_sample['CountDiff'] = (EN_ES_sample['ENCounter'] - EN_ES_sample['ESCounter']).abs()

In [40]:
# Preview the tokenised EN-ES sample together with its count columns.
EN_ES_sample.head()

Unnamed: 0,EN,ES,ENCounter,ESCounter,CountDiff
0,"[commission, regulation, (ec), no, 1788/2004]","[reglamento, (ce), no, 1788/2004, de, la, comi...",5,7,2
1,"[of, 15, october, 2004]","[de, 15, de, octubre, de, 2004]",4,4,0
2,"[fixing, the, minimum, selling, prices, for, b...","[por, el, que, se, fijan, los, precios, mínimo...",20,25,5
3,"[the, commission, of, the, european, communiti...","[la, comisión, de, las, comunidades, europeas,]",5,6,1
4,"[having, regard, to, the, treaty, establishing...","[visto, el, tratado, constitutivo, de, la, com...",8,8,0
