### STEP 1. Combine TED Talks raw files into 2 files: TED English, TED German

In [None]:
# connect Google Drive to access files

from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# navigate to the directory

%cd /content/gdrive/MyDrive/TFM
! ls

In [4]:
# raw files paths

ted1_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/000_779.en.txt'
ted2_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/001_769.en.txt'
ted3_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/002_792.en.txt'
ted4_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/003_799.en.txt'
ted5_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/004_767.en.txt'
ted6_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/005_790.en.txt'
ted7_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/006_785.en.txt'
ted8_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/007_783.en.txt'
ted9_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/008_824.en.txt'
ted10_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/009_805.en.txt'
ted11_en = '/content/gdrive/MyDrive/TFM/raw_ted_files/010_837.en.txt'

ted1_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/000_779.de.txt'
ted2_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/001_769.de.txt'
ted3_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/002_792.de.txt'
ted4_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/003_799.de.txt'
ted5_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/004_767.de.txt'
ted6_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/005_790.de.txt'
ted7_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/006_785.de.txt'
ted8_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/007_783.de.txt'
ted9_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/008_824.de.txt'
ted10_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/009_805.de.txt'
ted11_de = '/content/gdrive/MyDrive/TFM/raw_ted_files/010_837.de.txt'

In [5]:
# combine raw files

tedfiles_en = [ted1_en, ted2_en, ted3_en, ted4_en, ted5_en, ted6_en, ted7_en, ted8_en, ted9_en, ted10_en, ted11_en]
tedfiles_de = [ted1_de, ted2_de, ted3_de, ted4_de, ted5_de, ted6_de, ted7_de, ted8_de, ted9_de, ted10_de, ted11_de]


def combine_corpus (list_of_files):
  """receive a list of file paths, read content of each file, combine into a single file"""
  whole_corpus = []
  for file in list_of_files:
    with open(file, "r") as f:
      content = [line.rstrip() for line in f]
    whole_corpus.extend(content)
  return whole_corpus

tedcorp_en = combine_corpus (tedfiles_en)
tedcorp_de = combine_corpus (tedfiles_de)

In [7]:
# save to .txt files

TED_en = '/content/gdrive/MyDrive/TFM/ted_en.txt'
TED_de = '/content/gdrive/MyDrive/TFM/ted_de.txt'

corpora = [tedcorp_en, tedcorp_de]
outfiles = [TED_en, TED_de]

def save_corpora_to_files (corpora, files):
  """receive corpora and output files paths, write corpora to files"""
  for i in range (2):
    with open(files[i], 'w') as f_out:
      for sentence in corpora[i]:
        f_out.write("{}\n".format(sentence))

save_corpora_to_files (corpora, outfiles)

### STEP 2. Parsing with spaCy

In [None]:
!python -m spacy download en_core_web_lg

In [2]:
# Restart runtime to load the model!!!
# Runtime -> Restart runtime

import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_lg')

In [5]:
# explore difference in parsing, preposition vs particle

sent_prep = nlp("John is going on a trip.")

print("{:10}{:10}{:10}{:10}{:10}{:10}".format("Token", "Lemma", "Tag", "POS", "Dependency", "Parent"))
for token in sent_prep:
  print("{:10}{:10}{:10}{:10}{:10}{:10}".format(token.text, token.lemma_, token.tag_, token.pos_, token.dep_, token.head.text))

displacy.render(sent_prep,
                style="dep",
                options={"collapse_punct": False, "distance": 110, "fine_grained": True},
                jupyter=True)

Token     Lemma     Tag       POS       DependencyParent    
John      John      NNP       PROPN     nsubj     going     
is        be        VBZ       AUX       aux       going     
going     go        VBG       VERB      ROOT      going     
on        on        IN        ADP       prep      going     
a         a         DT        DET       det       trip      
trip      trip      NN        NOUN      pobj      on        
.         .         .         PUNCT     punct     going     


In [6]:
sent_part = nlp("What is going on here?")

print("{:10}{:10}{:10}{:10}{:10}{:10}".format("Token", "Lemma", "Tag", "POS", "Dependency", "Parent"))
for token in sent_part:
  print("{:10}{:10}{:10}{:10}{:10}{:10}".format(token.text, token.lemma_, token.tag_, token.pos_, token.dep_, token.head.text))

displacy.render(sent_part,
                style="dep",
                options={"collapse_punct": False, "distance": 110, "fine_grained": True},
                jupyter=True)

Token     Lemma     Tag       POS       DependencyParent    
What      what      WP        PRON      nsubj     going     
is        be        VBZ       AUX       aux       going     
going     go        VBG       VERB      ROOT      going     
on        on        RP        ADP       prt       going     
here      here      RB        ADV       advmod    going     
?         ?         .         PUNCT     punct     going     


### STEP 3. Get Subcorpus -> Phrasal Verbs

In [None]:
# load corpus files

tedfile_en = '/content/gdrive/MyDrive/TFM/ted_en.txt'
tedfile_de = '/content/gdrive/MyDrive/TFM/ted_de.txt'

def read_file (file):
  with open(file, 'r') as f:
    content = [line.rstrip() for line in f]
  return content

ted_en = read_file(tedfile_en)
ted_de = read_file(tedfile_de)

In [None]:
# build subcorpus

def get_phrasalverbs_subcorpus (corpus):
  """Receive a corpus of sentences, return a subcorpus of sentences with phrasal verbs and the list of indices"""
  
  target_sentences = []
  target_indices = []
  
  for i in range (len(corpus)):
    sentence = nlp(corpus[i])
    for token in sentence:
      if token.tag_ == 'RP' and token.dep_ == "prt" and token.head.pos_ == "VERB":
        target_sentences.append(corpus[i])
        target_indices.append(i)
  
  return target_sentences, target_indices


# find English sentences with phrasal verbs and their indices
ted_en_subcorpus, ted_subcorpus_indices = get_phrasalverbs_subcorpus(ted_en)

# retrieve corresponding German sentences by indices
ted_de_subcorpus = []
for i in ted_subcorpus_indices:
  ted_de_subcorpus.append(ted_de[i])

In [13]:
# save results to pandas dataframe

import pandas as pd

teddata = pd.DataFrame(list(zip(ted_subcorpus_indices, ted_en_subcorpus, ted_de_subcorpus)),
                      columns =['Index', 'Original EN Sentence', 'DE Counterpart Sentence'])

teddata.to_excel(r'/content/gdrive/MyDrive/TFM/tedsubcorpus.xlsx')

### STEP 4. Translate DE to EN with transformer.wmt19.de-en

In [1]:
# packages

import torch
import pandas as pd
! pip install fairseq fastBPE sacremoses

In [None]:
# load the model

de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt',
                       tokenizer='moses', bpe='fastbpe')

In [5]:
# load the data

teddata = pd.read_excel('/content/gdrive/MyDrive/TFM/tedsubcorpus.xlsx', index_col=0)
teddata.head()

Unnamed: 0,Index,Original EN Sentence,DE Counterpart Sentence
0,1,I had somebody count the number of books with ...,Ich habe einige Leute die Anzahl der Bücher zä...
1,7,This applies to laypeople thinking about their...,"Das gilt für Laien , die über ihr eigenes Glüc..."
2,7,This applies to laypeople thinking about their...,"Das gilt für Laien , die über ihr eigenes Glüc..."
3,9,It turns out that the word happiness is just n...,"Es stellt sich heraus , dass das Wort Glück ei..."
4,10,I think there is one particular meaning to whi...,"Ich denke , es gibt eine bestimmte Bedeutung ,..."


In [None]:
# translate from DE to EN, add to the dataframe, save to an excel file

ted_de = teddata['DE Counterpart Sentence'].tolist()
ted_de2en = de2en.translate(ted_de)
teddata['System Generated DE -> EN Sentence'] = ted_de2en

teddata.to_excel(r'/content/gdrive/MyDrive/TFM/tedtranslated.xlsx')