In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import StanfordTagger
from Data_Import import import_data
from Data_Cleansing import data_cleansing
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht

#### Load data into Dataframe

In [2]:
# import data
df=import_data()

#### Data Cleansing: Tokenize by white spaces, lowercase, stopword removal

In [3]:
### get first 10.000 entries to test and get features
df_selected = df.sort_index().loc[0:10000]

In [4]:
pd.set_option('display.max_colwidth', 50)
df_selected

Unnamed: 0,English,German
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u..."
2,"Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte..."
3,You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...
4,"In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...
...,...,...
9996,"For example, to accept new roads is to accept ...","Wenn man zum Beispiel neue Straßen akzeptiert,..."
9997,I would also ask the Commission to ensure that...,"Ich fordere auch, daß die Kommission bei der P..."
9998,I am particularly thinking in this respect of ...,Ich denke dabei besonders an bestimmte in der ...
9999,We know that some Member States are still wave...,Bekanntlich haben sich einige Mitgliedstaaten ...


In [5]:
df_sel=data_cleansing(df_selected)

In [6]:
df_sel

Unnamed: 0,English,German
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]"
1,"[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17., dezember, unterbroc..."
2,"[although, ,, seen, ,, dreaded, 'millennium, b...","[feststellen, konnten, ,, gefürchtete, ``, mil..."
3,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla..."
4,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen..."
...,...,...
9996,"[example, ,, accept, new, roads, accept, new, ...","[beispiel, neue, straßen, akzeptiert, ,, akzep..."
9997,"[would, also, ask, commission, ensure, environ...","[fordere, ,, kommission, projektauswahl, darau..."
9998,"[particularly, thinking, respect, certain, inf...","[denke, dabei, besonders, bestimmte, planung, ..."
9999,"[know, member, states, still, wavering, ,, exa...","[bekanntlich, mitgliedstaaten, bahn, straße, d..."


#### Feature Generation

In [7]:
# import string library function 
import string 
#get number of punctuation marks as feature
df_selected['PM_eng']=df_selected['English'].apply(lambda x: len([word for word in x if word in string.punctuation]))
df_selected['PM_ger']=df_selected['German'].apply(lambda x: len([word for word in x if word in string.punctuation]))
#get differences of punctuation marks absolute and relative with respect to smaller number as base
df_selected['PM_dif_abs']=np.abs(df_selected['PM_eng']-df_selected['PM_ger'])
df_selected['PM_dif_rel']=df_selected['PM_dif_abs']/np.minimum(df_selected['PM_eng'],df_selected['PM_ger'])
#set NaNs to zero
df_selected['PM_dif_rel'] = df_selected['PM_dif_rel'].replace(np.nan, 0)
#get number of words as feature
df_selected['Words_eng']=df_selected['English'].apply(lambda x: len([word for word in x if word not in string.punctuation]))
df_selected['Words_ger']=df_selected['German'].apply(lambda x: len([word for word in x if word not in string.punctuation]))
#get differences of words absolute and relative with respect to smaller number as base
df_selected['Words_dif_abs']=np.abs(df_selected['Words_eng']-df_selected['Words_ger'])
df_selected['Words_dif_rel']=df_selected['Words_dif_abs']/np.minimum(df_selected['Words_eng'],df_selected['Words_ger'])
#set NaNs to zero
df_selected['Words_dif_rel'] = df_selected['Words_dif_rel'].replace(np.nan, 0)

In [8]:
#number of question marks
marks='?'
df_selected['ger_ques']=df_selected['German'].apply(lambda x: len([word for word in x if word in marks]))
df_selected['eng_ques']=df_selected['English'].apply(lambda x: len([word for word in x if word in marks]))
#number of exclamation marks
marks='!'
df_selected['ger_exc']=df_selected['German'].apply(lambda x: len([word for word in x if word in marks]))
df_selected['eng_exc']=df_selected['English'].apply(lambda x: len([word for word in x if word in marks]))


In [9]:
#get number of characters in words and the average char per word
df_selected['char_eng']=df_selected['English'].apply(lambda x: len(str([word for word in x if word not in string.punctuation])))
df_selected['char_eng_avg']=df_selected['char_eng']/df_selected['Words_eng']
df_selected['char_ger']=df_selected['German'].apply(lambda x: len(str([word for word in x if word not in string.punctuation])))
df_selected['char_ger_avg']=df_selected['char_ger']/df_selected['Words_ger']
#absolute difference between avg
df_selected['char_avg_abs']=df_selected['char_ger_avg']-df_selected['char_eng_avg']

In [10]:
#use pos-tagger and get number of nouns, verbs, adjectives, tagset universal to only get the highest categories and 
df_selected['English_pos']=df_selected.apply(lambda x: nltk.pos_tag(x['English'],tagset='universal'), axis=1)

#pos-tagging with nltk not suppored in german yet -> use pretrained model for german as tagger
##pretrained model for german as tagger
#Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization. In: Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.

tagger = ht.HanoverTagger('morphmodel_ger.pgz')
ger_pos=[]
for i in df_selected.itertuples():
    ger_pos.append(list(map(lambda x: tagger.analyze(x),i.German)))
df_selected['German_pos']=ger_pos

In [12]:
# df_selected.to_csv('Feature_Generation.csv',index=False)

In [13]:
# count=[]
# universal_pos=['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','X']
# for i in df_selected.itertuples():
#     count.append(nltk.FreqDist(tag for (word, tag) in i.German_pos).most_common())
# print(universal_pos)

['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', 'X']


In [4]:
df_selected=pd.read_csv('Feature_Generation.csv')

In [5]:
df_selected

Unnamed: 0,English,German,PM_eng,PM_ger,PM_dif_abs,PM_dif_rel,Words_eng,Words_ger,Words_dif_abs,Words_dif_rel,...,eng_ques,ger_exc,eng_exc,char_eng,char_eng_avg,char_ger,char_ger_avg,char_avg_abs,English_pos,German_pos
0,"['resumption', 'session']","['wiederaufnahme', 'sitzungsperiode']",0,0,0,0.000000,2,2,0,0.000000,...,0,0,0,25,12.500000,37,18.500000,6.000000,"[('resumption', 'NOUN'), ('session', 'NOUN')]","[('Wiederaufnahme', 'NN'), ('Sitzungsperiode',..."
1,"['declare', 'resumed', 'session', 'european', ...","['erkläre', 'freitag', ',', '17.', 'dezember',...",2,4,2,1.000000,21,16,5,0.312500,...,0,0,0,209,9.952381,205,12.812500,2.860119,"[('declare', 'NOUN'), ('resumed', 'VERB'), ('s...","[('erklären', 'VVFIN'), ('Freitag', 'NN'), ('-..."
2,"['although', ',', 'seen', ',', 'dreaded', ""'mi...","['feststellen', 'konnten', ',', 'gefürchtete',...",5,3,2,0.666667,17,14,3,0.214286,...,0,0,0,187,11.000000,184,13.142857,2.142857,"[('although', 'ADP'), (',', '.'), ('seen', 'VE...","[('feststellen', 'VVINF'), ('können', 'VMFIN')..."
3,"['requested', 'debate', 'subject', 'course', '...","['parlament', 'besteht', 'wunsch', 'aussprache...",2,1,1,1.000000,7,8,1,0.142857,...,0,0,0,76,10.857143,99,12.375000,1.517857,"[('requested', 'VERB'), ('debate', 'NOUN'), ('...","[('Parlament', 'NN'), ('bestehen', 'VVPP'), ('..."
4,"['meantime', ',', 'like', 'observe', 'minute',...","['heute', 'möchte', 'bitten', '-', 'wunsch', '...",7,6,1,0.166667,18,15,3,0.200000,...,0,0,0,203,11.277778,185,12.333333,1.055556,"[('meantime', 'ADV'), (',', '.'), ('like', 'AD...","[('heute', 'ADV'), ('möchten', 'VMFIN'), ('bit..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,"['example', ',', 'accept', 'new', 'roads', 'ac...","['beispiel', 'neue', 'straßen', 'akzeptiert', ...",4,5,1,0.250000,19,18,1,0.055556,...,0,0,0,196,10.315789,214,11.888889,1.573099,"[('example', 'NOUN'), (',', '.'), ('accept', '...","[('Beispiel', 'NN'), ('neu', 'ADJA'), ('Straße..."
9997,"['would', 'also', 'ask', 'commission', 'ensure...","['fordere', ',', 'kommission', 'projektauswahl...",2,5,3,1.500000,22,19,3,0.157895,...,0,0,0,235,10.681818,244,12.842105,2.160287,"[('would', 'VERB'), ('also', 'ADV'), ('ask', '...","[('fordern', 'VVFIN'), ('--', '$,'), ('Kommiss..."
9998,"['particularly', 'thinking', 'respect', 'certa...","['denke', 'dabei', 'besonders', 'bestimmte', '...",1,1,0,0.000000,6,7,1,0.166667,...,0,0,0,85,14.166667,89,12.714286,-1.452381,"[('particularly', 'ADV'), ('thinking', 'VERB')...","[('denken', 'VVFIN'), ('dabei', 'PROAV'), ('be..."
9999,"['know', 'member', 'states', 'still', 'waverin...","['bekanntlich', 'mitgliedstaaten', 'bahn', 'st...",4,1,3,3.000000,14,11,3,0.272727,...,0,0,0,139,9.928571,146,13.272727,3.344156,"[('know', 'ADJ'), ('member', 'NOUN'), ('states...","[('bekanntlich', 'ADV'), ('Mitgliedstaat', 'NN..."


In [7]:
universal_pos=['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','X']
for u_tag in universal_pos:
    df_selected[f'en_{u_tag}'] = df_selected['English_pos'].apply(lambda row: nltk.FreqDist(tag for (word, tag) in row if tag==u_tag)[u_tag])

ValueError: not enough values to unpack (expected 2, got 1)

Unnamed: 0,English,German,PM_eng,PM_ger,PM_dif_abs,PM_dif_rel,Words_eng,Words_ger,Words_dif_abs,Words_dif_rel,...,en_ADP,en_ADV,en_CONJ,en_DET,en_NOUN,en_NUM,en_PRT,en_PRON,en_VERB,en_X
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",0,0,0,0.000000,2,2,0,0.000000,...,0,0,0,0,2,0,0,0,0,0
1,"[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17., dezember, unterbroc...",2,4,2,1.000000,21,16,5,0.312500,...,0,1,0,0,6,2,0,0,7,0
2,"[although, ,, seen, ,, dreaded, 'millennium, b...","[feststellen, konnten, ,, gefürchtete, ``, mil...",5,3,2,0.666667,17,14,3,0.214286,...,1,2,0,0,7,1,0,0,4,0
3,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...",2,1,1,1.000000,7,8,1,0.142857,...,0,0,0,0,4,0,0,0,1,0
4,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...",7,6,1,0.166667,18,15,3,0.200000,...,1,2,0,0,9,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,"[example, ,, accept, new, roads, accept, new, ...","[beispiel, neue, straßen, akzeptiert, ,, akzep...",4,5,1,0.250000,19,18,1,0.055556,...,1,1,0,0,10,0,0,0,4,0
9997,"[would, also, ask, commission, ensure, environ...","[fordere, ,, kommission, projektauswahl, darau...",2,5,3,1.500000,22,19,3,0.157895,...,0,1,0,0,9,0,1,0,8,0
9998,"[particularly, thinking, respect, certain, inf...","[denke, dabei, besonders, bestimmte, planung, ...",1,1,0,0.000000,6,7,1,0.166667,...,0,1,0,0,3,0,0,0,1,0
9999,"[know, member, states, still, wavering, ,, exa...","[bekanntlich, mitgliedstaaten, bahn, straße, d...",4,1,3,3.000000,14,11,3,0.272727,...,0,1,0,0,8,0,0,0,2,0


In [56]:
count

[[('NN', 2)],
 [('NN', 6),
  ('ADJA', 4),
  ('VVFIN', 3),
  ('$,', 3),
  ('ADJD', 1),
  ('VVPP', 1),
  ('ADV', 1),
  ('$.', 1)],
 [('NN', 5),
  ('ADJA', 2),
  ('FM', 2),
  ('$.', 2),
  ('VVINF', 1),
  ('VMFIN', 1),
  ('$,', 1),
  ('ADJD', 1),
  ('PPOSAT', 1),
  ('VAPP', 1)],
 [('NN', 5), ('VVPP', 1), ('VVFIN', 1), ('ADJA', 1), ('$.', 1)],
 [('NN', 8),
  ('$,', 3),
  ('ADV', 2),
  ('$(', 2),
  ('ADJA', 2),
  ('VMFIN', 1),
  ('VVINF', 1),
  ('VVFIN', 1),
  ('$.', 1)],
 [('ADV', 1), ('$,', 1), ('NN', 1), ('VVINF', 1), ('$.', 1)],
 [('$(', 2), ('NN', 2), ('VVPP', 1), ('$.', 1)],
 [('NN', 3), ('$,', 1), ('$.', 1)],
 [('NN', 4),
  ('NE', 2),
  ('ADJA', 2),
  ('ADV', 1),
  ('$,', 1),
  ('VVFIN', 1),
  ('PIAT', 1),
  ('$.', 1)],
 [('NN', 6),
  ('NE', 4),
  ('$,', 3),
  ('ADJA', 2),
  ('VVFIN', 2),
  ('VVINF', 1),
  ('ADV', 1),
  ('PIAT', 1),
  ('VVPP', 1),
  ('$.', 1)],
 [('NN', 12),
  ('$,', 5),
  ('NE', 5),
  ('VVINF', 4),
  ('ADJA', 4),
  ('VAFIN', 1),
  ('ADJD', 1),
  ('$.', 1)],
 [('$,', 

In [42]:
# ##show problems of question and exclamations marks
# pd.set_option('display.max_colwidth', 500)
# df[2476:2477]
# df[2839:2840]
# df[3091:3092]

Unnamed: 0,English,German
3091,"Could the Commission say whether the Greek Government has made any specific commitments on how to tackle the problems of youth and long-term unemployment, and what these commitments are? Has the government legislated for and put into place a suitable system for pinpointing, recording and monitoring fluctuations in unemployment, or are most of the measures perhaps still limited to counting those out of work?","Kann die Kommission mitteilen, ob die griechische Regierung konkrete Verpflichtungen B und welche B hinsichtlich der Art und Weise übernommen hat, wie das Problem der Jugendarbeitslosigkeit und der Langzeitarbeitslosigkeit zu bewältigen ist und ob sie ein geeignetes System zur Lokalisierung, Registrierung und Kontrolle der Fluktuation der Arbeitslosigkeit festgesetzt hat und umsetzt oder ob sich die meisten Maßnahmen weiterhin auf die Aufzählung der Arbeitslosigkeit beschränken?"


FreqDist({'NOUN': 2, '.': 2, 'ADJ': 1})

Unnamed: 0,English,German
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u..."
