In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import StanfordTagger
from src.data.import_data import import_data
from src.data.clean_data import data_cleansing
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht


#### Load data into Dataframe

In [2]:
# import data
df=import_data()

#### Data Cleansing: Tokenize by white spaces, lowercase, stopword removal

In [3]:
### get first 10.000 entries to test and get features
df_selected = df.sort_index().loc[0:10000]

In [4]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)
df_selected.head(100)

Unnamed: 0,English,German
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fest...","Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie..."
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Wie Sie feststellen konnten, ist der gefürchtete ""Millenium-Bug "" nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden."
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.",Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen.
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the vari...","Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen -, allen Opfern der Stürme, insbesondere in den verschiedenen Ländern der Europäischen Union, in einer Schwei..."
...,...,...
95,There was a vote on this matter.,Es gab eine Abstimmung zu diesem Punkt.
96,"As I recall, the outcome of this vote was 422 votes to 180 with a few abstentions.",Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: 422 gegen 180 Stimmen bei einigen wenigen Enthaltungen.
97,"This means that all the Groups with the exception of the non-attached Members - but, of course, they are not a Group - were in agreement; only your Group thought that we should proceed as you have...","Das heißt, alle Fraktionen, mit Ausnahme der Fraktionslosen - aber die sind ja keine Fraktion - waren sich einig, nur Ihre Fraktion war der Meinung, so zu verfahren, wie Sie es hier vorgeschlagen ..."
98,All of the others were of a different opinion.,Alle anderen waren anderer Meinung.


In [5]:
df_sel=data_cleansing(df_selected)

In [6]:
df_sel

Unnamed: 0,English,German,SWords_eng,SWords_ger
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",2,1
1,"[declare, resumed, session, european, parliament, adjourned, friday, 17, december, 1999, ,, would, like, wish, happy, new, year, hope, enjoyed, pleasant, festive, period, .]","[erkläre, freitag, ,, 17., dezember, unterbrochene, sitzungsperiode, europäischen, parlaments, wiederaufgenommen, ,, wünsche, nochmals, gute, jahreswechsel, hoffe, ,, schöne, ferien, .]",17,13
2,"[although, ,, seen, ,, dreaded, 'millennium, bug, ', failed, materialise, ,, still, people, number, countries, suffered, series, natural, disasters, truly, dreadful, .]","[feststellen, konnten, ,, gefürchtete, ``, millenium-bug, ``, eingetreten, ., bürger, unserer, mitgliedstaaten, opfer, schrecklichen, naturkatastrophen, geworden, .]",14,9
3,"[requested, debate, subject, course, next, days, ,, part-session, .]","[parlament, besteht, wunsch, aussprache, verlauf, sitzungsperiode, nächsten, tagen, .]",12,8
4,"[meantime, ,, like, observe, minute, ', silence, ,, number, members, requested, ,, behalf, victims, concerned, ,, particularly, terrible, storms, ,, various, countries, european, union, .]","[heute, möchte, bitten, -, wunsch, kolleginnen, kollegen, -, ,, opfern, stürme, ,, insbesondere, verschiedenen, ländern, europäischen, union, ,, schweigeminute, gedenken, .]",22,16
...,...,...,...,...
9996,"[example, ,, accept, new, roads, accept, new, pollution, within, european, union, ., goes, totally, policies, proposing, fight, greenhouse, gas, emissions, ,, example, .]","[beispiel, neue, straßen, akzeptiert, ,, akzeptiert, neue, immissionen, europäischen, union, ,, widerspricht, völlig, politik, ,, z, ., b., bekämpfung, emission, treibhausgasen, vorschlagen, .]",13,15
9997,"[would, also, ask, commission, ensure, environmental, pillar, eu, ', policy, integrated, projects, submitted, goal, reducing, co2, example, forms, experimental, added, value, projects, proposed, .]","[fordere, ,, kommission, projektauswahl, darauf, achtet, ,, umweltpolitische, säule, eu-politik, projekte, integriert, ,, z, ., b., ziel, co2-reduktion, spezieller, mehrwert, vorgeschlagenen, proj...",19,19
9998,"[particularly, thinking, respect, certain, infrastructures, construction, .]","[denke, dabei, besonders, bestimmte, planung, befindliche, infrastrukturen, .]",8,4
9999,"[know, member, states, still, wavering, ,, example, ,, rail, road, routes, vulnerable, areas, pyrenees, ,, aspe, valley, .]","[bekanntlich, mitgliedstaaten, bahn, straße, durchquerung, anfälligen, gebieten, pyrenäen, speziell, aspe-tals, entschieden, .]",15,15


#### Feature Generation

In [7]:
# import string library function 
import string 
#get number of punctuation marks as feature, but drop the end of senctence points
list_pm=list(string.punctuation)
list_pm.remove('.')
list_pm.append('...')
df_selected['PM_eng']=df_selected['English'].apply(lambda x: len([word for word in x if word in list_pm]))
df_selected['PM_ger']=df_selected['German'].apply(lambda x: len([word for word in x if word in list_pm]))
#get differences of punctuation marks absolute and relative with respect to smaller number as base
df_selected['PM_dif_abs']=np.abs(df_selected['PM_eng']-df_selected['PM_ger'])
df_selected['PM_dif_rel']=df_selected['PM_dif_abs']/np.minimum(df_selected['PM_eng'],df_selected['PM_ger'])
#set NaNs to zero
df_selected['PM_dif_rel'] = df_selected['PM_dif_rel'].replace(np.nan, 0)
#get number of words as feature
df_selected['Words_eng']=df_selected['English'].apply(lambda x: len([word for word in x if word not in string.punctuation]))
df_selected['Words_ger']=df_selected['German'].apply(lambda x: len([word for word in x if word not in string.punctuation]))
#get differences of words absolute and relative with respect to smaller number as base
df_selected['Words_dif_abs']=np.abs(df_selected['Words_eng']-df_selected['Words_ger'])
df_selected['Words_dif_rel']=df_selected['Words_dif_abs']/np.minimum(df_selected['Words_eng'],df_selected['Words_ger'])
#set NaNs to zero
df_selected['Words_dif_rel'] = df_selected['Words_dif_rel'].replace(np.nan, 0)
#get number of unique words
df_selected['Words_eng_unique']=df_selected['English'].apply(lambda x: len(np.unique([word for word in x if word not in string.punctuation])))
df_selected['Words_ger_unique']=df_selected['German'].apply(lambda x: len(np.unique([word for word in x if word not in string.punctuation])))

In [8]:
df_selected

Unnamed: 0,English,German,SWords_eng,SWords_ger,PM_eng,PM_ger,PM_dif_abs,PM_dif_rel,Words_eng,Words_ger,Words_dif_abs,Words_dif_rel,Words_eng_unique,Words_ger_unique
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",2,1,0,0,0,0.0,2,2,0,0.000000,2,2
1,"[declare, resumed, session, european, parliament, adjourned, friday, 17, december, 1999, ,, would, like, wish, happy, new, year, hope, enjoyed, pleasant, festive, period, .]","[erkläre, freitag, ,, 17., dezember, unterbrochene, sitzungsperiode, europäischen, parlaments, wiederaufgenommen, ,, wünsche, nochmals, gute, jahreswechsel, hoffe, ,, schöne, ferien, .]",17,13,1,3,2,2.0,21,16,5,0.312500,21,16
2,"[although, ,, seen, ,, dreaded, 'millennium, bug, ', failed, materialise, ,, still, people, number, countries, suffered, series, natural, disasters, truly, dreadful, .]","[feststellen, konnten, ,, gefürchtete, ``, millenium-bug, ``, eingetreten, ., bürger, unserer, mitgliedstaaten, opfer, schrecklichen, naturkatastrophen, geworden, .]",14,9,4,1,3,3.0,17,14,3,0.214286,17,13
3,"[requested, debate, subject, course, next, days, ,, part-session, .]","[parlament, besteht, wunsch, aussprache, verlauf, sitzungsperiode, nächsten, tagen, .]",12,8,1,0,1,inf,7,8,1,0.142857,7,8
4,"[meantime, ,, like, observe, minute, ', silence, ,, number, members, requested, ,, behalf, victims, concerned, ,, particularly, terrible, storms, ,, various, countries, european, union, .]","[heute, möchte, bitten, -, wunsch, kolleginnen, kollegen, -, ,, opfern, stürme, ,, insbesondere, verschiedenen, ländern, europäischen, union, ,, schweigeminute, gedenken, .]",22,16,6,5,1,0.2,18,15,3,0.200000,18,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,"[example, ,, accept, new, roads, accept, new, pollution, within, european, union, ., goes, totally, policies, proposing, fight, greenhouse, gas, emissions, ,, example, .]","[beispiel, neue, straßen, akzeptiert, ,, akzeptiert, neue, immissionen, europäischen, union, ,, widerspricht, völlig, politik, ,, z, ., b., bekämpfung, emission, treibhausgasen, vorschlagen, .]",13,15,2,3,1,0.5,19,18,1,0.055556,16,16
9997,"[would, also, ask, commission, ensure, environmental, pillar, eu, ', policy, integrated, projects, submitted, goal, reducing, co2, example, forms, experimental, added, value, projects, proposed, .]","[fordere, ,, kommission, projektauswahl, darauf, achtet, ,, umweltpolitische, säule, eu-politik, projekte, integriert, ,, z, ., b., ziel, co2-reduktion, spezieller, mehrwert, vorgeschlagenen, proj...",19,19,1,3,2,2.0,22,19,3,0.157895,21,19
9998,"[particularly, thinking, respect, certain, infrastructures, construction, .]","[denke, dabei, besonders, bestimmte, planung, befindliche, infrastrukturen, .]",8,4,0,0,0,0.0,6,7,1,0.166667,6,7
9999,"[know, member, states, still, wavering, ,, example, ,, rail, road, routes, vulnerable, areas, pyrenees, ,, aspe, valley, .]","[bekanntlich, mitgliedstaaten, bahn, straße, durchquerung, anfälligen, gebieten, pyrenäen, speziell, aspe-tals, entschieden, .]",15,15,3,0,3,inf,14,11,3,0.272727,14,11


In [9]:
#get number of different punctuation marks without the end of sentence point
list_pm=list(string.punctuation)
list_pm.remove('.')
list_pm.append('...')
for mark in list_pm:
    df_selected[f'en_{mark}']=df_selected['English'].apply(lambda x: len([word for word in x if word==mark]))
    df_selected[f'ger_{mark}']=df_selected['German'].apply(lambda x: len([word for word in x if word==mark]))

In [10]:
#get number of characters in words and the average char per word
df_selected['char_eng']=df_selected['English'].apply(lambda x: len(str([word for word in x if word not in string.punctuation])))
df_selected['char_eng_avg']=df_selected['char_eng']/df_selected['Words_eng']
df_selected['char_ger']=df_selected['German'].apply(lambda x: len(str([word for word in x if word not in string.punctuation])))
df_selected['char_ger_avg']=df_selected['char_ger']/df_selected['Words_ger']
#absolute difference between avg
df_selected['char_avg_abs']=df_selected['char_ger_avg']-df_selected['char_eng_avg']

In [11]:
#use pos-tagger and get number of nouns, verbs, adjectives, tagset universal to only get the highest categories and 
df_selected['English_pos']=df_selected.apply(lambda x: nltk.pos_tag(x['English'],tagset='universal'), axis=1)

#pos-tagging with nltk not suppored in german yet -> use pretrained model for german as tagger
##pretrained model for german as tagger
#Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization. In: Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.

tagger = ht.HanoverTagger('morphmodel_ger.pgz')
ger_pos=[]
for i in df_selected.itertuples():
    ger_pos.append(list(map(lambda x: tagger.analyze(x),i.German)))
df_selected['German_pos']=ger_pos

In [12]:
df_selected

Unnamed: 0,English,German,SWords_eng,SWords_ger,PM_eng,PM_ger,PM_dif_abs,PM_dif_rel,Words_eng,Words_ger,Words_dif_abs,Words_dif_rel,Words_eng_unique,Words_ger_unique,en_!,ger_!,"en_""","ger_""",en_#,ger_#,en_$,ger_$,en_%,ger_%,en_&,ger_&,en_',ger_',en_(,ger_(,en_),ger_),en_*,ger_*,en_+,ger_+,"en_,","ger_,",en_-,ger_-,en_/,ger_/,en_:,ger_:,en_;,ger_;,en_<,ger_<,en_=,ger_=,en_>,ger_>,en_?,ger_?,en_@,ger_@,en_[,ger_[,en_\,ger_\,en_],ger_],en_^,ger_^,en__,ger__,en_`,ger_`,en_{,ger_{,en_|,ger_|,en_},ger_},en_~,ger_~,en_...,ger_...,char_eng,char_eng_avg,char_ger,char_ger_avg,char_avg_abs,English_pos,German_pos
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",2,1,0,0,0,0.0,2,2,0,0.000000,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,12.500000,37,18.500000,6.000000,"[(resumption, NOUN), (session, NOUN)]","[(Wiederaufnahme, NN), (Sitzungsperiode, NN)]"
1,"[declare, resumed, session, european, parliament, adjourned, friday, 17, december, 1999, ,, would, like, wish, happy, new, year, hope, enjoyed, pleasant, festive, period, .]","[erkläre, freitag, ,, 17., dezember, unterbrochene, sitzungsperiode, europäischen, parlaments, wiederaufgenommen, ,, wünsche, nochmals, gute, jahreswechsel, hoffe, ,, schöne, ferien, .]",17,13,1,3,2,2.0,21,16,5,0.312500,21,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,209,9.952381,205,12.812500,2.860119,"[(declare, NOUN), (resumed, VERB), (session, NOUN), (european, ADJ), (parliament, NOUN), (adjourned, VERB), (friday, ADV), (17, NUM), (december, NOUN), (1999, NUM), (,, .), (would, VERB), (like, V...","[(erklären, VVFIN), (Freitag, NN), (--, $,), (17., ADJD), (Dezember, NN), (unterbrochen, ADJA), (Sitzungsperiode, NN), (europäisch, ADJA), (Parlament, NN), (wiederaufnehmen, VVPP), (--, $,), (wüns..."
2,"[although, ,, seen, ,, dreaded, 'millennium, bug, ', failed, materialise, ,, still, people, number, countries, suffered, series, natural, disasters, truly, dreadful, .]","[feststellen, konnten, ,, gefürchtete, ``, millenium-bug, ``, eingetreten, ., bürger, unserer, mitgliedstaaten, opfer, schrecklichen, naturkatastrophen, geworden, .]",14,9,4,1,3,3.0,17,14,3,0.214286,17,13,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,187,11.000000,184,13.142857,2.142857,"[(although, ADP), (,, .), (seen, VERB), (,, .), (dreaded, VERB), ('millennium, NUM), (bug, NOUN), (', .), (failed, VERB), (materialise, NOUN), (,, .), (still, ADV), (people, NOUN), (number, NOUN),...","[(feststellen, VVINF), (können, VMFIN), (--, $,), (gefürchtet, ADJA), (``, FM), (Millenium-bug, NN), (``, FM), (eingetreten, ADJD), (--, $.), (Bürger, NN), (unser, PPOSAT), (Mitgliedstaat, NN), (O..."
3,"[requested, debate, subject, course, next, days, ,, part-session, .]","[parlament, besteht, wunsch, aussprache, verlauf, sitzungsperiode, nächsten, tagen, .]",12,8,1,0,1,inf,7,8,1,0.142857,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76,10.857143,99,12.375000,1.517857,"[(requested, VERB), (debate, NOUN), (subject, ADJ), (course, NOUN), (next, ADJ), (days, NOUN), (,, .), (part-session, NOUN), (., .)]","[(Parlament, NN), (bestehen, VVPP), (Wunsch, NN), (aussprechen, VVFIN), (Verlauf, NN), (Sitzungsperiode, NN), (nächster, ADJA), (Tag, NN), (--, $.)]"
4,"[meantime, ,, like, observe, minute, ', silence, ,, number, members, requested, ,, behalf, victims, concerned, ,, particularly, terrible, storms, ,, various, countries, european, union, .]","[heute, möchte, bitten, -, wunsch, kolleginnen, kollegen, -, ,, opfern, stürme, ,, insbesondere, verschiedenen, ländern, europäischen, union, ,, schweigeminute, gedenken, .]",22,16,6,5,1,0.2,18,15,3,0.200000,18,15,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,203,11.277778,185,12.333333,1.055556,"[(meantime, ADV), (,, .), (like, ADP), (observe, VERB), (minute, NOUN), (', .), (silence, NOUN), (,, .), (number, NOUN), (members, NOUN), (requested, VERB), (,, .), (behalf, NOUN), (victims, NOUN)...","[(heute, ADV), (möchten, VMFIN), (bitten, VVINF), (--, $(), (Wunsch, NN), (Kollegin, NN), (Kollege, NN), (--, $(), (--, $,), (Opfer, NN), (stürmen, VVFIN), (--, $,), (insbesondere, ADV), (verschie..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,"[example, ,, accept, new, roads, accept, new, pollution, within, european, union, ., goes, totally, policies, proposing, fight, greenhouse, gas, emissions, ,, example, .]","[beispiel, neue, straßen, akzeptiert, ,, akzeptiert, neue, immissionen, europäischen, union, ,, widerspricht, völlig, politik, ,, z, ., b., bekämpfung, emission, treibhausgasen, vorschlagen, .]",13,15,2,3,1,0.5,19,18,1,0.055556,16,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,10.315789,214,11.888889,1.573099,"[(example, NOUN), (,, .), (accept, VERB), (new, ADJ), (roads, NOUN), (accept, VERB), (new, ADJ), (pollution, NOUN), (within, ADP), (european, ADJ), (union, NOUN), (., .), (goes, VERB), (totally, A...","[(Beispiel, NN), (neu, ADJA), (Straße, NN), (akzeptieren, VVFIN), (--, $,), (akzeptieren, VVFIN), (neu, ADJA), (Immission, NN), (europäisch, ADJA), (Union, NN), (--, $,), (widersprechen, VVFIN), (..."
9997,"[would, also, ask, commission, ensure, environmental, pillar, eu, ', policy, integrated, projects, submitted, goal, reducing, co2, example, forms, experimental, added, value, projects, proposed, .]","[fordere, ,, kommission, projektauswahl, darauf, achtet, ,, umweltpolitische, säule, eu-politik, projekte, integriert, ,, z, ., b., ziel, co2-reduktion, spezieller, mehrwert, vorgeschlagenen, proj...",19,19,1,3,2,2.0,22,19,3,0.157895,21,19,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,10.681818,244,12.842105,2.160287,"[(would, VERB), (also, ADV), (ask, VERB), (commission, NOUN), (ensure, VERB), (environmental, ADJ), (pillar, ADJ), (eu, NOUN), (', PRT), (policy, NOUN), (integrated, VERB), (projects, NOUN), (subm...","[(fordern, VVFIN), (--, $,), (Kommission, NN), (Projektauswahl, NN), (darauf, PROAV), (achten, VVFIN), (--, $,), (umweltpolitisch, ADJA), (Säule, NN), (Eu-politik, NN), (Projekt, NN), (integrieren..."
9998,"[particularly, thinking, respect, certain, infrastructures, construction, .]","[denke, dabei, besonders, bestimmte, planung, befindliche, infrastrukturen, .]",8,4,0,0,0,0.0,6,7,1,0.166667,6,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,14.166667,89,12.714286,-1.452381,"[(particularly, ADV), (thinking, VERB), (respect, NOUN), (certain, ADJ), (infrastructures, NOUN), (construction, NOUN), (., .)]","[(denken, VVFIN), (dabei, PROAV), (besonders, ADV), (bestimmen, VVFIN), (Planung, NN), (befindlich, ADJA), (Infrastruktur, NN), (--, $.)]"
9999,"[know, member, states, still, wavering, ,, example, ,, rail, road, routes, vulnerable, areas, pyrenees, ,, aspe, valley, .]","[bekanntlich, mitgliedstaaten, bahn, straße, durchquerung, anfälligen, gebieten, pyrenäen, speziell, aspe-tals, entschieden, .]",15,15,3,0,3,inf,14,11,3,0.272727,14,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,9.928571,146,13.272727,3.344156,"[(know, ADJ), (member, NOUN), (states, NOUN), (still, ADV), (wavering, VERB), (,, .), (example, NOUN), (,, .), (rail, VERB), (road, NOUN), (routes, NOUN), (vulnerable, ADJ), (areas, NOUN), (pyrene...","[(bekanntlich, ADV), (Mitgliedstaat, NN), (Bahn, NN), (Straße, NN), (Durchquerung, NN), (anfällig, ADJA), (Gebiet, NN), (Pyrenäen, NE), (speziell, ADJD), (aspe-tals, ADV), (entschieden, ADJD), (--..."


In [13]:
universal_pos=['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','X']
for u_tag in universal_pos:
    df_selected[f'en_{u_tag}'] = df_selected['English_pos'].apply(lambda row: nltk.FreqDist(tag for (word, tag) in row if tag==u_tag)[u_tag])

In [14]:
#Stuttgart, Tübingen Tagset
#https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.635.8431&rep=rep1&type=pdf
#get German Tags
tags_german=['ADJA','ADJD','ADV','APPR','APPRART','APPO','APZR','ART','CARD','FM','ITJ','KOUI','KOUS','KON','KOKOM','NN','NE','PDS','PDAT','PIS','PIAT','PIDAT','PPER','PPOSS','PPOSAT','PRELS','PRELAT','PRF','PWS','PWAT','PWAV','PAV','PTKZU','PTKNEG','PTKVZ','PTKANT','PTKA','TRUNC','VVFIN','VVIMP','VVINF','VVIZU','VVPP','VAFIN','VAIMP','VAINF','VAPP','VMFIN','VMINF','VMPP','XY']

#one hot encoding of german tags
for u_tag in tags_german:
    df_selected[f'ger_{u_tag}'] = df_selected['German_pos'].apply(lambda row: nltk.FreqDist(tag for (word, tag) in row if tag==u_tag)[u_tag])



In [16]:
#combine the different german tag categories into the universal tags
df_selected['ger_ADJ']=df_selected['ger_ADJA']+df_selected['ger_ADJD']
df_selected['ger_ADP']=df_selected['ger_APPR']+df_selected['ger_APPRART']+df_selected['ger_APPO']+df_selected['ger_APZR']
df_selected['ger_CONJ']=df_selected['ger_KOUI']+df_selected['ger_KOUS']+df_selected['ger_KON']+df_selected['ger_KOKOM']
df_selected['ger_DET']=df_selected['ger_ART']
df_selected['ger_NOUN']=df_selected['ger_NN']+df_selected['ger_NE']
df_selected['ger_NUM']=df_selected['ger_CARD']
df_selected['ger_PRT']=df_selected['ger_PTKZU']+df_selected['ger_PTKNEG']+df_selected['ger_PTKVZ']+df_selected['ger_PTKANT']+df_selected['ger_PTKA']
df_selected['ger_PRON']=df_selected['ger_PDS']+df_selected['ger_PDAT']+df_selected['ger_PIS']+df_selected['ger_PIAT']+df_selected['ger_PIDAT']+df_selected['ger_PPER']+df_selected['ger_PPOSS']+df_selected['ger_PPOSAT']+df_selected['ger_PRELS']+df_selected['ger_PRELAT']+df_selected['ger_PRF']+df_selected['ger_PWS']+df_selected['ger_PWAT']+df_selected['ger_PWAV']+df_selected['ger_PAV']
df_selected['ger_VERB']=df_selected['ger_VVFIN']+df_selected['ger_VVIMP']+df_selected['ger_VVINF']+df_selected['ger_VVIZU']+df_selected['ger_VVPP']+df_selected['ger_VAFIN']+df_selected['ger_VAIMP']+df_selected['ger_VAINF']+df_selected['ger_VAPP']+df_selected['ger_VMFIN']+df_selected['ger_VMINF']+df_selected['ger_VMPP']
df_selected['ger_X']=df_selected['ger_FM']+df_selected['ger_ITJ']+df_selected['ger_TRUNC']+df_selected['ger_XY']

# #delete german tag columns without the Adverb cause the column name and the column itself do not get changed
# tags_german_1=['ADJA','ADJD','APPR','APPRART','APPO','APZR','ART','CARD','FM','ITJ','KOUI','KOUS','KON','KOKOM','NN','NE','PDS','PDAT','PIS','PIAT','PIDAT','PPER','PPOSS','PPOSAT','PRELS','PRELAT','PRF','PWS','PWAT','PWAV','PAV','PTKZU','PTKNEG','PTKVZ','PTKANT','PTKA','TRUNC','VVFIN','VVIMP','VVINF','VVIZU','VVPP','VAFIN','VAIMP','VAINF','VAPP','VMFIN','VMINF','VMPP','XY']
# for u_tag in tags_german_1:
#     del df_selected[f'ger_{u_tag}']

In [17]:
df_selected

Unnamed: 0,English,German,SWords_eng,SWords_ger,PM_eng,PM_ger,PM_dif_abs,PM_dif_rel,Words_eng,Words_ger,Words_dif_abs,Words_dif_rel,Words_eng_unique,Words_ger_unique,en_!,ger_!,"en_""","ger_""",en_#,ger_#,en_$,ger_$,en_%,ger_%,en_&,ger_&,en_',ger_',en_(,ger_(,en_),ger_),en_*,ger_*,en_+,ger_+,"en_,","ger_,",en_-,ger_-,en_/,ger_/,en_:,ger_:,en_;,ger_;,en_<,ger_<,en_=,ger_=,en_>,ger_>,en_?,ger_?,en_@,ger_@,en_[,ger_[,en_\,ger_\,en_],ger_],en_^,ger_^,en__,ger__,en_`,ger_`,en_{,ger_{,en_|,ger_|,en_},ger_},en_~,ger_~,en_...,ger_...,char_eng,char_eng_avg,char_ger,char_ger_avg,char_avg_abs,English_pos,German_pos,en_ADJ,en_ADP,en_ADV,en_CONJ,en_DET,en_NOUN,en_NUM,en_PRT,en_PRON,en_VERB,en_X,ger_ADV,ger_ADJ,ger_ADP,ger_CONJ,ger_DET,ger_NOUN,ger_NUM,ger_PRT,ger_PRON,ger_VERB,ger_X
0,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",2,1,0,0,0,0.0,2,2,0,0.000000,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,12.500000,37,18.500000,6.000000,"[(resumption, NOUN), (session, NOUN)]","[(Wiederaufnahme, NN), (Sitzungsperiode, NN)]",0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0
1,"[declare, resumed, session, european, parliament, adjourned, friday, 17, december, 1999, ,, would, like, wish, happy, new, year, hope, enjoyed, pleasant, festive, period, .]","[erkläre, freitag, ,, 17., dezember, unterbrochene, sitzungsperiode, europäischen, parlaments, wiederaufgenommen, ,, wünsche, nochmals, gute, jahreswechsel, hoffe, ,, schöne, ferien, .]",17,13,1,3,2,2.0,21,16,5,0.312500,21,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,209,9.952381,205,12.812500,2.860119,"[(declare, NOUN), (resumed, VERB), (session, NOUN), (european, ADJ), (parliament, NOUN), (adjourned, VERB), (friday, ADV), (17, NUM), (december, NOUN), (1999, NUM), (,, .), (would, VERB), (like, V...","[(erklären, VVFIN), (Freitag, NN), (--, $,), (17., ADJD), (Dezember, NN), (unterbrochen, ADJA), (Sitzungsperiode, NN), (europäisch, ADJA), (Parlament, NN), (wiederaufnehmen, VVPP), (--, $,), (wüns...",5,0,1,0,0,6,2,0,0,7,0,1,5,0,0,0,6,0,0,0,4,0
2,"[although, ,, seen, ,, dreaded, 'millennium, bug, ', failed, materialise, ,, still, people, number, countries, suffered, series, natural, disasters, truly, dreadful, .]","[feststellen, konnten, ,, gefürchtete, ``, millenium-bug, ``, eingetreten, ., bürger, unserer, mitgliedstaaten, opfer, schrecklichen, naturkatastrophen, geworden, .]",14,9,4,1,3,3.0,17,14,3,0.214286,17,13,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,187,11.000000,184,13.142857,2.142857,"[(although, ADP), (,, .), (seen, VERB), (,, .), (dreaded, VERB), ('millennium, NUM), (bug, NOUN), (', .), (failed, VERB), (materialise, NOUN), (,, .), (still, ADV), (people, NOUN), (number, NOUN),...","[(feststellen, VVINF), (können, VMFIN), (--, $,), (gefürchtet, ADJA), (``, FM), (Millenium-bug, NN), (``, FM), (eingetreten, ADJD), (--, $.), (Bürger, NN), (unser, PPOSAT), (Mitgliedstaat, NN), (O...",2,1,2,0,0,7,1,0,0,4,0,0,3,0,0,0,5,0,0,1,3,2
3,"[requested, debate, subject, course, next, days, ,, part-session, .]","[parlament, besteht, wunsch, aussprache, verlauf, sitzungsperiode, nächsten, tagen, .]",12,8,1,0,1,inf,7,8,1,0.142857,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76,10.857143,99,12.375000,1.517857,"[(requested, VERB), (debate, NOUN), (subject, ADJ), (course, NOUN), (next, ADJ), (days, NOUN), (,, .), (part-session, NOUN), (., .)]","[(Parlament, NN), (bestehen, VVPP), (Wunsch, NN), (aussprechen, VVFIN), (Verlauf, NN), (Sitzungsperiode, NN), (nächster, ADJA), (Tag, NN), (--, $.)]",2,0,0,0,0,4,0,0,0,1,0,0,1,0,0,0,5,0,0,0,2,0
4,"[meantime, ,, like, observe, minute, ', silence, ,, number, members, requested, ,, behalf, victims, concerned, ,, particularly, terrible, storms, ,, various, countries, european, union, .]","[heute, möchte, bitten, -, wunsch, kolleginnen, kollegen, -, ,, opfern, stürme, ,, insbesondere, verschiedenen, ländern, europäischen, union, ,, schweigeminute, gedenken, .]",22,16,6,5,1,0.2,18,15,3,0.200000,18,15,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,203,11.277778,185,12.333333,1.055556,"[(meantime, ADV), (,, .), (like, ADP), (observe, VERB), (minute, NOUN), (', .), (silence, NOUN), (,, .), (number, NOUN), (members, NOUN), (requested, VERB), (,, .), (behalf, NOUN), (victims, NOUN)...","[(heute, ADV), (möchten, VMFIN), (bitten, VVINF), (--, $(), (Wunsch, NN), (Kollegin, NN), (Kollege, NN), (--, $(), (--, $,), (Opfer, NN), (stürmen, VVFIN), (--, $,), (insbesondere, ADV), (verschie...",3,1,2,0,0,9,0,0,0,3,0,2,2,0,0,0,8,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,"[example, ,, accept, new, roads, accept, new, pollution, within, european, union, ., goes, totally, policies, proposing, fight, greenhouse, gas, emissions, ,, example, .]","[beispiel, neue, straßen, akzeptiert, ,, akzeptiert, neue, immissionen, europäischen, union, ,, widerspricht, völlig, politik, ,, z, ., b., bekämpfung, emission, treibhausgasen, vorschlagen, .]",13,15,2,3,1,0.5,19,18,1,0.055556,16,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,10.315789,214,11.888889,1.573099,"[(example, NOUN), (,, .), (accept, VERB), (new, ADJ), (roads, NOUN), (accept, VERB), (new, ADJ), (pollution, NOUN), (within, ADP), (european, ADJ), (union, NOUN), (., .), (goes, VERB), (totally, A...","[(Beispiel, NN), (neu, ADJA), (Straße, NN), (akzeptieren, VVFIN), (--, $,), (akzeptieren, VVFIN), (neu, ADJA), (Immission, NN), (europäisch, ADJA), (Union, NN), (--, $,), (widersprechen, VVFIN), (...",3,1,1,0,0,10,0,0,0,4,0,0,4,0,0,0,11,0,0,0,3,0
9997,"[would, also, ask, commission, ensure, environmental, pillar, eu, ', policy, integrated, projects, submitted, goal, reducing, co2, example, forms, experimental, added, value, projects, proposed, .]","[fordere, ,, kommission, projektauswahl, darauf, achtet, ,, umweltpolitische, säule, eu-politik, projekte, integriert, ,, z, ., b., ziel, co2-reduktion, spezieller, mehrwert, vorgeschlagenen, proj...",19,19,1,3,2,2.0,22,19,3,0.157895,21,19,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,10.681818,244,12.842105,2.160287,"[(would, VERB), (also, ADV), (ask, VERB), (commission, NOUN), (ensure, VERB), (environmental, ADJ), (pillar, ADJ), (eu, NOUN), (', PRT), (policy, NOUN), (integrated, VERB), (projects, NOUN), (subm...","[(fordern, VVFIN), (--, $,), (Kommission, NN), (Projektauswahl, NN), (darauf, PROAV), (achten, VVFIN), (--, $,), (umweltpolitisch, ADJA), (Säule, NN), (Eu-politik, NN), (Projekt, NN), (integrieren...",4,0,1,0,0,9,0,1,0,8,0,0,3,0,0,0,11,0,0,0,4,0
9998,"[particularly, thinking, respect, certain, infrastructures, construction, .]","[denke, dabei, besonders, bestimmte, planung, befindliche, infrastrukturen, .]",8,4,0,0,0,0.0,6,7,1,0.166667,6,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,14.166667,89,12.714286,-1.452381,"[(particularly, ADV), (thinking, VERB), (respect, NOUN), (certain, ADJ), (infrastructures, NOUN), (construction, NOUN), (., .)]","[(denken, VVFIN), (dabei, PROAV), (besonders, ADV), (bestimmen, VVFIN), (Planung, NN), (befindlich, ADJA), (Infrastruktur, NN), (--, $.)]",1,0,1,0,0,3,0,0,0,1,0,1,1,0,0,0,2,0,0,0,2,0
9999,"[know, member, states, still, wavering, ,, example, ,, rail, road, routes, vulnerable, areas, pyrenees, ,, aspe, valley, .]","[bekanntlich, mitgliedstaaten, bahn, straße, durchquerung, anfälligen, gebieten, pyrenäen, speziell, aspe-tals, entschieden, .]",15,15,3,0,3,inf,14,11,3,0.272727,14,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,9.928571,146,13.272727,3.344156,"[(know, ADJ), (member, NOUN), (states, NOUN), (still, ADV), (wavering, VERB), (,, .), (example, NOUN), (,, .), (rail, VERB), (road, NOUN), (routes, NOUN), (vulnerable, ADJ), (areas, NOUN), (pyrene...","[(bekanntlich, ADV), (Mitgliedstaat, NN), (Bahn, NN), (Straße, NN), (Durchquerung, NN), (anfällig, ADJA), (Gebiet, NN), (Pyrenäen, NE), (speziell, ADJD), (aspe-tals, ADV), (entschieden, ADJD), (--...",3,0,1,0,0,8,0,0,0,2,0,2,3,0,0,0,6,0,0,0,0,0


In [None]:

# ##show problems of question and exclamations marks
# pd.set_option('display.max_colwidth', 500)
# df[2476:2477]
# df[2839:2840]
# df[3091:3092]
