In [3]:
import ast
import string

import numpy as np
import pandas as pd
import nltk
from nltk import StanfordTagger
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from HanTa import HanoverTagger as ht

from src.data.clean_data import data_cleansing
from src.data.import_data import import_data


#### Load data into a DataFrame

In [None]:
# Load the data set through the project's import_data helper.
df = import_data()

#### Data Cleansing: whitespace tokenization, lowercasing, stop-word removal

In [None]:
# Take the first 10,000 rows (in index order) as a working sample for testing and
# feature development.
# - .iloc is positional and end-exclusive, so this yields exactly 10,000 rows; the
#   label-based .loc[0:10000] is end-inclusive and returns one row more on a 0-based index.
# - .copy() makes the sample an independent frame, so the column assignments in the
#   feature cells below neither raise SettingWithCopyWarning nor touch `df`.
df_selected = df.sort_index().iloc[:10000].copy()

In [None]:
# Widen text columns to 200 characters so long sentences stay readable,
# then preview the first 100 rows of the sample.
pd.options.display.max_colwidth = 200
df_selected.head(100)

In [None]:
# Run the project's data_cleansing step on the sample (per the section header:
# tokenization, lowercasing and stop-word removal) and keep what it returns.
df_sel = data_cleansing(df_selected)

In [None]:
# Display the frame returned by data_cleansing for a visual check.
df_sel

#### Feature Generation

In [None]:
# Punctuation-mark and word-count features for each English/German sentence pair.
# `string` is imported in the notebook's import cell.


def _count_punctuation(tokens):
    """Count the items of ``tokens`` that are found in ``string.punctuation``.

    ``tokens`` is one cell of the 'English'/'German' column (an iterable of strings;
    presumably the token list of one sentence -- TODO confirm against data_cleansing).
    """
    return sum(1 for token in tokens if token in string.punctuation)


def _count_words(tokens):
    """Count the items of ``tokens`` that are NOT found in ``string.punctuation``."""
    return sum(1 for token in tokens if token not in string.punctuation)


def _relative_difference(abs_diff, left, right):
    """Return ``abs_diff`` relative to the smaller of the two counts.

    The base is clamped to at least 1. For non-negative integer counts this leaves
    every previously finite result unchanged (0/0 still ends up as 0), but a pair
    where one side has count 0 and the other does not now yields the absolute
    difference instead of ``inf``, which the NaN replacement never caught.
    """
    base = np.maximum(np.minimum(left, right), 1)
    # Keep the original "NaN -> 0" rule for any remaining missing values.
    return (abs_diff / base).fillna(0)


# Number of punctuation marks per sentence.
df_selected['PM_eng'] = df_selected['English'].apply(_count_punctuation)
df_selected['PM_ger'] = df_selected['German'].apply(_count_punctuation)
# Absolute and relative (smaller count as base) difference of punctuation marks.
df_selected['PM_dif_abs'] = np.abs(df_selected['PM_eng'] - df_selected['PM_ger'])
df_selected['PM_dif_rel'] = _relative_difference(
    df_selected['PM_dif_abs'], df_selected['PM_eng'], df_selected['PM_ger']
)

# Number of words (non-punctuation tokens) per sentence.
df_selected['Words_eng'] = df_selected['English'].apply(_count_words)
df_selected['Words_ger'] = df_selected['German'].apply(_count_words)
# Absolute and relative (smaller count as base) difference of word counts.
df_selected['Words_dif_abs'] = np.abs(df_selected['Words_eng'] - df_selected['Words_ger'])
df_selected['Words_dif_rel'] = _relative_difference(
    df_selected['Words_dif_abs'], df_selected['Words_eng'], df_selected['Words_ger']
)

In [None]:
punctuation_marks = ['?,!,']
# Count question marks and exclamation marks per sentence. Each entry is
# (target column, source column, mark); the order matches the column creation
# order ger_ques, eng_ques, ger_exc, eng_exc.
for target_column, source_column, marks in (
    ('ger_ques', 'German', '?'),
    ('eng_ques', 'English', '?'),
    ('ger_exc', 'German', '!'),
    ('eng_exc', 'English', '!'),
):
    df_selected[target_column] = df_selected[source_column].apply(
        lambda tokens: sum(1 for token in tokens if token in marks)
    )


In [None]:
# Character features: total characters in the words of each sentence and the
# average number of characters per word.


def _count_word_chars(tokens):
    """Return the summed length of the items of ``tokens`` not in ``string.punctuation``.

    Summing ``len(token)`` counts only the characters of the words themselves.
    Taking ``len(str([...]))`` of the filtered list would also count the brackets,
    quotes, commas and spaces of the list's repr.
    """
    return sum(len(token) for token in tokens if token not in string.punctuation)


df_selected['char_eng'] = df_selected['English'].apply(_count_word_chars)
df_selected['char_ger'] = df_selected['German'].apply(_count_word_chars)

# Average characters per word; rows without any word (Words_* == 0) yield NaN.
df_selected['char_eng_avg'] = df_selected['char_eng'] / df_selected['Words_eng']
df_selected['char_ger_avg'] = df_selected['char_ger'] / df_selected['Words_ger']

# Absolute difference between the two averages (non-negative, like the other
# *_dif_abs features).
df_selected['char_avg_abs'] = np.abs(df_selected['char_ger_avg'] - df_selected['char_eng_avg'])

In [None]:
# English: tag every row's 'English' entry with NLTK's POS tagger. The 'universal'
# tagset keeps only the coarse top-level categories (noun, verb, adjective, ...).
df_selected['English_pos'] = df_selected.apply(
    lambda row: nltk.pos_tag(row['English'], tagset='universal'),
    axis=1,
)

# German: NLTK's POS tagging is not supported for German yet, so a pretrained
# German model is used as the tagger instead:
# Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization.
# In: Proceedings of the 15th Conference on Natural Language Processing
# (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# Every item of a row's 'German' entry is analyzed on its own; the per-row results
# are collected in order. Later cells unpack each result as a (word, tag) pair.
ger_pos = [
    [tagger.analyze(token) for token in row.German]
    for row in df_selected.itertuples()
]
df_selected['German_pos'] = ger_pos

In [None]:
# df_selected.to_csv('Feature_Generation.csv',index=False)

In [None]:
# count=[]
# universal_pos=['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','X']
# for i in df_selected.itertuples():
#     count.append(nltk.FreqDist(tag for (word, tag) in i.German_pos).most_common())
# print(universal_pos)

In [None]:
def _parse_tag_list(cell):
    """Turn the CSV text of a POS column back into a list of tuples.

    CSV stores a list-valued cell as its repr string (e.g. "[('a', 'DET')]").
    ``ast.literal_eval`` safely rebuilds the Python literal; an empty cell becomes
    an empty list.
    """
    return ast.literal_eval(cell) if cell else []


# Reload the cached feature table. The POS columns must be parsed back into lists
# of (word, tag) tuples: read as plain strings, the tag-counting cells below would
# iterate over single characters and fail to unpack them.
# `ast` is imported in the notebook's import cell.
df_selected = pd.read_csv(
    'Feature_Generation.csv',
    converters={'English_pos': _parse_tag_list, 'German_pos': _parse_tag_list},
)

In [None]:
# Display the feature table that was just reloaded from 'Feature_Generation.csv'.
df_selected

In [None]:
# Universal POS tags to count in the English sentences.
universal_pos = [
    'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN',
    'NUM', 'PRT', 'PRON', 'VERB', 'X',
]

# One column per tag: en_<TAG> holds how often that tag occurs among the row's
# (word, tag) pairs.
for u_tag in universal_pos:
    df_selected[f'en_{u_tag}'] = df_selected['English_pos'].apply(
        lambda tagged: sum(1 for _, tag in tagged if tag == u_tag)
    )

In [None]:
# German tags follow the Stuttgart-Tübingen tagset (STTS):
# https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.635.8431&rep=rep1&type=pdf
tags_german = [
    'ADJA', 'ADJD', 'ADV',
    'APPR', 'APPRART', 'APPO', 'APZR',
    'ART', 'CARD', 'FM', 'ITJ',
    'KOUI', 'KOUS', 'KON', 'KOKOM',
    'NN', 'NE',
    'PDS', 'PDAT', 'PIS', 'PIAT', 'PIDAT', 'PPER', 'PPOSS', 'PPOSAT',
    'PRELS', 'PRELAT', 'PRF', 'PWS', 'PWAT', 'PWAV', 'PAV',
    'PTKZU', 'PTKNEG', 'PTKVZ', 'PTKANT', 'PTKA',
    'TRUNC',
    'VVFIN', 'VVIMP', 'VVINF', 'VVIZU', 'VVPP',
    'VAFIN', 'VAIMP', 'VAINF', 'VAPP',
    'VMFIN', 'VMINF', 'VMPP',
    'XY',
]

# One column per STTS tag: ger_<TAG> holds how often that tag occurs among the
# row's (word, tag) pairs.
for u_tag in tags_german:
    df_selected[f'ger_{u_tag}'] = df_selected['German_pos'].apply(
        lambda tagged: sum(1 for _, tag in tagged if tag == u_tag)
    )



In [None]:
# Collapse the fine-grained German tag counts into universal-tag counts so they
# line up with the English en_<TAG> columns. Each target column is the sum of the
# listed source columns.
# ger_ADV is not rebuilt here: the German tag 'ADV' already produced a column
# with exactly that name.
universal_from_german = {
    'ger_ADJ': ['ger_ADJA', 'ger_ADJD'],
    'ger_ADP': ['ger_APPR', 'ger_APPRART', 'ger_APPO', 'ger_APZR'],
    'ger_CONJ': ['ger_KOUI', 'ger_KOUS', 'ger_KON', 'ger_KOKOM'],
    'ger_DET': ['ger_ART'],
    'ger_NOUN': ['ger_NN', 'ger_NE'],
    'ger_NUM': ['ger_CARD'],
    'ger_PRT': ['ger_PTKZU', 'ger_PTKNEG', 'ger_PTKVZ', 'ger_PTKANT', 'ger_PTKA'],
    'ger_PRON': [
        'ger_PDS', 'ger_PDAT', 'ger_PIS', 'ger_PIAT', 'ger_PIDAT',
        'ger_PPER', 'ger_PPOSS', 'ger_PPOSAT', 'ger_PRELS', 'ger_PRELAT',
        'ger_PRF', 'ger_PWS', 'ger_PWAT', 'ger_PWAV', 'ger_PAV',
    ],
    'ger_VERB': [
        'ger_VVFIN', 'ger_VVIMP', 'ger_VVINF', 'ger_VVIZU', 'ger_VVPP',
        'ger_VAFIN', 'ger_VAIMP', 'ger_VAINF', 'ger_VAPP',
        'ger_VMFIN', 'ger_VMINF', 'ger_VMPP',
    ],
    'ger_X': ['ger_FM', 'ger_ITJ', 'ger_TRUNC', 'ger_XY'],
}

for universal_column, source_columns in universal_from_german.items():
    first_column, *remaining_columns = source_columns
    # Left-to-right addition of the source Series, starting from the first one.
    df_selected[universal_column] = sum(
        (df_selected[name] for name in remaining_columns),
        df_selected[first_column],
    )

# Optional clean-up (disabled): delete the fine-grained German tag columns. 'ADV' is
# left out of the list because that column name is shared with the universal tag.
#tags_german_1=['ADJA','ADJD','APPR','APPRART','APPO','APZR','ART','CARD','FM','ITJ','KOUI','KOUS','KON','KOKOM','NN','NE','PDS','PDAT','PIS','PIAT','PIDAT','PPER','PPOSS','PPOSAT','PRELS','PRELAT','PRF','PWS','PWAT','PWAV','PAV','PTKZU','PTKNEG','PTKVZ','PTKANT','PTKA','TRUNC','VVFIN','VVIMP','VVINF','VVIZU','VVPP','VAFIN','VAIMP','VAINF','VAPP','VMFIN','VMINF','VMPP','XY']
#for u_tag in tags_german_1:
#    del df_selected[f'ger_{u_tag}']


In [None]:
# ##show problems of question and exclamations marks
# pd.set_option('display.max_colwidth', 500)
# df[2476:2477]
# df[2839:2840]
# df[3091:3092]