In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import Classification.config as cfg
import csv
import nltk
import re
import random
import warnings
import spacy
from nltk.util import ngrams
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
import pickle
from ClassifierBasedGermanTagger import ClassifierBasedGermanTagger


In [74]:
def clean(string):
    #return string
    clean_string = string.replace(u'\xa0', u' ')
    clean_string = re.sub(r'\d+', 'number', clean_string)
    clean_string = re.sub(r'Ä', 'Ae', clean_string)
    clean_string = re.sub(r'ä', 'ae', clean_string)
    clean_string = re.sub(r'Ö', 'Oe', clean_string)
    clean_string = re.sub(r'ö', 'oe', clean_string)
    clean_string = re.sub(r'Ü', 'Ue', clean_string)
    clean_string = re.sub(r'ü', 'ue', clean_string)
    clean_string = re.sub(r'ß', 'ss', clean_string)
    clean_string = re.sub(r'°', 'Grad', clean_string)
    clean_string = re.sub(r'[Zz][Bb]', 'zum Beispiel', clean_string)
    clean_string = re.sub(r'[Dd][Hh]', 'das heißt', clean_string)
    clean_string = re.sub(r'[Bb][Ss][Pp][Ww]', 'beispielsweise', clean_string)
    clean_string = re.sub(r'[Hh]allo', '', clean_string)
    clean_string = re.sub(r'[Hh]i', '', clean_string)
    clean_string = re.sub(r'[Hh]ey', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Mm]orgen', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Aa]bend', '', clean_string)
    
    clean_string = re.sub(r'(\([^)]*\))', ' ', clean_string)
    clean_string = re.sub(r'"', '', clean_string)
    clean_string = re.sub(r'\+', '', clean_string)
    clean_string = re.sub(r'-', '', clean_string)
    clean_string = re.sub(r',', '', clean_string)

    clean_string = re.sub(r'\'', '', clean_string)
    clean_string = re.sub(r'\.', '', clean_string)
    clean_string = re.sub(r'\s{2,}', ' ', clean_string)
    clean_string = re.sub(r'\s(?=\?)', ' ', clean_string)
    clean_string = re.sub(r'\?*(?=(?:\?))', '', clean_string)
    clean_string = clean_string.strip()
    try:
        clean_string = clean_string.split(' ', 1)[0].capitalize() + ' ' + clean_string.split(' ', 1)[1]
    except:
        pass
    #bitte, danke, und, eigentlich, überhaupt, git, wirklich
    return clean_string#.lower()

In [65]:
nlp = spacy.load('de')

def lemmatizer(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

In [75]:
data_set = []
for file in cfg.ALL_FILES:
    reader = csv.reader(open(file, 'r'), delimiter=';')
    for line in reader:
        try:
            data_set.append([lemmatizer(clean(line[0])), line[1]])
        except Exception as e:
            print(e)
            print(line)
    #print(file)
    #print(len(data_set))

In [76]:
words_bag = set(word for passage in data_set for word in nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]))
ngrams_bag = set(gram for passage in data_set for gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]), 2))

In [77]:
print('Length dataset:', len(data_set))
print('Size bag of words:', len(words_bag))
print('Size bag of ngrams:', len(ngrams_bag))

Length dataset: 5012
Size bag of words: 6305
Size bag of ngrams: 19905


In [78]:
data_set_frame = pd.DataFrame(data_set)
data_set_frame['PosTags'] = ''

In [79]:
for i, line in enumerate(data_set_frame[0]):
    sent = nltk.tokenize.WordPunctTokenizer().tokenize(line)
    tag_line = []
    for tag in tagger.tag(sent):
        tag_line.append(tag[1])
    data_set_frame.at[i, 'PosTags'] = tag_line

0       [PPOSAT, ART, ADJA, APPR, APPR, ART, ADJA, NN,...
1           [VMFIN, ART, ADJA, ADJA, ADV, ADV, VVINF, $.]
2       [PPOSAT, ADV, PROAV, ADV, ART, NN, NN, VVINF, $.]
3       [VVFIN, ART, NN, APPR, ART, NN, KON, APPR, ART...
4                      [PWAV, ADJA, ADJA, ART, TRUNC, $.]
5       [VAFIN, PIS, NN, KOKOM, ADJD, ART, NN, VVFIN, ...
6                        [PWAV, ADJD, VAINF, ART, NN, $.]
7       [PWAV, PPOSAT, ADV, ART, ADJD, NN, ART, NN, AP...
8                       [PPOSAT, ART, NN, APPR, ADJA, $.]
9       [VMFIN, ADV, ART, NN, ADV, APPR, ART, NN, APPR...
10               [PWS, VVFIN, KOUS, ART, ADJA, VAINF, $.]
11                     [VMFIN, PPER, ART, ADV, VVFIN, $.]
12                 [PWAV, ADJD, VAINF, ART, NN, ADJD, $.]
13                                [VAFIN, PIS, KOKOM, $.]
14                 [PPOSAT, ART, NN, NE, VVPP, VAINF, $.]
15        [PWAV, ADJD, VAINF, ART, NN, ADJD, ART, NN, $.]
16      [PWAV, VMFIN, ART, PPOSAT, ART, PPER, NN, APPR...
17            

In [80]:
with open('Classification\\nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)

In [84]:
tag_bag = set(tag for index, row in data_set_frame.iterrows() for tag in row['PosTags'])
feature_set_pos= [([(tag in row['PosTags']) for tag in tag_bag], row[1]) for item, row in data_set_frame.iterrows()]

In [92]:
bag_frame_pos = pd.DataFrame(feature_set_pos)
x_pos, y_pos = bag_frame_pos.iloc[:,:-1],bag_frame_pos.iloc[:,-1]
x_frame_pos = pd.DataFrame(x_pos[0].tolist(), columns = tag_bag)
x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(x_frame_pos, y_pos, test_size=0.2)

In [94]:
xg_class_pos = xgb.XGBClassifier(max_depth=6, n_estimators=125, learning_rate=0.125, min_child_weight = 1, njobs=4)
xg_class_pos.fit(x_pos_train, y_pos_train)
preds = xg_class_pos.predict(x_pos_test)
accuracy_score(preds, y_pos_test)

0.8953140578265204

In [98]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.1,0.125, 1.5],
 'n_estimators':[100, 125, 150],
 'max_depth': [5, 6, 7]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.1,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_pos_train), np.array(y_pos_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.90895, std: 0.01659, params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100},
  mean: 0.90920, std: 0.01504, params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 125},
  mean: 0.90970, std: 0.01514, params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150},
  mean: 0.91094, std: 0.01250, params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100},
  mean: 0.91020, std: 0.01331, params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 125},
  mean: 0.91020, std: 0.01231, params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150},
  mean: 0.90920, std: 0.01129, params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100},
  mean: 0.90870, std: 0.01060, params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 125},
  mean: 0.90695, std: 0.01219, params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150},
  mean: 0.91194, std: 0.01447, params: {'learning_rate': 0.125, 'max_depth': 5, 'n_estimators': 100}

In [None]:
tag_ngrams_bag = set(gram for index, row in data_set_frame.iterrows() for gram in ngrams(row['PosTags'], 2))
feature_set_pos_ngrams = [([(gram in ngrams(row['PosTags'], 2)) for tag in tag_ngrams_bag], row[1])for item, row in data_set_frame.iterrows()]