# XGBoost Classifier

### 1. Module Import
***

In [3]:
import xgboost as xgb
import pandas as pd
import numpy as np
import Classification.config as cfg
import csv
import nltk
import re
import random
import warnings
import spacy
import pickle
import time
import operator
from nltk.util import ngrams
from copy import deepcopy
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 100)

SyntaxError: invalid syntax (<ipython-input-3-f93c9c091675>, line 4)


### 2.1 Defining the string cleaner

In [2]:
def clean(string):
    #return string
    clean_string = string.replace(u'\xa0', u' ')
    clean_string = re.sub(r'\d+', ' number ', clean_string)
    clean_string = re.sub(r'\n', ' ', clean_string)
    clean_string = re.sub(r'Ä', 'Ae', clean_string)
    clean_string = re.sub(r'ä', 'ae', clean_string)
    clean_string = re.sub(r'Ö', 'Oe', clean_string)
    clean_string = re.sub(r'ö', 'oe', clean_string)
    clean_string = re.sub(r'Ü', 'Ue', clean_string)
    clean_string = re.sub(r'ü', 'ue', clean_string)
    clean_string = re.sub(r'ß', 'ss', clean_string)
    clean_string = re.sub(r'°', ' Grad ', clean_string)
    clean_string = re.sub(r'24/7', 'immer', clean_string)
    clean_string = re.sub(r'/', ' ', clean_string)
    clean_string = re.sub(r'%', ' Prozent ', clean_string)
    clean_string = re.sub(r'[Zz][Bb]', 'zum Beispiel', clean_string)
    clean_string = re.sub(r'[Dd][Hh]', 'das heißt', clean_string)
    clean_string = re.sub(r'[Bb][Ss][Pp][Ww]', 'beispielsweise', clean_string)
    clean_string = re.sub(r'[Hh]allo', '', clean_string)
    clean_string = re.sub(r'[Hh]i', '', clean_string)
    clean_string = re.sub(r'[Hh]ey', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Mm]orgen', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Aa]bend', '', clean_string)
    
    clean_string = re.sub(r'(\([^)]*\))', ' ', clean_string)
    clean_string = re.sub(r'"', '', clean_string)
    clean_string = re.sub(r'\+', '', clean_string)
    clean_string = re.sub(r'-', '', clean_string)
    clean_string = re.sub(r',', ' ', clean_string)
    clean_string = re.sub(r'\^', '', clean_string)
    clean_string = re.sub(r'\'', '', clean_string)
    clean_string = re.sub(r'`', '', clean_string)
    clean_string = re.sub(r'´', '', clean_string)


    clean_string = re.sub(r'\'', '', clean_string)
    clean_string = re.sub(r'\.', '', clean_string)
    clean_string = re.sub(r'\s{2,}', ' ', clean_string)
    clean_string = re.sub(r'\s(?=\?)', ' ', clean_string)
    clean_string = re.sub(r'\?*(?=(?:\?))', '', clean_string)
    clean_string = clean_string.strip()
    #bitte, danke, und, eigentlich, überhaupt, git, wirklich
    return clean_string#.lower()


### 2.2 Building the lemmatizer

In [3]:
nlp = spacy.load('de')

def lemmatizer(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)


### 3.1 Importing the dataset

In [4]:
start_time = time.time()

data_set = []
data_set_index = pd.DataFrame(columns=['File Name', 'Start', 'End', 'Size'])
start_index = 0

for i, file in enumerate(cfg.ALL_FILES):
    file_count = len(cfg.ALL_FILES)
    reader = csv.reader(open(file, 'r'), delimiter=';')

    for line in reader:
        try:
            data_set.append([lemmatizer(clean(line[0])).lower(), line[1]])
        except Exception as e:
            print(e)
    
    print(('({0}/{1} Imported file {2}. Total length: {3})'.format(i + 1, file_count, file, len(data_set))), end='\r')
    
    data_set_index.loc[i] = [file, start_index, len(data_set) - 1, len(data_set) - 1 - start_index]
    start_index = len(data_set)

print('\n\nImport finished. Total length of data set: {}'.format(len(data_set)))

data_set_frame = pd.DataFrame(data_set, columns =['Feature', 'Label'])

print('Time taken:', time.time() - start_time)

(30/30 Imported file C:\Users\Josef\PycharmProjects\QC-Yes-No\Corpus\Tweets\questions\output_1000_tweets_2018-09-22_17-18-48\classified.csv. Total length: 5011)

Import finished. Total length of data set: 5011
Time taken: 35.0461630821228



### 3.2 Creating PosTag feature

In [5]:
start_time = time.time()

with open('Classification\\nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)
    
data_set_frame['PosTags'] = ''

for i, line in enumerate(data_set_frame['Feature']):
    sent = nltk.tokenize.WordPunctTokenizer().tokenize(line)
    tag_line = []
    for tag in tagger.tag(sent):
        tag_line.append(tag[1])
    data_set_frame.at[i, 'PosTags'] = tag_line
    #print(('Tagged line {}'.format(i)), end='\r')

print('Time taken:', time.time() - start_time)

Time taken: 41.44835591316223



### 4. Building the feature bags

In [6]:
start_time = time.time()
words_bag = set(word for passage in data_set for word in nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]))
print('Created bag of words. Amount of words: {0}'.format(len(words_bag)))
ngrams_bag = set(gram for passage in data_set for gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]), 2))
print('Created bag of ngrams. Amount of ngrams: {0}'.format(len(ngrams_bag)))
tags_bag = set(tag for index, row in data_set_frame.iterrows() for tag in row['PosTags'])
print('Created bag of tags. Amount of tags: {0}'.format(len(tags_bag)))
print('Time taken:', time.time() - start_time)

Created bag of words. Amount of words: 5960
Created bag of ngrams. Amount of ngrams: 19582
Created bag of tags. Amount of tags: 45
Time taken: 0.3869509696960449


### 4.1 Building the evaluation function

In [7]:
with open('checked_list.pickle', 'rb') as f:
    checked_rows = pickle.load(f)

In [8]:
start_time = time.time()
def evaluate(test_frame, predictions):
    #test_frame = pd.DataFrame(y_com_test)
    test_frame_local = deepcopy(test_frame)
    test_frame_local.columns = ['Label']
    test_frame_local['Prediction'] = preds
    false_rows = []
    
    def find_dataset_location(index):
        found = data_set_index.loc[(data_set_index['Start'] <= index) & (data_set_index['End'] >= index)]
        found_info = found['File Name'].item().split('\\Corpus\\')[1]
        found_type, found_file = found_info.split('\\')[0], found_info.split('\\')[2]
        index_in_file = index - found['Start'].item() + 1
        return found_type, found_file, index_in_file
    
    
    for index, row in test_frame_local.iterrows():
        i=1
        if row['Prediction'] != row['Label']:
            i+=1
            loc = find_dataset_location(index)
            false_rows.append([index, data_set[index][0], row['Label'], row['Prediction'], loc[0], loc[1], loc[2]])
            
    false_rows.sort(key=operator.itemgetter(0))
    evaluation = pd.DataFrame(false_rows, columns=['Idx', 'Question', 'Label', 'Pred', 'Type', 'File', 'Line in File'])
    display('Found {0} possibly wrongly labeled questions.'.format(len(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])))
    display(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])
    checked_rows.update(evaluation['Idx'].values)

print('Time taken:', time.time() - start_time)

Time taken: 0.0



### 5.1 Creating words based feature set

In [11]:
start_time = time.time()
fs_words= [([(word in nltk.tokenize.WordPunctTokenizer().tokenize(data[0])) for word in words_bag], data[1]) for data in data_set]
bag_words = pd.DataFrame(fs_words)
print('Time taken:', time.time() - start_time)

Time taken: 141.10128474235535


### 5.2 Creating data points for training and evaluation

In [12]:
start_time = time.time()
x_word, y_word = bag_words.iloc[:,:-1],bag_words.iloc[:,-1]
x_frame_word = pd.DataFrame(x_word[0].tolist(), columns = words_bag)
x_word_train, x_word_test, y_word_train, y_word_test = train_test_split(x_frame_word, y_word, test_size=0.2)
print('Time taken:', time.time() - start_time)

Time taken: 1.4101691246032715


### 5.3 Training and evaluating the model

####          Naive Bayes

In [18]:
classifier = nltk.NaiveBayesClassifier.fit(x_train, y_train)
classifier.show_most_informative_features()
print('Naive Bayes: \n', nltk.classify.accuracy(classifier, test_set))

AttributeError: type object 'NaiveBayesClassifier' has no attribute 'fit'

#### Support Vector Machine

#### Decision Tree

#### k-nearest-Neigbors

#### XGBoost

In [13]:
start_time = time.time()

xg_class_word = xgb.XGBClassifier(max_depth=5, n_estimators=100, learning_rate=0.125, min_child_weight = 1, njobs=-1)
xg_class_word.fit(x_word_train, y_word_train)
preds = xg_class_word.predict(x_word_test)

print('Time taken:', time.time() - start_time)

display('Accuracy:', accuracy_score(preds, y_word_test))

Time taken: 80.75872254371643


0.9501495513459621

### 5.4 Evaluating possible problematic questions

In [17]:
evaluate(pd.DataFrame(y_word_test), preds)

'Found 9 possibly wrongly labeled questions.'

Unnamed: 0,Idx,Question,Label,Pred,Type,File,Line in File
3,1261,schaut der laptop aus wie auf der bild ?,1,0,Amazon,output_products_part_11,61
11,2188,haben mein auch ein schoenen sonnenauf oder untergang fotografieren ?,1,0,Tweets,output_1000_tweets_2018-08-21_17-02-43,204
17,2667,seh ich aus wie einen auto ?,1,0,Tweets,output_1000_tweets_2018-08-22_16-25-32,116
18,2710,weiss noch jemand was ich damit vor haben ?,1,0,Tweets,output_1000_tweets_2018-08-22_16-25-32,159
31,3740,fuehrt mein eigentlich einen tagebuch oder so etwas aehnliches ?,1,0,Tweets,output_1000_tweets_2018-09-10_23-44-03,11
33,3747,wissen mein mutti oder der ex davon ?,1,0,Tweets,output_1000_tweets_2018-09-10_23-44-03,18
37,4190,ich haben was gut zu tun als sich abholen ?,1,0,Tweets,output_1000_tweets_2018-09-11_13-29-40,223
38,4252,gibst du bescheid wann ich mit druecken aufhoeren können ?,1,0,Tweets,output_1000_tweets_2018-09-13_13-50-33,53
42,4473,oder sollen ich sich vielleicht gar nicht so nennen ?,1,0,Tweets,output_1000_tweets_2018-09-18_01-29-15,96


In [None]:
len(checked_rows)

In [None]:
with open('checked_list.pickle', 'wb') as f:
    pickle.dump(checked_rows, f, protocol=2)


### 6.1 Creating ngrams of words based feature set

In [None]:
fs_ngrams = [([(gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(data[0]), 2)) for gram in ngrams_bag], data[1]) for data in data_set]
bag_ngram = pd.DataFrame(fs_ngrams)

### 6.2 Creating data points for training and evaluation

In [None]:
x_ngram, y_ngram = bag_ngram.iloc[:,:-1],bag_ngram.iloc[:,-1]
x_frame_ngram = pd.DataFrame(x_ngram[0].tolist(), columns = ngrams_bag)
x_ngram_train, x_ngram_test, y_ngram_train, y_ngram_test = train_test_split(x_frame_ngram, y_ngram, test_size=0.2)

### 6.3 Training and evaluating the model

In [None]:
xg_class_ngram = xgb.XGBClassifier(max_depth=6, n_estimators=200, learning_rate=0.15, min_child_weight = 1, njobs=4)
xg_class_ngram.fit(x_ngram_train, y_ngram_train)
preds = xg_class_ngram.predict(x_ngram_test)
accuracy_score(preds, y_ngram_test)

### 7.1 Creating postags based feature set

In [None]:
with open('Classification\\nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)
    
data_set_frame = pd.DataFrame(data_set)
data_set_frame['PosTags'] = ''
for i, line in enumerate(data_set_frame[0]):
    sent = nltk.tokenize.WordPunctTokenizer().tokenize(line)
    tag_line = []
    for tag in tagger.tag(sent):
        tag_line.append(tag[1])
    data_set_frame.at[i, 'PosTags'] = tag_line

In [None]:
tag_bag = set(tag for index, row in data_set_frame.iterrows() for tag in row['PosTags'])
fs_pos= [([(tag in row['PosTags']) for tag in tag_bag], row[1]) for item, row in data_set_frame.iterrows()]

In [None]:
bag_pos = pd.DataFrame(fs_pos)
x_pos, y_pos = bag_pos.iloc[:,:-1],bag_pos.iloc[:,-1]
x_frame_pos = pd.DataFrame(x_pos[0].tolist(), columns = tag_bag)
x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(x_frame_pos, y_pos, test_size=0.2)

In [None]:
xg_class_pos = xgb.XGBClassifier(max_depth=6, n_estimators=125, learning_rate=0.125, min_child_weight = 1, njobs=4)
xg_class_pos.fit(x_pos_train, y_pos_train)
preds = xg_class_pos.predict(x_pos_test)
accuracy_score(preds, y_pos_test)

In [None]:
x_frame_com = pd.concat([x_frame_pos, x_frame_word], axis=1)
x_com_train, x_com_test, y_com_train, y_com_test = train_test_split(x_frame_com, y_pos, test_size=0.2)

In [None]:
xg_class_com = xgb.XGBClassifier(max_depth=6, n_estimators=125, learning_rate=0.125, min_child_weight = 0.5, subsample = 0.5, njobs=4)
xg_class_com.fit(x_com_train, y_com_train)
preds = xg_class_com.predict(x_com_test)
accuracy_score(preds, y_com_test)

In [None]:
test_frame = pd.DataFrame(y_com_test)
test_frame.columns=['Label']
test_frame['Prediction']=preds
for index, row in test_frame.iterrows():
    if row['Prediction'] != row['Label']:
        print(data_set[index], 'Label:', row['Label'], 'Prediction:', row['Prediction'] )

In [None]:
preds
wrong = [(index, y_com_test[index])  for index in preds if preds[index] != y_com_test[index]]

In [None]:
wrong_2 = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
count = {}
all_lists = list(pd.DataFrame(data_set)[0])
test_string = ''

for e,i in enumerate(all_lists):
    test_string += ' '
    test_string += i
    if i.split().count('?') < 1:
        print(i, e)
test_string = test_string.split()

for word in words_bag:
    count[word] = test_string.count(word)
import operator
sorted_x = sorted(count.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
frame = pd.DataFrame(sorted_x)
frame.set_index(0, inplace=True)
display(frame)

In [None]:
data_set_frame

In [None]:
from copy import deepcopy
test_frame = deepcopy(data_set_frame)
test_frame['Feature_min'] = ''
for index, row in test_frame.iterrows():
    for word in row[0].rsplit():
        if frame[1][word] > 10:
            test_frame.at[index, 'Feature_min'] += (word + ' ')
        test_frame.at[index, 'Feature_min'] = test_frame.at[index, 'Feature_min']
    print(('Checked row: {}'.format(index)), end='\r')

In [None]:
words_bag_min = set(word for word in words_bag if frame[1][word] > 50)


In [None]:
test_frame

In [None]:
fs_words_min= [([(word in data[2].split()) for word in words_bag_min], data[1]) for index, data in test_frame.iterrows()]
bag_words_min = pd.DataFrame(fs_words_min)

In [None]:
x_word_min, y_word_min = bag_words_min.iloc[:,:-1],bag_words_min.iloc[:,-1]
x_frame_word_min = pd.DataFrame(x_word_min[0].tolist(), columns = words_bag_min)
x_word_min_train, x_word_min_test, y_word_min_train, y_word_min_test = train_test_split(x_frame_word_min, y_word_min, test_size=0.2)

In [None]:
xg_class_word_min = xgb.XGBClassifier(max_depth=7, n_estimators=100, learning_rate=0.125, min_child_weight = 0.4, subsample=0.7, njobs=4)
xg_class_word_min.fit(x_word_min_train, y_word_min_train)
preds = xg_class_word_min.predict(x_word_min_test)
accuracy_score(preds, y_word_min_test)

In [None]:
bag_words

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.125, 1.5],
 'n_estimators':[75, 100],
 'max_depth': [5, 6, 7],
 'min_child_weight': [0.5, 1],
 'subsample': [0.5, 1]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.125,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_word_min_train), np.array(y_word_min_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(xg_class_word, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
#fig, ax = plt.subplots(figsize=(30, 30))
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(xg_class_word, ax=ax)
plt.show()

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.125, 1.5],
 'n_estimators':[75, 100, 125],
 'max_depth': [4, 5, 6]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.1,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_word_train), np.array(y_word_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:


x, y = bag_ngram.iloc[:,:-1],bag_ngram.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = ngrams_bag)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2)#, random_state=20)

xg_class = xgb.XGBClassifier(max_depth=6, n_estimators=100, learning_rate=0.1, min_child_weight = 1, njobs=4)

In [None]:
xg_class.fit(x_train, y_train)
preds = xg_class.predict(x_test)
accuracy_score(preds, y_test)

In [None]:
fs_ngram = [([(gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(data[0]), 2)) for gram in ngrams_bag], data[1]) for data in data_set]

In [None]:
bag_ngram = pd.DataFrame(fs_ngram)

x, y = bag_ngram.iloc[:,:-1],bag_ngram.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = ngrams_bag)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2)#, random_state=20)

xg_class = xgb.XGBClassifier(max_depth=7, n_estimators=400, learning_rate=0.1, min_child_weight = 1, njobs=4)

In [None]:
xg_class.fit(x_train, y_train)
preds = xg_class.predict(x_test)
accuracy_score(preds, y_test)

In [None]:
bag_encoded = pd.DataFrame(fs_words)

x, y = bag_encoded.iloc[:,:-1],bag_encoded.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = words_bag)

from sklearn.model_selection import train_test_split
accuracy = []
for i in range(15):
    xg_class = xgb.XGBClassifier(max_depth=5, n_estimators=100, learning_rate=0.125, min_child_weight = 1, njobs=4)
    x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2, random_state=i)
    xg_class.fit(x_train, y_train)
    preds = xg_class.predict(x_test)
    accuracy.append(accuracy_score(preds, y_test))

In [None]:
sum(accuracy) / len(accuracy)

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.05, 0.1, 0.15],
 'n_estimators':[75, 100, 125],
 'max_depth': [6, 7],
 'min_child_weight': [1, 2]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.1,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_train), np.array(y_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
test['Pred']=preds

In [None]:
depth = [3,5, 10]
esti = [50, 100, 300, 1000]
learning_rate = [0.05, 0.2, 0.4]

for d in depth:
    for e in esti:
        for l in learning_rate:
            xg_class = xgb.XGBClassifier(max_depth=d, n_estimators=e, learning_rate=l)
            xg_class.fit(x_train, y_train)
            preds = xg_class.predict(x_test)
            print('Depth:', d, 'Estimators:', e, 'Learning Rate:', l, 'Accuracy', accuracy_score(preds, y_test))

In [None]:
len(preds)*937799043062201

In [None]:
wrong = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
wrong_2 = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
len(wrong_2)

In [None]:
for i, pred in wrong:
    print(i, data_set[i], 'pred:', pred)