# XGBoost Classifier

## 1 Module Import
***

In [2]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import Classification.config as cfg
import csv
import nltk
import re
import random
import warnings
import spacy
import pickle
import time
import operator
import gensim
import math

from copy import deepcopy
from collections import Counter

from scipy.sparse import csc_matrix
from xgboost import XGBClassifier
from ast import literal_eval
from nltk.util import ngrams
from textblob import TextBlob as tb
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from Classification.germalemma import GermaLemma
warnings.filterwarnings('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 100)




## 3 Importing the dataset
***

In [3]:
data_set = pd.read_csv("C:\\Users\\Josef\\PycharmProjects\\QC-Yes-No\\Classification\\preprocessing\\data\\data_ready.csv", sep=';').drop('Unnamed: 0', 1)
data_set['Feature'] = data_set['Feature'].apply(lambda x: x.split('#'))
data_set['PosTags'] = data_set['PosTags'].apply(lambda x: x.split('#'))

In [229]:
with open('raw.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.writer(f)
    for _,line in data_set.iterrows():
        a = ' '.join(line['Feature'][:-1])
        writer.writerow([a, line['Label']])
    f.flush()
    f.close()

***

## 4.1 Defining the Classifier functions

### k-nearest Neighbors

__Default Parameters:__
 - 'algorithm': 'auto'
 - 'leaf_size': 30
 - 'metric': 'minkowski'
 - 'metric_params': None
 - 'n_jobs': 1
 - 'n_neighbors': 5
 - 'p': 2
 - 'weights': 'uniform'

In [5]:
def k_nearest(X_train, X_test, y_train, y_test, parameters = {}):
    kn_clf = KNeighborsClassifier(**parameters)
    kn_clf.fit(X_train, y_train)
    kn_accuracy = kn_clf.score(X_test, y_test)
    return kn_accuracy

### Naive Bayes

__Default Parameters:__
 - 'alpha': 1.0
 - 'class_prior': None
 - 'fit_prior': True

In [6]:
def naive_bayes(X_train, X_test, y_train, y_test, parameters = {}):
    nb_clf = MultinomialNB(**parameters)
    nb_clf.fit(X_train, y_train)
    nb_accuracy = nb_clf.score(X_test, y_test)
    return nb_accuracy

def naive_bayes_C(X_train, X_test, y_train, y_test, parameters = {}):
    nb_clf = ComplementNB(**parameters)
    nb_clf.fit(X_train, y_train)
    nb_accuracy = nb_clf.score(X_test, y_test)
    return nb_accuracy

def naive_bayes_B(X_train, X_test, y_train, y_test, parameters = {}):
    nb_clf = BernoulliNB(**parameters)
    nb_clf.fit(X_train, y_train)
    nb_accuracy = nb_clf.score(X_test, y_test)
    return nb_accuracy

### Decision Tree

__Default Parameters:__
 - 'class_weight': None
 - 'criterion': 'gini'
 - 'max_depth': None
 - 'max_features': None
 - 'max_leaf_nodes': None
 - 'min_impurity_decrease': 0.0
 - 'min_impurity_split': None
 - 'min_samples_leaf': 1
 - 'min_samples_split': 2
 - 'min_weight_fraction_leaf': 0.0
 - 'presort': False
 - 'random_state': None,
 - 'splitter': 'best'

featureparams = {'fs_words': {}, 
               'fs_ngrams': {}, 
               'fs_pos': {'max_depth': 8, 'min_samples_leaf': 2}, 
               'fs_words_min': {'min_samples_leaf': 3}, 
               'fs_bigrams_min': {}, 
               'fs_tfidf_words': {}, 
               'fs_words_ngrams': {}, 
               'fs_words_pos': {}, 
               'fs_min_pos': {'min_samples_leaf': 3}, 
               'fs_ngrams_pos': {}, 
               'fs_words_ngrams_pos': {}, 
               'fs_w2v': {'max_depth': 8}, 
               'fs_d2v': {'max_depth': 5, 'min_samples_leaf': 3}}

In [7]:
def decision_tree(X_train, X_test, y_train, y_test, parameters = {}):
    dt_clf = DecisionTreeClassifier(**parameters)
    dt_clf.fit(X_train, y_train)
    dt_accuracy = dt_clf.score(X_test, y_test)
    return dt_accuracy

### Random Forest

{'fs_words': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 20}, 
               'fs_ngrams': {'random_state': 1, 'n_estimators': 500}, 
               'fs_pos': {'random_state': 1, 'n_estimators': 500}, 
               'fs_words_min': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 20}, 
               'fs_bigrams_min': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 10}, 
               'fs_tfidf_words': {'random_state': 1, 'n_estimators': 500}, 
               'fs_words_ngrams': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 20}, 
               'fs_words_pos': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 20}, 
               'fs_min_pos': {'random_state': 1, 'n_estimators': 500}, 
               'fs_ngrams_pos': {'random_state': 1, 'n_estimators': 500}, 
               'fs_words_ngrams_pos': {'random_state': 1, 'n_estimators': 500}, 
               'fs_w2v': {'random_state': 1, 'n_estimators': 500, 'min_samples_split': 10}, 
               'fs_d2v': {'random_state': 1, 'n_estimators': 500}
                }

__Default Parameters:__
 - 'bootstrap': True 
 - 'class_weight': None 
 - 'criterion': 'gini' 
 - 'max_depth': None 
 - 'max_features': 'auto'
 - 'max_leaf_nodes': None
 - 'min_impurity_decrease': 0.0
 - 'min_impurity_split': None
 - 'min_samples_leaf': 1
 - 'min_samples_split': 2
 - 'min_weight_fraction_leaf': 0.0
 - 'n_estimators': 10 
 - 'n_jobs': 1
 - 'oob_score': False
 - 'random_state': None
 - 'verbose': 0
 - 'warm_start': False

In [8]:
def random_forest(X_train, X_test, y_train, y_test, parameters = {}):
    rf_clf = RandomForestClassifier(**parameters)
    rf_clf.fit(X_train, y_train)
    rf_accuracy = rf_clf.score(X_test, y_test)
    return rf_accuracy

### Support Vector Machine

__Default Parameters:__
 - 'C': 1.0
 - 'cache_size': 200
 - 'class_weight': None
 - 'coef0': 0.0
 - 'decision_function_shape': 'ovr'
 - 'degree': 3
 - 'gamma': 'auto'
 - 'kernel': 'rbf'
 - 'max_iter': -1
 - 'probability': False
 - 'random_state': None
 - 'shrinking': True
 - 'tol': 0.001
 - 'verbose': False

{'fs_words': {'random_state': 1, 'C': 0.5}, 
               'fs_ngrams': {'random_state': 1, 'C': 0.75}, 
               'fs_pos': {'random_state': 1, 'C': 1}, 
               'fs_words_min': {'random_state': 1, 'C': 0.5}, 
               'fs_bigrams_min': {'random_state': 1, 'C': 0.5}, 
               'fs_tfidf_words': {'random_state': 1, 'C': 0.5}, 
               'fs_words_ngrams': {'random_state': 1, 'C': 0.75}, 
               'fs_words_pos': {'random_state': 1, 'C': 0.5}, 
               'fs_min_pos': {'random_state': 1, 'C': 0.5}, 
               'fs_ngrams_pos': {'random_state': 1, 'C': 2}, 
               'fs_words_ngrams_pos': {'random_state': 1, 'C': 1}, 
               'fs_w2v': {'random_state': 1, 'C': 2}, 
               'fs_d2v': {'random_state': 1, 'C': 2}}

In [9]:
def SVM(X_train, X_test, y_train, y_test, parameters = {}):
    svm_clf = SVC(**parameters)
    svm_clf.fit(X_train, y_train)
    svm_accuracy = svm_clf.score(X_test, y_test)
    return svm_accuracy

### XG Boost

__Default Parameters:__
 - 'base_score': 0.5
 - 'booster': 'gbtree'
 - 'colsample_bylevel': 1
 - 'colsample_bytree': 
 - 'gamma': 0
 - 'learning_rate': 0.1
 - 'max_delta_step': 0
 - 'max_depth': 3, 
 - 'min_child_weight': 1
 - 'missing': None
 - 'n_estimators': 100
 - 'nthread': 1
 - 'objective': 'binary:logistic'
 - 'reg_alpha': 0
 - 'reg_lambda': 1
 - 'scale_pos_weight': 1
 - 'seed': 0
 - 'silent': 1 
 - 'subsample': 1

In [10]:
def XG_Boost(X_train, X_test, y_train, y_test, parameters = {}):
    xg_clf = XGBClassifier(**parameters)
    xg_clf.fit(X_train, y_train)
    xg_accuracy = xg_clf.score(X_test, y_test)
    return xg_accuracy

## Multi-layer Perceptron NN

solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(50, 2), random_state=1

In [11]:
def MLP(X_train, X_test, y_train, y_test, parameters = {}):
    mlp_clf = MLPClassifier(**parameters)
    mlp_clf.fit(X_train, y_train)
    mlp_accuracy = xg_clf.score(X_test, y_test)
    return xg_accuracy

### Manual rule based classifier

In [12]:
class manual_classifier(object):
    
    def __init__(self):
        pass
    
    
            
    def get_manual_label(self, row):
        if (row.loc[row.index[0], 'PosTags'][0].startswith('V') or 'oder' in row.loc[row.index[0], 'Feature'][-2:]) \
                    and not 'oder' in row.loc[row.index[0], 'Feature'][:-2] \
                    and not 'jemand' in row.loc[row.index[0], 'Feature'][:-2] \
                    and not 'wer' in row.loc[row.index[0], 'Feature'] \
                    and not 'was' in row.loc[row.index[0], 'Feature'] \
                    and not 'welche' in row.loc[row.index[0], 'Feature'] \
                    and not 'welchem' in row.loc[row.index[0], 'Feature'] \
                    and not 'wieso' in row.loc[row.index[0], 'Feature'] \
                    and not 'wo' in row.loc[row.index[0], 'Feature'] \
                    and not 'warum' in row.loc[row.index[0], 'Feature'] \
                    and not 'wie' in row.loc[row.index[0], 'Feature'] \
                    and not 'wann' in row.loc[row.index[0], 'Feature']:
            return 1
        else:
            return 0 
    
    def classify(self, data):
        label = []
        for i in data.index.values.tolist():
            label.append(self.get_manual_label(data.loc[[i]]))
        return(label)
    
    def accuracy(self, data):
        labels = self.classify(data)
        false_positive = 0
        false_negative = 0
        total_pos = data['Label'].values.tolist().count(1)
        found_pos = labels.count(1)
        for index, value in enumerate(labels):
            if value != data.loc[index, 'Label']:
                if value == 1:
                    false_positive += 1
                else:
                    false_negative += 1
                    print(index, data.loc[index, 'Feature'], value, data.loc[index, 'Label'])

        print('Accuracy:', (len(labels) - (false_positive + false_negative)) / len(labels))
        print('False Positive: ', false_positive / len(labels))
        print('False Negative: ', false_negative / len(labels))
        print('Precision: ', (found_pos - false_positive) / found_pos)
        print('Recall: ', (found_pos - false_positive) / total_pos)
        return((len(labels) - (false_positive + false_negative)) / len(labels))

### Testing function

In [13]:
def test_all(featureset, runs=5, test_size=0.2, f_col='Feature'):
    algs =  {'k_nearest' : {'n_jobs' : -1}, 
             'naive_bayes': {}, 
             'random_forest': {'n_estimators': 100, 'n_jobs': -1, 'min_samples_split': 10},
             'decision_tree': {},
             'SVM': {'kernel' : 'linear'}, 
             'XG_Boost' : {'max_depth' : 5, 'n_estimators' : 125, 'learning_rate' : 0.1, 'min_child_weight' : 1, 'njobs' : -1}, 
             'MLP': {'solver': 'lbfgs', alpha: 1e-5, hidden_layer_sizes: (50, 2), random_state: 1}}

    for alg in algs:
        get_average_accuracy(alg, runs, featureset, test_size, algs[alg], f_col=f_col)

## 4.2 Defining the train-test split function

In [14]:
def get_train_test(feature_set, test_size=0.2, f_col='Feature', l_col='Label', random_state=None):
    X, y = pd.DataFrame(feature_set[f_col]), pd.DataFrame(feature_set[l_col])
    X = np.array(list(X[f_col]))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

## 4.3 Defining the average accuracy function

In [13]:
def get_average_accuracy(function_name, iteration_amount, data_set, test_size, parameters, f_col='Feature', l_col='Label'):
    results = []
    start_time = time.time()

    for i in range(iteration_amount):      
        X_train, X_test, y_train, y_test = get_train_test(data_set, test_size, f_col, l_col)
        results.append(globals()[function_name](X_train, X_test, y_train, y_test, parameters))
        
        print("'Evaluated {0} questions with {1}. Average accuracy: {2}. Time taken: {3}".format(i + 1, function_name, sum(results) / (i + 1), time.time() - start_time), end='\r')
    print('\n')
    return sum(results) / iteration_amount

## 4.4 Defining the problematic question evaluation function

In [None]:
def find_dataset_location(index):
        found = data_set_index.loc[(data_set_index['Start'] <= index) & (data_set_index['End'] >= index)]
        found_info = found['File Name'].item().split('\\Corpus\\')[1]
        found_type, found_file = found_info.split('\\')[0], found_info.split('\\')[2]
        index_in_file = index - found['Start'].item() + 1
        return found_type, found_file, index_in_file
    
def evaluate(test_frame, predictions):
    with open('checked_list.pickle', 'rb') as f:
        checked_rows = pickle.load(f)
    
    test_frame_local = deepcopy(test_frame)
    test_frame_local.columns = ['Label']
    test_frame_local['Prediction'] = preds
    false_rows = []
    
    
    
    
    for index, row in test_frame_local.iterrows():
        i=1
        if row['Prediction'] != row['Label']:
            i+=1
            loc = find_dataset_location(index)
            false_rows.append([index, data_set[index][0], row['Label'], row['Prediction'], loc[0], loc[1], loc[2]])
            
    false_rows.sort(key=operator.itemgetter(0))
    evaluation = pd.DataFrame(false_rows, columns=['Idx', 'Question', 'Label', 'Pred', 'Type', 'File', 'Line in File'])
    display('Found {0} possibly wrongly labeled questions.'.format(len(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])))
    display(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])
    checked_rows.update(evaluation['Idx'].values)

    with open('checked_list.pickle', 'wb') as f:
        pickle.dump(checked_rows, f, protocol=2)

In [15]:
def create_pickle(data, name):
    with open('pickles/' + name + '.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=2)

def load_pickle(name):
    with open('pickles/' + name + '.pickle', 'rb') as f:
        return pickle.load(f)

***

# 5 Bag of words


## 5.1 Creating words based feature set

In [5]:
bag_words = set(word for row in data_set['Feature'] for word in row)
print('Created bag of words. Amount of words: {0}'.format(len(bag_words)))

Created bag of words. Amount of words: 5717


In [181]:
%%time

fs_words = pd.DataFrame([([(word in data['Feature']) for word in bag_words], data['Label']) for _, data in data_set.iterrows()], columns=['Feature', 'Label'])
print('Encoded bag of words feature set.')

Created bag of words. Amount of words: 5717
Encoded bag of words feature set.
Time taken: 311.83122873306274


In [185]:
create_pickle(fs_words, 'fs_words')

In [13]:
fs_words = load_pickle('fs_words')

## 5.2 Getting data points for training and evaluation

In [115]:
X_word_train, X_word_test, y_word_train, y_word_test = get_train_test(fs_words, 0.2)

# 6 Bag of ngrams


## 6.1 Creating bigrams of words based feature set

In [14]:
bag_ngrams = set(gram for row in data_set['Feature'] for gram in ngrams(row, 2))
print('Created bag of ngrams. Amount of ngrams: {0}'.format(len(bag_ngrams)))

Created bag of ngrams. Amount of ngrams: 18922


In [192]:
%%time
fs_ngrams= pd.DataFrame([([gram in ngrams(data['Feature'], 2) for gram in bag_ngrams], data['Label']) for _, data in data_set.iterrows()], columns=['Feature', 'Label'])
print('Encoded bag of ngrams feature set.')

Created bag of ngrams. Amount of ngrams: 18922
Encoded bag of ngrams feature set.
Time taken: 1407.278207540512


In [194]:
create_pickle(fs_ngrams, 'fs_ngrams')

In [15]:
fs_ngrams = load_pickle('fs_ngrams')

## 6.2 Getting data points for training and evaluation

In [16]:
X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = get_train_test(fs_ngrams, 0.2)

Time taken: 0.0019922256469726562


# 7 Part-of-Speech Tags

## 7.1 Creating postags based feature set

In [16]:
bag_tags = set(tag for row in data_set['PosTags'] for tag in row)
bag_tags_bigrams = set(tag for gram  in ngrams(data_set['PosTags'], 2) for tag in row)
bag_tags_trigrams = set(tag for gram  in ngrams(data_set['PosTags'], 3) for tag in row)
print('Created bag of PosTags. Amount of tags: {0}'.format(len(bag_tags)))

Created bag of PosTags. Amount of tags: 47


In [17]:
%%time

fs_pos = pd.DataFrame([([(tag in row['PosTags']) for tag in bag_tags], row['Label']) for _, row in data_set.iterrows()], columns=['Feature', 'Label'])
fs_pos_bigram = pd.DataFrame([([(tag in ngrams(row['PosTags'], 2)) for tag in bag_tags_bigrams], row['Label']) for _, row in data_set.iterrows()], columns=['Feature', 'Label'])
fs_pos_trigrams = pd.DataFrame([([(tag in ngrams(row['PosTags'], 3)) for tag in bag_tags_trigrams], row['Label']) for _, row in data_set.iterrows()], columns=['Feature', 'Label'])
print('Encoded bag of PosTags feature set.')

Created bag of PosTags. Amount of tags: 47
Encoded bag of PosTags feature set.
Time taken: 2.9926722049713135


In [189]:
create_pickle(fs_pos, 'fs_pos')

In [17]:
fs_pos = load_pickle('fs_pos')

## 7.2 Getting data points for training and evaluation

In [15]:
X_pos_train, X_pos_test, y_pos_train, y_pos_test = get_train_test(fs_pos, 0.2)

***
## 8 Combining features

### 8.1 Creating feature count for words with minimum count

#### Setting a minimum frequency for words

In [None]:
%%time
min_word_count = 3


sents = data_set['Feature'].values.tolist()
sent_merged = [j for i in sents for j in i]
word_count = Counter(sent_merged)

bag_words_min = set(word for word in bag_words if word_count[word] >= min_word_count)

min_word_frame = deepcopy(data_set)
min_word_frame['Feature_min'] = ''

for index, row in min_word_frame.iterrows():
    f = []
    for word in row['Feature']:
        if word_count[word] >= min_word_count:
            f.append(word)
    min_word_frame.at[index, 'Feature_min'] = f
    
fs_words_min = pd.DataFrame([([(word in data['Feature_min']) for word in bag_words_min], data['Label']) for _, data in min_word_frame.iterrows()], columns=['Feature', 'Label'])

In [90]:
test_all(fs_words_min)

'Evaluated 5 questions with k_nearest. Average accuracy: 0.8163509471585245. Time taken: 10.457966804504395

'Evaluated 5 questions with naive_bayes. Average accuracy: 0.8743768693918245. Time taken: 0.62313437461853034

'Evaluated 5 questions with random_forest. Average accuracy: 0.9373878364905284. Time taken: 5.6189074516296394

'Evaluated 5 questions with decision_tree. Average accuracy: 0.9347956131605184. Time taken: 4.7906150817871092

'Evaluated 5 questions with SVM. Average accuracy: 0.9457627118644067. Time taken: 32.224030256271368

'Evaluated 5 questions with XG_Boost. Average accuracy: 0.9443668993020937. Time taken: 187.94302034378052



#### Setting a minimum frequency for bigrams

In [198]:
bigram_count = {}
min_bigram_count = 2
bigrams = [j for i in sents for j in ngrams(i, 2)]
bigram_count = Counter(bigrams)

#bag_ngrams_min = set(bigram for bigram in bag_ngrams if bigram_count[(bigram[0], bigram[1])] >= min_bigram_count)



bag_ngrams_min = set(gram for row in min_word_frame['Feature_min'] for gram in ngrams(row, 2))

min_bigram_frame = deepcopy(data_set)
min_bigram_frame['Feature_min'] = ''

for index, row in min_bigram_frame.iterrows():
    f = []
    for i, bigram in enumerate(ngrams(row['Feature'], 2)):
        if bigram_count[(bigram[0], bigram[1])] >= min_bigram_count:
            f.append(bigram)
    min_bigram_frame.at[index, 'Feature_min'] = f
    


In [200]:
%%time
fs_bigrams_min = pd.DataFrame([([gram in data['Feature_min'] for gram in bag_ngrams_min], data['Label']) for _, data in min_bigram_frame.iterrows()], columns=['Feature', 'Label'])

Wall time: 9min 46s


In [163]:
clf = KMeans(n_clusters=2)
clf.fit(X_word_train)
pred = clf.predict(X_word_test)

In [164]:
wrong = 0
for i, x in enumerate(pred):
    if y_word_test['Label'].values.tolist()[i] != x:
        wrong += 1
print((len(pred) - wrong) / len(pred))
print(wrong)

0.5174476570289133
484


### 8.2 Creating combined features

#### Word + pos

In [284]:
fs_words_pos = deepcopy(fs_pos)
fs_words_pos['Feature'] = fs_words['Feature'] + fs_word_pos['Feature']

In [212]:
test_all(fs_word_pos, runs=2,  f_col='Combined')

'Evaluated 2 questions with k_nearest. Average accuracy: 0.8773678963110667. Time taken: 23.500316619873047

'Evaluated 2 questions with naive_bayes. Average accuracy: 0.9027916251246262. Time taken: 1.0892376899719238

'Evaluated 2 questions with random_forest. Average accuracy: 0.9466600199401795. Time taken: 8.268470287322998

'Evaluated 2 questions with decision_tree. Average accuracy: 0.9506480558325026. Time taken: 5.6662356853485116

'Evaluated 2 questions with SVM. Average accuracy: 0.9586241276171485. Time taken: 49.330788135528564

'Evaluated 2 questions with XG_Boost. Average accuracy: 0.9675972083748754. Time taken: 196.55791854858398



#### Word min + pos

In [283]:
fs_min_pos = deepcopy(fs_pos)
fs_min_pos['Feature'] = fs_words_min['Feature'] + fs_min_pos['Feature']

#### Word + bigram

In [282]:
fs_words_ngrams = deepcopy(fs_ngrams)
fs_ngrams_pos['Feature'] = fs_words['Feature'] + fs_ngrams['Feature']

#### Bigram + pos

In [281]:
fs_ngrams_pos = deepcopy(fs_ngrams)
fs_ngrams_pos['Feature'] = fs_ngrams['Feature'] + fs_min_pos['Feature']

#### Word + bigram + pos

In [280]:
fs_words_ngrams_pos = deepcopy(fs_ngrams)
fs_words_ngrams_pos['Feature'] = fs_words['Feature'] + fs_ngrams['Feature'] + fs_pos['Feature']

### Transforming the featureset to TfIdf

In [248]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [249]:
%%time
tfidf_frame = pd.DataFrame(index=range(len(data_set)), columns=bag_words).fillna(0.0)
data_set_sent = [' '.join(data['Feature']) for _, data in data_set.iterrows()]

bloblist = [tb(i) for i in data_set_sent]
tfidf_list = []
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    for word in scores:
        tfidf_frame.at[i, word] = round(scores[word], 5)
    tfidf_list.append(scores)

tfidf_values = tfidf_frame.values.tolist()
fs_tfidf_words = deepcopy(fs_words)
fs_tfidf_words['Feature'] = tfidf_values
data_set['TfIdf'] = tfidf_list


Wall time: 52.6 s


### Word2Vec feature

#### Loading the model

In [244]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('german.model', binary=True,)
print('Loaded word2vec model.')

Loaded word2vec model.


#### Defining a function that returns an array of vectors of a given line of text

In [245]:
def get_w2v(word):
    try:
        vec = w2v_model.get_vector(word)
        return np.round(vec, 8)
    except:
        vec = np.zeros(shape=(1, 300))
        return vec[0]

def get_vectors(text):
    vectors = {}
    for word in text[:-1]:
            vec[i] = get_w2v(word)
    return vectors

#### Creating the featureset using a weighted average (TfIdf) of the phrase vectors

In [251]:
data_set_w2v = deepcopy(data_set)
data_set_w2v['Word2Vec'] = ''

bag_w2v = {}
for word in bag_words:
    bag_w2v[word] = get_w2v(word)

In [285]:
for i, row in data_set_w2v.iterrows():
    avg_vecs = []
    word_len = 0
    for j, word in enumerate(row['Feature'][:-1]):
        vec = bag_w2v[word]
        if not sum(vec) == 0:
            word_len += 1
            
        avg_vecs.append(list(vec * row['TfIdf'][word]))
        
    if word_len > 0:
        data_set_w2v.at[i, 'Word2Vec'] = np.array(avg_vecs).sum(axis=0) / word_len
    else: 
        data_set_w2v.at[i, 'Word2Vec'] = np.array(avg_vecs).sum(axis=0)
        
w2v_values = list(np.round(data_set_w2v['Word2Vec'].values.tolist(), 6))
fs_w2v = deepcopy(fs_words)
fs_w2v['Feature'] = w2v_values

In [212]:
test_all(fs_w2v, 'Word2Vec')

'Evaluated 10 questions with k_nearest. Average accuracy: 0.639581256231306. Time taken: 4.5955443382263185

'Evaluated 10 questions with random_forest. Average accuracy: 0.6959122632103688. Time taken: 7.229034662246704

'Evaluated 10 questions with decision_tree. Average accuracy: 0.5897308075772681. Time taken: 9.908966064453125

'Evaluated 10 questions with SVM. Average accuracy: 0.702791625124626. Time taken: 40.528583526611332

'Evaluated 10 questions with XG_Boost. Average accuracy: 0.7082751744765703. Time taken: 163.2045726776123



## DOC2VEC

In [253]:
def read_corpus(x, tokens_only=False):
    for i, line in x.iterrows():
        if tokens_only:
            yield gensim.models.doc2vec.TaggedDocument(line['Feature'])
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(line['Feature'], tags=[line['Label']])

In [254]:
shuffled = data_set.sample(frac=1)
d_train, d_test = shuffled.head(int(len(shuffled) * 0.8)), shuffled.tail(len(shuffled) - int(len(shuffled) * 0.8))
train_corpus = list(read_corpus(d_train))
test_corpus = list(read_corpus(d_test))

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=40)
model.build_vocab(train_corpus)
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Wall time: 4.42 s


In [286]:
labels = []
for line, idx in train_corpus:
    inferred_vector = model.infer_vector(line)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    pred_label = []
    for i, value in sims[:1]:
        pred_label.append(data_set.loc[i, 'Label'])
    labels.append([int(sum(pred_label) / len(pred_label)), int(data_set.loc[idx, 'Label'])])

wrong = 0
total = len(labels)
for x,y in labels:
    if x != y:
        wrong += 1
print(1 - wrong/total)

fs_d2v = deepcopy(data_set)
for i, row in data_set_d2v.iterrows():
    fs_d2v.at[i, 'Feature'] = model.infer_vector(row['Feature'])

0.9206586826347305


In [605]:
test_all(data_set_d2v, f_col='Doc2Vec')

'Evaluated 10 questions with XG_Boost. Average accuracy: 0.8674975074775674. Time taken: 127.69767332077026



In [55]:
fs = fs_words
runs = 100
get_average_accuracy('naive_bayes', runs, fs, 0.2, {})
get_average_accuracy('naive_bayes_x', runs, fs, 0.2, {})
get_average_accuracy('naive_bayes_y', runs, fs, 0.2, {})


'Evaluated 100 questions with naive_bayes. Average accuracy: 0.8621036889332. Time taken: 55.35055494308472866

'Evaluated 100 questions with naive_bayes_x. Average accuracy: 0.8616849451645063. Time taken: 55.67763805389404

'Evaluated 100 questions with naive_bayes_y. Average accuracy: 0.8686440677966101. Time taken: 62.08309197425842



0.8686440677966101

In [258]:
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.125, 1.5],
 'n_estimators':[75, 100],
 'max_depth': [5, 6, 7],
 'min_child_weight': [0.5, 1],
 'subsample': [0.5, 1]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.125,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_word_min_train), np.array(y_word_min_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

ModuleNotFoundError: No module named 'sklearn.grid_search'

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(xg_class_word, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
#fig, ax = plt.subplots(figsize=(30, 30))
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(xg_class_word, ax=ax)
plt.show()

In [230]:
fs_ngram_pos = 

'Evaluated 5 questions with k_nearest. Average accuracy: 0.8167497507477567. Time taken: 59.298043251037686

'Evaluated 5 questions with naive_bayes. Average accuracy: 0.8528414755732803. Time taken: 2.6805853843688965

'Evaluated 5 questions with random_forest. Average accuracy: 0.9335992023928215. Time taken: 31.888021230697632

'Evaluated 5 questions with decision_tree. Average accuracy: 0.9405782652043868. Time taken: 21.057191371917725

'Evaluated 5 questions with SVM. Average accuracy: 0.9517447657028913. Time taken: 189.21160888671875

'Evaluated 5 questions with XG_Boost. Average accuracy: 0.9397806580259223. Time taken: 467.11976194381714



In [16]:
fs_words = load_pickle('featuresets/fs_words')
fs_ngrams = load_pickle('featuresets/fs_ngrams')
fs_pos = load_pickle('featuresets/fs_pos') 
fs_words_min = load_pickle('featuresets/fs_words_min')
fs_ngrams_min = load_pickle('featuresets/fs_ngrams_min')
fs_tfidf_words = load_pickle('featuresets/fs_tfidf_words')
fs_words_ngrams = load_pickle('featuresets/fs_words_ngrams')
fs_words_pos = load_pickle('featuresets/fs_words_pos')
fs_words_min_pos = load_pickle('featuresets/fs_min_pos')
fs_ngrams_pos = load_pickle('featuresets/fs_ngrams_pos')
fs_words_ngrams_pos = load_pickle('featuresets/fs_words_ngrams_pos')
fs_w2v = load_pickle('featuresets/fs_w2v')
fs_d2v = load_pickle('featuresets/fs_d2v')

In [120]:
X_train, X_test, y_train, y_test = get_train_test(fs_words, random_state = 1)

In [121]:
import tensorflow as tf
from tensorflow import keras

In [122]:
model = keras.Sequential([
    keras.layers.Dense(5717, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [124]:
model.fit(np.array(X_train), np.array(y_train), epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5




<tensorflow.python.keras.callbacks.History at 0x1430480b908>

In [125]:
test_loss, test_acc = model.evaluate(X_test, y_test)




In [126]:
test_loss

0.31536366303979696

In [127]:
test_acc

0.9122632106660251

In [81]:
%%time
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500, 2), random_state=1)
clf.fit(csc_matrix(X_train), y_train)
print(clf.score(X_test, y_test))

0.9312063808574277
Wall time: 54.3 s


In [93]:
clf.score(X_test, y_test)

ValueError: Found array with 0 sample(s) (shape=(0, 200)) while a minimum of 1 is required.


    'base_score': 0.5
    'booster': 'gbtree'
    'colsample_bylevel': 1
    'colsample_bytree':
    'gamma': 0
    'learning_rate': 0.1
    'max_delta_step': 0
    'max_depth': 3,
    'min_child_weight': 1
    'missing': None
    'n_estimators': 100
    'nthread': 1
    'objective': 'binary:logistic'
    'reg_alpha': 0
    'reg_lambda': 1
    'scale_pos_weight': 1
    'seed': 0
    'silent': 1
    'subsample': 1



In [107]:
%%time


featuresets = (fs_words, fs_ngrams, fs_pos, fs_words_min, fs_ngrams_min, fs_tfidf_words, fs_words_ngrams, fs_words_pos, fs_words_min_pos, fs_ngrams_pos, fs_words_ngrams_pos, fs_w2v, fs_d2v)
featureparams = {  'fs_words': {'random_state': 1}, 
               
                  #fs_ngrams': {'random_state': 1}, 
                   #'fs_pos': {'random_state': 1}, 
                   #'fs_words_min': {'random_state': 1}, 
                   #'fs_ngrams_min': {'random_state': 1}, 
                   #'fs_tfidf_words': {'random_state': 1}, 
                   #'fs_words_ngrams': {'random_state': 1}, 
                   #'fs_words_pos': {'random_state': 1}, 
                   #'fs_words_min_pos': {'random_state': 1}, 
                   #'fs_ngrams_pos': {'random_state': 1}, 
                   #'fs_words_ngrams_pos':  {'solver': 'lbfgs', 'random_state': 1}, 
                   #'fs_w2v': {'random_state': 1}, 
                   #'fs_d2v': {'random_state': 1}
                    }

scores = []
for fs in featuresets:
    name = [k for k,v in locals().items() if v is fs][0]
    if not featureparams[name]:
            continue
    X_train, X_test, y_train, y_test = get_train_test(fs, test_size=0, random_state=1)
    try:
        param_test1 =  {
            'hidden_layer_sizes': [(5717, 2)],
        }
        gsearch1 = GridSearchCV(estimator = MLPClassifier(**featureparams[name]), 
        param_grid = param_test1, n_jobs=1,iid=False, cv=2)
        gsearch1.estimator.get_params()
        gsearch1.fit(csc_matrix(np.array(X_train)), np.array(y_train))
        print(name, gsearch1.best_params_, gsearch1.best_score_)
        scores.append(pd.DataFrame(gsearch1.cv_results_))
    except Exception as e:
        print(e)
        print('Skipped', name)
        continue
scores = pd.concat(scores)


KeyboardInterrupt: 

In [102]:
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_hidden_layer_sizes,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,13.937604,1.582694,0.20406,0.192057,"(100, 100, 100)","{'hidden_layer_sizes': (100, 100, 100)}",0.893057,0.893812,0.893435,0.000378,2,1.0,0.999601,0.9998,0.0002
1,18.210455,2.887299,0.007502,0.0005,"(100, 100, 2)","{'hidden_layer_sizes': (100, 100, 2)}",0.899042,0.88982,0.894431,0.004611,1,1.0,0.999601,0.9998,0.0002
2,17.822761,0.183867,0.007002,0.001,"(100, 100)","{'hidden_layer_sizes': (100, 100)}",0.892658,0.88982,0.891239,0.001419,3,1.0,0.999601,0.9998,0.0002


In [105]:
len(fs_words['Feature'][0])

5717