# XGBoost Classifier

## 1 Module Import
***

In [109]:
import sys
sys.path.append('..')

import xgboost as xgb
import pandas as pd
import numpy as np
import Classification.config as cfg
import csv
import nltk
import re
import random
import warnings
import spacy
import pickle
import time
import operator
from nltk.util import ngrams
from copy import deepcopy
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import neighbors, svm
warnings.filterwarnings('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 100)

## 2.1 Defining the string cleaner
***

In [9]:
def clean(string):
    #return string
    clean_string = string.replace(u'\xa0', u' ')
    clean_string = re.sub(r'\d+', ' number ', clean_string)
    clean_string = re.sub(r'\n', ' ', clean_string)
    clean_string = re.sub(r'Ä', 'Ae', clean_string)
    clean_string = re.sub(r'ä', 'ae', clean_string)
    clean_string = re.sub(r'Ö', 'Oe', clean_string)
    clean_string = re.sub(r'ö', 'oe', clean_string)
    clean_string = re.sub(r'Ü', 'Ue', clean_string)
    clean_string = re.sub(r'ü', 'ue', clean_string)
    clean_string = re.sub(r'ß', 'ss', clean_string)
    clean_string = re.sub(r'°', ' Grad ', clean_string)
    clean_string = re.sub(r'24/7', 'immer', clean_string)
    clean_string = re.sub(r'/', ' ', clean_string)
    clean_string = re.sub(r'%', ' Prozent ', clean_string)
    clean_string = re.sub(r'[Zz][Bb]', 'zum Beispiel', clean_string)
    clean_string = re.sub(r'[Dd][Hh]', 'das heißt', clean_string)
    clean_string = re.sub(r'[Bb][Ss][Pp][Ww]', 'beispielsweise', clean_string)
    clean_string = re.sub(r'[Hh]allo', '', clean_string)
    clean_string = re.sub(r'[Hh]i', '', clean_string)
    clean_string = re.sub(r'[Hh]ey', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Mm]orgen', '', clean_string)
    clean_string = re.sub(r'[Gg]uten\s[Aa]bend', '', clean_string)
    
    clean_string = re.sub(r'(\([^)]*\))', ' ', clean_string)
    clean_string = re.sub(r'"', '', clean_string)
    clean_string = re.sub(r'\+', '', clean_string)
    clean_string = re.sub(r'-', '', clean_string)
    clean_string = re.sub(r',', ' ', clean_string)
    clean_string = re.sub(r'\^', '', clean_string)
    clean_string = re.sub(r'\'', '', clean_string)
    clean_string = re.sub(r'`', '', clean_string)
    clean_string = re.sub(r'´', '', clean_string)


    clean_string = re.sub(r'\'', '', clean_string)
    clean_string = re.sub(r'\.', '', clean_string)
    clean_string = re.sub(r'\s{2,}', ' ', clean_string)
    clean_string = re.sub(r'\s(?=\?)', ' ', clean_string)
    clean_string = re.sub(r'\?*(?=(?:\?))', '', clean_string)
    clean_string = clean_string.strip()
    #bitte, danke, und, eigentlich, überhaupt, git, wirklich
    return clean_string#.lower()



## 2.2 Building the lemmatizer
***

In [10]:
nlp = spacy.load('de')

def lemmatizer(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)



## 3.1 Importing the dataset
***

In [11]:
start_time = time.time()

data_set = []
data_set_index = pd.DataFrame(columns=['File Name', 'Start', 'End', 'Size'])
start_index = 0

for i, file in enumerate(cfg.ALL_FILES):
    file_count = len(cfg.ALL_FILES)
    reader = csv.reader(open(file, 'r'), delimiter=';')

    for line in reader:
        try:
            data_set.append([lemmatizer(clean(line[0])).lower(), line[1]])
        except Exception as e:
            print(e)
    
    print(('({0}/{1} Imported file {2}. Total length: {3})'.format(i + 1, file_count, file, len(data_set))), end='\r')
    
    data_set_index.loc[i] = [file, start_index, len(data_set) - 1, len(data_set) - 1 - start_index]
    start_index = len(data_set)

print('\n\nImport finished. Total length of data set: {}'.format(len(data_set)))

data_set_frame = pd.DataFrame(data_set, columns =['Feature', 'Label'])

print('Time taken:', time.time() - start_time)

(30/30 Imported file C:\Users\Josef\PycharmProjects\QC-Yes-No\Corpus\Tweets\questions\output_1000_tweets_2018-09-22_17-18-48\classified.csv. Total length: 5011)

Import finished. Total length of data set: 5011
Time taken: 37.56443214416504



## 3.2 Creating PosTag feature
***

In [14]:
start_time = time.time()

with open('..\\Classification\\nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)
    
data_set_frame['PosTags'] = ''

for i, line in enumerate(data_set_frame['Feature']):
    sent = nltk.tokenize.WordPunctTokenizer().tokenize(line)
    tag_line = []
    for tag in tagger.tag(sent):
        tag_line.append(tag[1])
    data_set_frame.at[i, 'PosTags'] = tag_line
    #print(('Tagged line {}'.format(i)), end='\r')

print('Time taken:', time.time() - start_time)

Time taken: 41.63926982879639


## 4.1 Defining the Classifier functions

### k-nearest Neighbors

__Default Parameters:__
 - 'algorithm': 'auto'
 - 'leaf_size': 30
 - 'metric': 'minkowski'
 - 'metric_params': None
 - 'n_jobs': 1
 - 'n_neighbors': 5
 - 'p': 2
 - 'weights': 'uniform'

In [87]:
def k_nearest(X_train, X_test, y_train, y_test, parameters = {}):
    kn_clf = neighbors.KNeighborsClassifier(**parameters)
    kn_clf.fit(X_word_train, y_word_train)
    kn_accuracy = kn_clf.score(X_word_test, y_word_test)
    return kn_accuracy

### Naive Bayes

__Default Parameters:__
 - 'alpha': 1.0
 - 'class_prior': None
 - 'fit_prior': True

In [165]:
def naive_bayes(X_train, X_test, y_train, y_test, parameters = {}):
    nb_clf = MultinomialNB(**parameters)
    nb_clf.fit(X_word_train, y_word_train)
    nb_accuracy = nb_clf.score(X_word_test, y_word_test)
    return nb_accuracy

### Decision Tree

__Default Parameters:__
 - 'class_weight': None
 - 'criterion': 'gini'
 - 'max_depth': None
 - 'max_features': None
 - 'max_leaf_nodes': None
 - 'min_impurity_decrease': 0.0
 - 'min_impurity_split': None
 - 'min_samples_leaf': 1
 - 'min_samples_split': 2
 - 'min_weight_fraction_leaf': 0.0
 - 'presort': False
 - 'random_state': None,
 - 'splitter': 'best'

In [166]:
def decision_tree(X_train, X_test, y_train, y_test, parameters = {}):
    dt_clf = DecisionTreeClassifier(**parameters)
    dt_clf.fit(X_word_train, y_word_train)
    dt_accuracy = dt_clf.score(X_word_test, y_word_test)
    return dt_accuracy

### Support Vector Machine

__Default Parameters:__
 - 'C': 1.0
 - 'cache_size': 200
 - 'class_weight': None
 - 'coef0': 0.0
 - 'decision_function_shape': 'ovr'
 - 'degree': 3
 - 'gamma': 'auto'
 - 'kernel': 'rbf'
 - 'max_iter': -1
 - 'probability': False
 - 'random_state': None
 - 'shrinking': True
 - 'tol': 0.001
 - 'verbose': False

In [81]:
def SVM(X_train, X_test, y_train, y_test, parameters = {}):
    svm_clf = svm.SVC(**parameters)
    svm_clf.fit(X_train, y_train)
    svm_accuracy = svm_clf.score(X_test, y_test)
    return svm_accuracy

### XG Boost

__Default Parameters:__
 - 'base_score': 0.5
 - 'booster': 'gbtree'
 - 'colsample_bylevel': 1
 - 'colsample_bytree': 
 - 'gamma': 0
 - 'learning_rate': 0.1
 - 'max_delta_step': 0
 - 'max_depth': 3, 
 - 'min_child_weight': 1
 - 'missing': None
 - 'n_estimators': 100
 - 'nthread': 1
 - 'objective': 'binary:logistic'
 - 'reg_alpha': 0
 - 'reg_lambda': 1
 - 'scale_pos_weight': 1
 - 'seed': 0
 - 'silent': 1 
 - 'subsample': 1

In [69]:
def XG_Boost(X_train, X_test, y_train, y_test, parameters = {}):
    xg_clf = xgb.XGBClassifier(**parameters)
    xg_clf.fit(X_train, y_train)
    xg_accuracy = xg_clf.score(X_test, y_test)
    return xg_accuracy

## 4.2 Defining the train-test split function

In [155]:
def get_train_test(feature_set, test_size=0.2):
    X, y = feature_set.iloc[:,:-1], feature_set.iloc[:,-1]
    X_frame = np.array(X[0].tolist())
    X_train, X_test, y_train, y_test = train_test_split(X_frame, y, test_size=test_size)
    return X_train, X_test, y_train, y_test

## 4.3 Defining the evaluation function

In [120]:
def evaluate(test_frame, predictions):
    with open('checked_list.pickle', 'rb') as f:
        checked_rows = pickle.load(f)
    
    test_frame_local = deepcopy(test_frame)
    test_frame_local.columns = ['Label']
    test_frame_local['Prediction'] = preds
    false_rows = []
    
    def find_dataset_location(index):
        found = data_set_index.loc[(data_set_index['Start'] <= index) & (data_set_index['End'] >= index)]
        found_info = found['File Name'].item().split('\\Corpus\\')[1]
        found_type, found_file = found_info.split('\\')[0], found_info.split('\\')[2]
        index_in_file = index - found['Start'].item() + 1
        return found_type, found_file, index_in_file
    
    
    for index, row in test_frame_local.iterrows():
        i=1
        if row['Prediction'] != row['Label']:
            i+=1
            loc = find_dataset_location(index)
            false_rows.append([index, data_set[index][0], row['Label'], row['Prediction'], loc[0], loc[1], loc[2]])
            
    false_rows.sort(key=operator.itemgetter(0))
    evaluation = pd.DataFrame(false_rows, columns=['Idx', 'Question', 'Label', 'Pred', 'Type', 'File', 'Line in File'])
    display('Found {0} possibly wrongly labeled questions.'.format(len(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])))
    display(evaluation.loc[~evaluation['Idx'].isin(checked_rows)])
    checked_rows.update(evaluation['Idx'].values)

    with open('checked_list.pickle', 'wb') as f:
        pickle.dump(checked_rows, f, protocol=2)


***


# Building the feature bags

In [15]:
start_time = time.time()

tags_bag = set(tag for index, row in data_set_frame.iterrows() for tag in row['PosTags'])
print('Created bag of tags. Amount of tags: {0}'.format(len(tags_bag)))

print('Time taken:', time.time() - start_time)

Created bag of words. Amount of words: 5960
Created bag of ngrams. Amount of ngrams: 19582
Created bag of tags. Amount of tags: 45
Time taken: 0.37308526039123535


# 5 Bag of words


## 5.1 Creating words based feature set

In [158]:
start_time = time.time()

bag_words = set(word for passage in data_set for word in nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]))
print('Created bag of words. Amount of words: {0}'.format(len(words_bag)))

fs_words= pd.DataFrame([([(word in nltk.tokenize.WordPunctTokenizer().tokenize(data[0])) for word in bag_words], data[1]) for data in data_set])
print('Encoded bag of words feature set.')

display('Time taken:', time.time() - start_time)

Created bag of words. Amount of words: 5960
Encoded bag of words feature set.


'Time taken:'

151.46939134597778

## 5.2 Getting data points for training and evaluation

In [159]:
start_time = time.time()

X_word_train, X_word_test, y_word_train, y_word_test = get_train_test(fs_words, 0.2)

display('Time taken:', time.time() - start_time)

'Time taken:'

0.3710825443267822

## 5.3 Training and evaluating the model

####          Naive Bayes

In [160]:
parameters = {}
print(naive_bayes(X_word_train, X_word_test, y_word_train, y_word_test, parameters))

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}
0.8554336989032901


#### Decision Tree

In [161]:
parameters = {}
print(decision_tree(X_word_train, X_word_test, y_word_train, y_word_test, parameters))

{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
0.9481555333998006


#### k-nearest-Neigbors

In [162]:
parameters = {'n_jobs' : -1}
print(k_nearest(X_word_train, X_word_test, y_word_train, y_word_test, parameters))

0.8325024925224327


#### Support Vector Machine

In [163]:
parameters = {'kernel' : 'linear'}
print(SVM(X_word_train, X_word_test, y_word_train, y_word_test, parameters))

0.9411764705882353


#### XGBoost

In [164]:
parameters = {'max_depth' : 5, 'n_estimators' : 100, 'learning_rate' : 0.125, 'min_child_weight' : 1, 'njobs' : -1}
print(XG_Boost(X_word_train, X_word_test, y_word_train, y_word_test, parameters))

0.9391824526420738


***

# 6 Bag of ngrams


## 6.1 Creating bigrams of words based feature set

In [None]:
start_time = time.time()

bag_ngrams = set(gram for passage in data_set for gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(passage[0]), 2))
print('Created bag of ngrams. Amount of ngrams: {0}'.format(len(ngrams_bag)))

fs_ngrams= pd.DataFrame([([(gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(data[0]), 2)) for gram in ngrams_bag], data[1]) for data in data_set])
print('Encoded bag of words feature set.')

display('Time taken:', time.time() - start_time)

Created bag of ngrams. Amount of ngrams: 19582


## 6.2 Creating data points for training and evaluation

In [None]:
start_time = time.time()

x_ngram_train, x_ngram_test, y_ngram_train, y_ngram_test = get_train_test(fs_ngrams, 0.2)

display('Time taken:', time.time() - start_time)

## 6.3 Training and evaluating the model

####          Naive Bayes

In [160]:
parameters = {}
print(naive_bayes(X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test, parameters))

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}
0.8554336989032901


#### Decision Tree

In [161]:
parameters = {}
print(decision_tree(X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test, parameters))

{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
0.9481555333998006


#### k-nearest-Neigbors

In [162]:
parameters = {'n_jobs' : -1}
print(k_nearest(X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test, parameters))

0.8325024925224327


#### Support Vector Machine

In [163]:
parameters = {'kernel' : 'linear'}
print(SVM(X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test, parameters))

0.9411764705882353


#### XGBoost

In [164]:
parameters = {'max_depth' : 6, 'n_estimators' : 200, 'learning_rate' : 0.15, 'min_child_weight' : 1, 'njobs' : -1}
print(XG_Boost(X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test, parameters))

0.9391824526420738


### 7.1 Creating postags based feature set

In [None]:
with open('Classification\\nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)
    
data_set_frame = pd.DataFrame(data_set)
data_set_frame['PosTags'] = ''
for i, line in enumerate(data_set_frame[0]):
    sent = nltk.tokenize.WordPunctTokenizer().tokenize(line)
    tag_line = []
    for tag in tagger.tag(sent):
        tag_line.append(tag[1])
    data_set_frame.at[i, 'PosTags'] = tag_line

In [None]:
tag_bag = set(tag for index, row in data_set_frame.iterrows() for tag in row['PosTags'])
fs_pos= [([(tag in row['PosTags']) for tag in tag_bag], row[1]) for item, row in data_set_frame.iterrows()]

In [None]:
bag_pos = pd.DataFrame(fs_pos)
x_pos, y_pos = bag_pos.iloc[:,:-1],bag_pos.iloc[:,-1]
x_frame_pos = pd.DataFrame(x_pos[0].tolist(), columns = tag_bag)
x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(x_frame_pos, y_pos, test_size=0.2)

In [None]:
xg_class_pos = xgb.XGBClassifier(max_depth=6, n_estimators=125, learning_rate=0.125, min_child_weight = 1, njobs=4)
xg_class_pos.fit(x_pos_train, y_pos_train)
preds = xg_class_pos.predict(x_pos_test)
accuracy_score(preds, y_pos_test)

In [None]:
x_frame_com = pd.concat([x_frame_pos, x_frame_word], axis=1)
x_com_train, x_com_test, y_com_train, y_com_test = train_test_split(x_frame_com, y_pos, test_size=0.2)

In [None]:
xg_class_com = xgb.XGBClassifier(max_depth=6, n_estimators=125, learning_rate=0.125, min_child_weight = 0.5, subsample = 0.5, njobs=4)
xg_class_com.fit(x_com_train, y_com_train)
preds = xg_class_com.predict(x_com_test)
accuracy_score(preds, y_com_test)

In [None]:
test_frame = pd.DataFrame(y_com_test)
test_frame.columns=['Label']
test_frame['Prediction']=preds
for index, row in test_frame.iterrows():
    if row['Prediction'] != row['Label']:
        print(data_set[index], 'Label:', row['Label'], 'Prediction:', row['Prediction'] )

In [None]:
preds
wrong = [(index, y_com_test[index])  for index in preds if preds[index] != y_com_test[index]]

In [None]:
wrong_2 = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
count = {}
all_lists = list(pd.DataFrame(data_set)[0])
test_string = ''

for e,i in enumerate(all_lists):
    test_string += ' '
    test_string += i
    if i.split().count('?') < 1:
        print(i, e)
test_string = test_string.split()

for word in words_bag:
    count[word] = test_string.count(word)
import operator
sorted_x = sorted(count.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
frame = pd.DataFrame(sorted_x)
frame.set_index(0, inplace=True)
display(frame)

In [None]:
data_set_frame

In [None]:
from copy import deepcopy
test_frame = deepcopy(data_set_frame)
test_frame['Feature_min'] = ''
for index, row in test_frame.iterrows():
    for word in row[0].rsplit():
        if frame[1][word] > 10:
            test_frame.at[index, 'Feature_min'] += (word + ' ')
        test_frame.at[index, 'Feature_min'] = test_frame.at[index, 'Feature_min']
    print(('Checked row: {}'.format(index)), end='\r')

In [None]:
words_bag_min = set(word for word in words_bag if frame[1][word] > 50)


In [None]:
test_frame

In [None]:
fs_words_min= [([(word in data[2].split()) for word in words_bag_min], data[1]) for index, data in test_frame.iterrows()]
bag_words_min = pd.DataFrame(fs_words_min)

In [None]:
x_word_min, y_word_min = bag_words_min.iloc[:,:-1],bag_words_min.iloc[:,-1]
x_frame_word_min = pd.DataFrame(x_word_min[0].tolist(), columns = words_bag_min)
x_word_min_train, x_word_min_test, y_word_min_train, y_word_min_test = train_test_split(x_frame_word_min, y_word_min, test_size=0.2)

In [None]:
xg_class_word_min = xgb.XGBClassifier(max_depth=7, n_estimators=100, learning_rate=0.125, min_child_weight = 0.4, subsample=0.7, njobs=4)
xg_class_word_min.fit(x_word_min_train, y_word_min_train)
preds = xg_class_word_min.predict(x_word_min_test)
accuracy_score(preds, y_word_min_test)

In [None]:
bag_words

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.125, 1.5],
 'n_estimators':[75, 100],
 'max_depth': [5, 6, 7],
 'min_child_weight': [0.5, 1],
 'subsample': [0.5, 1]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.125,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_word_min_train), np.array(y_word_min_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(xg_class_word, max_num_features=50, height=0.8, ax=ax)
plt.show()

In [None]:
#fig, ax = plt.subplots(figsize=(30, 30))
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(xg_class_word, ax=ax)
plt.show()

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.125, 1.5],
 'n_estimators':[75, 100, 125],
 'max_depth': [4, 5, 6]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.1,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_word_train), np.array(y_word_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:


x, y = bag_ngram.iloc[:,:-1],bag_ngram.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = ngrams_bag)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2)#, random_state=20)

xg_class = xgb.XGBClassifier(max_depth=6, n_estimators=100, learning_rate=0.1, min_child_weight = 1, njobs=4)

In [None]:
xg_class.fit(x_train, y_train)
preds = xg_class.predict(x_test)
accuracy_score(preds, y_test)

In [None]:
fs_ngram = [([(gram in ngrams(nltk.tokenize.WordPunctTokenizer().tokenize(data[0]), 2)) for gram in ngrams_bag], data[1]) for data in data_set]

In [None]:
bag_ngram = pd.DataFrame(fs_ngram)

x, y = bag_ngram.iloc[:,:-1],bag_ngram.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = ngrams_bag)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2)#, random_state=20)

xg_class = xgb.XGBClassifier(max_depth=7, n_estimators=400, learning_rate=0.1, min_child_weight = 1, njobs=4)

In [None]:
xg_class.fit(x_train, y_train)
preds = xg_class.predict(x_test)
accuracy_score(preds, y_test)

In [None]:
bag_encoded = pd.DataFrame(fs_words)

x, y = bag_encoded.iloc[:,:-1],bag_encoded.iloc[:,-1]

x_frame = pd.DataFrame(x[0].tolist(), columns = words_bag)

from sklearn.model_selection import train_test_split
accuracy = []
for i in range(15):
    xg_class = xgb.XGBClassifier(max_depth=5, n_estimators=100, learning_rate=0.125, min_child_weight = 1, njobs=4)
    x_train, x_test, y_train, y_test = train_test_split(x_frame, y, test_size=0.2, random_state=i)
    xg_class.fit(x_train, y_train)
    preds = xg_class.predict(x_test)
    accuracy.append(accuracy_score(preds, y_test))

In [None]:
sum(accuracy) / len(accuracy)

In [None]:
from sklearn.grid_search import GridSearchCV
#Choose all predictors except target & IDcols
param_test1 =  {
 'learning_rate':[0.05, 0.1, 0.15],
 'n_estimators':[75, 100, 125],
 'max_depth': [6, 7],
 'min_child_weight': [1, 2]}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate = 0.1,
                                                      max_depth = 5,
                                                      random_state= 10,
                                                      min_child_weight = 1), 
param_grid = param_test1, n_jobs=4,iid=False, cv=5)
gsearch1.estimator.get_params()
gsearch1.fit(np.array(x_train), np.array(y_train))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
test['Pred']=preds

In [None]:
depth = [3,5, 10]
esti = [50, 100, 300, 1000]
learning_rate = [0.05, 0.2, 0.4]

for d in depth:
    for e in esti:
        for l in learning_rate:
            xg_class = xgb.XGBClassifier(max_depth=d, n_estimators=e, learning_rate=l)
            xg_class.fit(x_train, y_train)
            preds = xg_class.predict(x_test)
            print('Depth:', d, 'Estimators:', e, 'Learning Rate:', l, 'Accuracy', accuracy_score(preds, y_test))

In [None]:
len(preds)*937799043062201

In [None]:
wrong = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
wrong_2 = [(index, test['Pred'][index])  for index in test.index if test[1][index] != test['Pred'][index]]

In [None]:
len(wrong_2)

In [None]:
for i, pred in wrong:
    print(i, data_set[i], 'pred:', pred)