In [1]:
from random import Random, randint

import numpy as np

from classifier.features import FeatureGenerator
from utils import load_obj, save_obj, TextFormatter

In [2]:
# Language model obtained through the project's load_obj helper (presumably a
# previously saved object -- see utils). FeatureGenerator is constructed from
# it when the training features are built.
lm = load_obj("LanguageModel")

In [3]:
def get_formatted_text(text):
    """Run ``text`` through ``TextFormatter`` and return ``(query, words)``.

    ``words`` is the result of the formatter's ``get_query_list()`` call and
    ``query`` is the formatter's ``text`` attribute.
    """
    formatter = TextFormatter(text)
    # get_query_list() is called before .text is read, as in the original
    # order, in case the call updates the formatter's state.
    words = formatter.get_query_list()
    return formatter.text, words

In [4]:
# Build the training set from a random sample of the tab-separated query file.
#
# Labelling rule:
#   * a line with exactly two fields yields two samples: the first field with
#     label 0 and the second field with label 1;
#   * any other line yields one sample: its first field with label 1.
#     NOTE(review): lines with three or more fields also take this branch --
#     confirm that is intended.
SAMPLE_PERCENT = 4  # roughly this percentage of the lines is kept
SAMPLE_SEED = 0     # fixed seed so the same lines are sampled on every run

X = []
y = []
fg = FeatureGenerator(lm)
sampler = Random(SAMPLE_SEED)

# NOTE(review): the file is read with the platform default encoding; pass
# encoding=... explicitly if the encoding of queries_all.txt is known.
with open("queries_all.txt") as f:
    # Stream the file line by line: only a small sample is used, so there is
    # no need to hold every line in memory.
    for line in f:
        if sampler.randint(1, 100) > SAMPLE_PERCENT:
            continue

        # Drop the line terminator so it does not end up in the last field.
        queries = line.rstrip("\r\n").split('\t')

        if len(queries) == 2:
            examples = [(queries[0], 0), (queries[1], 1)]
        else:
            examples = [(queries[0], 1)]

        for text, label in examples:
            query, words = get_formatted_text(text)
            features = fg.generate_features(query, words)
            # Append the sample and its label together, only after feature
            # generation succeeded, so X and y can never get out of step.
            X.append(features)
            y.append(label)

In [5]:
# Every sample is appended together with its label, so the two counts must
# agree; fail loudly here rather than inside the model fit if they do not.
assert len(y) == len(X), "features and labels are out of sync"
len(y), len(X)

(84706, 84706)

In [6]:
# Turn the accumulated Python lists into NumPy arrays so they can be indexed
# with the integer index arrays produced by the cross-validation splitter.
X, y = np.asarray(X), np.asarray(y)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score

In [8]:
# 4-fold cross-validation of the gradient-boosting classifier.
CV_SEED = 0  # fixes the fold shuffling and the model's own randomness

f1 = []
acc = []
kf = KFold(n_splits=4, shuffle=True, random_state=CV_SEED)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    gb = GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        loss='exponential',
        random_state=CV_SEED,
    )
    gb.fit(X_train, y_train)

    y_pred = gb.predict(X_test)
    # F1 is computed with class 0 as the positive class; accuracy covers both
    # classes.
    f1.append(f1_score(y_test, y_pred, pos_label=0))
    acc.append(accuracy_score(y_test, y_pred))

# Mean F1 (class 0) and mean accuracy over the folds.
print(sum(f1)/len(f1))
print(sum(acc)/len(acc))

0.832405491598253
0.9844521091000871


In [9]:
# `gb` is whatever model the last cross-validation fold left behind, i.e. it
# was fitted on only 3 of the 4 folds. Persist a model with the same
# hyperparameters that has been trained on the complete data set instead.
classifier = GradientBoostingClassifier(**gb.get_params())
classifier.fit(X, y)
save_obj(classifier, "Classifier")