In [14]:
import pickle
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb

### Form the dataset

In [2]:
# Read the two pickled embedding collections (one per class) from the working directory.
# NOTE: pickle.load can execute arbitrary code, so these files must come from a trusted source.
with open("bad_embeddings.pkl", mode="rb") as bad_file:
    bad_embeddings = pickle.load(bad_file)

with open("good_embeddings.pkl", mode="rb") as good_file:
    good_embeddings = pickle.load(good_file)

In [3]:
def drop_nans(embed_list):
    """Return a new list with every ``None`` entry of ``embed_list`` removed.

    Despite the name, only entries that are literally ``None`` are dropped;
    float NaN values (or arrays containing NaN) are kept. Order is preserved
    and the input list is not modified.
    """
    kept = []
    for item in embed_list:
        if item is None:
            continue
        kept.append(item)
    return kept

# Filter both classes through drop_nans before turning them into DataFrames.
bad_embeddings, good_embeddings = (
    drop_nans(embeds) for embeds in (bad_embeddings, good_embeddings)
)

In [4]:
# Name of the label column added to every frame below (1 = good, 0 = bad).
target_colname = "reliability"

In [5]:
# One row per embedding; the label column is appended last
# (good embeddings -> 1, bad embeddings -> 0).
good_embeds_df = pd.DataFrame(good_embeddings).assign(**{target_colname: 1})
bad_embeds_df = pd.DataFrame(bad_embeddings).assign(**{target_colname: 0})

In [6]:
# Stack both classes into one frame, then shuffle the rows so the classes are
# interleaved. A fixed random_state makes the row order (and therefore every
# split and score computed from `dataset`) repeatable after a kernel restart.
dataset = pd.concat([good_embeds_df, bad_embeds_df], axis=0)
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Fixed positional hold-out: the first 200 shuffled rows for training, the rest for testing.
train, test = dataset.iloc[:200], dataset.iloc[200:]

In [9]:
def split_train_test_x_y(dataset, train_idx, test_idx, target_col=None):
    """Split ``dataset`` into train/test feature frames and label series.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns plus the label column.
    train_idx, test_idx : sequence of int
        Positional row indices (as used by ``DataFrame.iloc``) for each part.
    target_col : hashable, optional
        Name of the label column. Defaults to the notebook-level
        ``target_colname`` when omitted.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``.
    """
    if target_col is None:
        target_col = target_colname

    train = dataset.iloc[train_idx]
    test = dataset.iloc[test_idx]

    # Drop the label by name rather than assuming it is the last column, so the
    # features can never contain the target regardless of column order.
    train_x = train.drop(columns=[target_col])
    train_y = train[target_col]

    test_x = test.drop(columns=[target_col])
    test_y = test[target_col]
    return train_x, test_x, train_y, test_y

In [15]:
# LightGBM settings are identical for every fold, so build them once up front.
# (`param` is also reused by the final all-data fit further down.)
param = {'objective': 'binary', "verbosity": -1}

# 5-fold stratified CV: each fold keeps the good/bad ratio of the full dataset.
kfold = StratifiedKFold(5)
accs = []

target = dataset[target_colname]
for train_idx, test_idx in kfold.split(dataset, target):
    train_x, test_x, train_y, test_y = split_train_test_x_y(dataset, train_idx, test_idx)
    train_data = lgb.Dataset(train_x, label=train_y)

    model = lgb.train(param, train_data)

    # With the binary objective predict() yields scores in [0, 1];
    # scores of 0.5 and above are labelled 1, everything else 0.
    preds = (model.predict(test_x) >= 0.5).astype(int)

    # accuracy_score is documented as (y_true, y_pred).
    acc = accuracy_score(test_y, preds)
    print(f"Accuracy: {acc}")
    accs.append(acc)
print(np.mean(accs))

Accuracy: 0.8823529411764706
Accuracy: 0.7254901960784313
Accuracy: 0.78
Accuracy: 0.82
Accuracy: 0.72
0.7855686274509803


In [19]:
# Refit on every row: all columns except the last one are features,
# `target` (defined in the cross-validation cell) is the label.
all_data_train = lgb.Dataset(dataset.iloc[:, :-1], label=target)
all_data_model = lgb.train(param, train_set=all_data_train)

In [24]:
# Persist the all-data booster to a text model file in the working directory.
# save_model returns the Booster, which is why its repr is echoed as cell output.
all_data_model.save_model("nlp_scoring.txt")

<lightgbm.basic.Booster at 0x7f1ab62557c0>

In [25]:
# Reload the booster from the file written above to check that the saved model is usable.
bst = lgb.Booster(model_file="nlp_scoring.txt")
    

In [26]:
# Smoke test: score the first row with the reloaded model. `.iloc[[0]]` keeps the
# row as a one-row DataFrame (a documented predict() input type) instead of a
# 1-D Series, so the result is one score per input row.
bst.predict(dataset[dataset.columns[:-1]].iloc[[0]])



array([0.00230341])