In [1]:
import json

import os
from scipy.stats import spearmanr

import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier

In [2]:
# Seed NumPy's global random number generator once, before any data is loaded,
# so that later steps which draw from it without an explicit random_state
# (presumably the `.sample(frac=1)` shuffle and the bagging resampling — confirm)
# repeat from run to run.
np.random.seed(1337)

In [3]:
def _load_dialog_file(path, train=True):
    """Read one dialog JSON file and flatten it into a DataFrame indexed by dialogId."""
    user = {'Alice': 1, 'Bob': 2}

    df = pd.read_json(path).set_index("dialogId")

    # Keep a handle on the raw message dicts: the 'thread' column is overwritten below.
    messages = df["thread"]
    df['speaker'] = messages.apply(lambda msgs: [user[msg['userId']] for msg in msgs])
    df['thread'] = messages.apply(lambda msgs: [msg['text'] for msg in msgs])
    df['thread_raw'] = df['thread'].apply(lambda texts: " ".join(texts))

    if train:
        # Sort each record once so that position 0 / 1 refers to the same
        # participant in every dialog, then pick the fields out of it.
        evaluations = df["evaluation"].apply(lambda ev: sorted(ev, key=lambda e: e['userId']))
        participants = df["users"].apply(lambda us: sorted(us, key=lambda u: u['id']))
        df["qualA"] = evaluations.apply(lambda ev: ev[0]['quality'])
        df["qualB"] = evaluations.apply(lambda ev: ev[1]['quality'])
        df["botA"] = participants.apply(lambda us: us[0]['userType'] == 'Bot')
        df["botB"] = participants.apply(lambda us: us[1]['userType'] == 'Bot')

    consumed = ['users', 'evaluation'] if train else ['users']
    return df.drop(columns=consumed)


def process_data(path, exclude=None, train=True):
    """Load dialog data from a single JSON file or from every matching file in a directory.

    Parameters
    ----------
    path : str
        Either a JSON file, or a directory. In a directory, files whose names
        start with "train" (when ``train`` is truthy) or "test" (otherwise) are
        loaded, in sorted name order so the resulting row order is deterministic.
    exclude : container of str, optional
        File names (not paths) to skip in directory mode. Ignored for a single file.
    train : bool
        When truthy, the ``evaluation`` and ``users`` fields are read to produce
        the ``qualA``/``qualB`` and ``botA``/``botB`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``dialogId`` with columns ``speaker`` (1 for 'Alice', 2 for
        'Bob', one entry per message), ``thread`` (message texts) and
        ``thread_raw`` (texts joined by spaces), plus the four label columns in
        training mode. The ``users`` column (and ``evaluation`` in training
        mode) is dropped. An empty DataFrame is returned when a directory
        contains no matching file.

    Raises
    ------
    KeyError
        If a message's ``userId`` is neither 'Alice' nor 'Bob', or an expected
        field/column is missing from the file.
    """
    if not os.path.isdir(path):
        return _load_dialog_file(path, train=train)

    skipped = () if exclude is None else exclude
    prefix = "train" if train else "test"

    frames = []
    # os.listdir() order is arbitrary; sort it so repeated runs see the same rows in the same order.
    for file in sorted(os.listdir(path)):
        if file.startswith(prefix) and file not in skipped:
            print("{} loaded".format(file))
            frames.append(_load_dialog_file(os.path.join(path, file), train=train))

    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames)

In [4]:
def prepare(data):
    """Turn per-dialog feature columns into a dense matrix (and targets, if present).

    For every row the list-valued columns ``counts_A``, ``tfidf_A``,
    ``counts_B`` and ``tfidf_B`` are concatenated, in that order, into one
    feature vector; the vectors are stacked into a 2-D array.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the four feature columns above. If it also has a
        ``qualA`` column, that column is returned as the target.

    Returns
    -------
    (features, answers) when ``data`` has a ``qualA`` column, otherwise just
    ``features``. ``features`` has one row per row of ``data``.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is nothing to stack).
    KeyError
        If one of the four feature columns is missing.
    """
    feature_columns = ['counts_A', 'tfidf_A', 'counts_B', 'tfidf_B']

    if len(data) == 0:
        raise ValueError("prepare() needs at least one row to build a feature matrix")

    # Walk the four columns in lock-step; each step yields that row's four lists.
    features = np.stack([
        np.concatenate(parts)
        for parts in zip(*(data[column] for column in feature_columns))
    ])

    # Decide labelled vs. unlabelled from the schema, not from a caught exception.
    if "qualA" in data.columns:
        return (features, np.asarray(data["qualA"]))
    return features

In [5]:
def spearmancorr(est, X, y):
    """Scorer with the (estimator, X, y) signature: Spearman rank correlation
    between the true targets ``y`` and ``est.predict(X)``.

    Returns only the correlation coefficient; the p-value is discarded.
    """
    # Both inputs are shaped as single-column matrices so that, with axis=0,
    # spearmanr treats them as two variables and returns a scalar.
    observed = np.reshape(y, (-1, 1))
    predicted = np.reshape(est.predict(X), (-1, 1))
    rho, _pvalue = spearmanr(observed, predicted, axis=0)
    return rho

In [6]:
def filter_(person):
    """Build a row function that joins the messages written by ``person``.

    The returned callable takes a row with a ``thread`` sequence of texts and a
    parallel ``speaker`` sequence of speaker ids, and returns the texts whose
    speaker id equals ``person``, joined by single spaces.
    """
    def join_messages(row):
        texts = np.array(row['thread'])
        written_by_person = np.array(row['speaker']) == person
        return " ".join(texts[written_by_person])

    return join_messages

In [7]:
# Load every training file, shuffle the rows and discard the original index.
data = (
    process_data("../data/", train=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

# One text column per speaker (speaker id 1 -> A, 2 -> B).
for column, speaker_id in (("thread_A", 1), ("thread_B", 2)):
    data[column] = data.apply(filter_(speaker_id), axis=1)

# TF-IDF: vocabulary and weights are fitted on the whole conversation,
# then reused for each speaker's own text.
tfidf = TfidfVectorizer(strip_accents='unicode')
data["tfidf_all"] = tfidf.fit_transform(data["thread_raw"]).toarray().tolist()
for column, source in (("tfidf_A", "thread_A"), ("tfidf_B", "thread_B")):
    data[column] = tfidf.transform(data[source]).toarray().tolist()

# Raw term counts, built the same way.
count = CountVectorizer(strip_accents='unicode')
data["counts_all"] = count.fit_transform(data["thread_raw"]).toarray().tolist()
for column, source in (("counts_A", "thread_A"), ("counts_B", "thread_B")):
    data[column] = count.transform(data[source]).toarray().tolist()


X, y = prepare(data)

train_20170724.json loaded
train_20170725.json loaded
train_20170726.json loaded


In [9]:
# Bagged LightGBM classifier: 10 bagged models, each fitted on 80% of the samples.
clf = BaggingClassifier(
    LGBMClassifier(n_estimators=100),
    max_samples=0.8,
    n_estimators=10,
)

# 10-fold cross-validation, each fold scored by Spearman correlation.
cv = cross_val_score(clf, X, y, scoring=spearmancorr, cv=10, verbose=3)

# Mean and spread of the per-fold scores.
(cv.mean(), cv.std())

[CV]  ................................................................
[CV] ................................. , score=0.482771, total=  26.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.8s remaining:    0.0s


[CV] ................................. , score=0.497285, total=  26.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   53.8s remaining:    0.0s


[CV] ................................. , score=0.321042, total=  23.4s
[CV]  ................................................................
[CV] ................................. , score=0.389069, total=  27.7s
[CV]  ................................................................
[CV] ................................. , score=0.583113, total=  29.8s
[CV]  ................................................................
[CV] ................................. , score=0.342824, total=  27.3s
[CV]  ................................................................
[CV] ................................. , score=0.538057, total=  29.4s
[CV]  ................................................................
[CV] ................................. , score=0.472082, total=  24.8s
[CV]  ................................................................
[CV] ................................. , score=0.511733, total=  31.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.6min finished


(0.46226242658776728, 0.080378973613598484)

In [10]:
# Fit split: every training file except the one held out for evaluation.
train = process_data("../data", exclude=["train_20170726.json"], train=True)

# One text column per speaker (speaker id 1 -> A, 2 -> B).
for column, speaker_id in (("thread_A", 1), ("thread_B", 2)):
    train[column] = train.apply(filter_(speaker_id), axis=1)

# Vectorizers are fitted on this split only and reused for the held-out file.
tfidf = TfidfVectorizer(strip_accents='unicode')
train["tfidf_all"] = tfidf.fit_transform(train["thread_raw"]).toarray().tolist()
for column, source in (("tfidf_A", "thread_A"), ("tfidf_B", "thread_B")):
    train[column] = tfidf.transform(train[source]).toarray().tolist()

count = CountVectorizer(strip_accents='unicode')
train["counts_all"] = count.fit_transform(train["thread_raw"]).toarray().tolist()
for column, source in (("counts_A", "thread_A"), ("counts_B", "thread_B")):
    train[column] = count.transform(train[source]).toarray().tolist()

train_20170724.json loaded
train_20170725.json loaded


In [11]:
# Held-out split: the one labelled file excluded from the fit split.
test = process_data("../data/train_20170726.json", train=True)

# One text column per speaker (speaker id 1 -> A, 2 -> B).
for column, speaker_id in (("thread_A", 1), ("thread_B", 2)):
    test[column] = test.apply(filter_(speaker_id), axis=1)

# Only transform here: `tfidf` and `count` are the vectorizers already fitted
# on the fit split, so no vocabulary is learned from the held-out file.
for column, source in (
    ("tfidf_all", "thread_raw"),
    ("tfidf_A", "thread_A"),
    ("tfidf_B", "thread_B"),
):
    test[column] = tfidf.transform(test[source]).toarray().tolist()

for column, source in (
    ("counts_all", "thread_raw"),
    ("counts_A", "thread_A"),
    ("counts_B", "thread_B"),
):
    test[column] = count.transform(test[source]).toarray().tolist()


In [12]:
# Build the feature matrix and qualA targets for the fit split and for the held-out split.
# NOTE: this rebinds X and y, which until now held the full shuffled dataset
# used by the cross-validation cell.
X, y = prepare(train)
t, t_y = prepare(test)

In [13]:
# Refit the classifier on the fit split and score it on the held-out file.
# `clf` is not constructed in this cell: it is the estimator object created in
# the cross-validation cell. The commented-out line below is a disabled
# alternative configuration with 50 bagged models instead of 10.
#clf = BaggingClassifier(LGBMClassifier(n_estimators=100), max_samples=0.8, n_estimators=50)
clf.fit(X, y)

# Spearman correlation between held-out qualA and the predictions.
spearmancorr(clf, t, t_y)

0.47179145078532186