In [115]:
from scipy.stats import spearmanr

import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC 
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier

import itertools

In [2]:
np.random.seed(1337)

In [3]:
def process_data(path, exclude=[], train=True):
    user = {'Alice': 1,'Bob': 2}
    if os.path.isdir(path):
        data = pd.DataFrame()

        for file in os.listdir(path):
            if (file.startswith("train") and train or file.startswith("test") and not train) and file not in exclude:
                print("{} loaded".format(file))
                df = pd.read_json(os.path.join(path, file)).set_index("dialogId")
                df['speaker'] = df.thread.apply(lambda x: [user[msg['userId']] for msg in x])
                df['thread'] = df.thread.apply(lambda x: [msg['text'] for msg in x], convert_dtype=False)
                df['thread_raw'] = df.thread.apply(lambda x: " ".join(x))
                if train:
                    df["qualA"] = df.evaluation.apply(lambda x: sorted(x, key=lambda x: x['userId'])[0]['quality'])
                    df["qualB"] = df.evaluation.apply(lambda x: sorted(x, key=lambda x: x['userId'])[1]['quality'])
                    df["botA"] = df.users.apply(lambda x: sorted(x, key=lambda x: x['id'])[0]['userType'] == 'Bot')
                    df["botB"] = df.users.apply(lambda x: sorted(x, key=lambda x: x['id'])[1]['userType'] == 'Bot')
                df.drop(['users'], axis=1, inplace=True)
                if train:
                    df.drop(['evaluation'], axis=1, inplace=True)

                data = pd.concat([data, df])
    else:
        df = pd.read_json(path).set_index("dialogId")
        df['speaker'] = df.thread.apply(lambda x: [user[msg['userId']] for msg in x])
        df['thread'] = df.thread.apply(lambda x: [msg['text'] for msg in x], convert_dtype=False)
        df['thread_raw'] = df.thread.apply(lambda x: " ".join(x))
        if train:
            df["qualA"] = df.evaluation.apply(lambda x: sorted(x, key=lambda x: x['userId'])[0]['quality'])
            df["qualB"] = df.evaluation.apply(lambda x: sorted(x, key=lambda x: x['userId'])[1]['quality'])
            df["botA"] = df.users.apply(lambda x: sorted(x, key=lambda x: x['id'])[0]['userType'] == 'Bot')
            df["botB"] = df.users.apply(lambda x: sorted(x, key=lambda x: x['id'])[1]['userType'] == 'Bot')
        df.drop(['users'], axis=1, inplace=True)
        if train:
            df.drop(['evaluation'], axis=1, inplace=True)
            
        data = df
        
    return data

In [4]:
with open("/usr/share/dict/words") as wordfile:
    words = set(x.strip().lower() for x in wordfile.readlines())

In [107]:
def prepare(data):
    features = []
    answers = []
    
    for row in data.iterrows():
        f = [
            [max([0]+[len(list(x)) for x in (g for k, g in itertools.groupby(row[1]["speaker"]) if k == 1)])],
            [max([0]+[len(list(x)) for x in (g for k, g in itertools.groupby(row[1]["speaker"]) if k == 2)])],
            [min([0]+[len(list(x)) for x in (g for k, g in itertools.groupby(row[1]["speaker"]) if k == 1)])],
            [min([0]+[len(list(x)) for x in (g for k, g in itertools.groupby(row[1]["speaker"]) if k == 2)])],
            [sum(1 for word in row[1]["thread_A"].lower().split() if word not in words)],
            [sum(1 for word in row[1]["thread_B"].lower().split() if word not in words)],
            [sum(1 for word in row[1]["thread_A"].lower().split() if word in row[1]['context'].lower().split())],
            [sum(1 for word in row[1]["thread_B"].lower().split() if word in row[1]['context'].lower().split())],
            [sum(1 for word in row[1]["thread_A"].lower().split() if word not in words) / (1 + len(row[1]['thread_A'].lower().split()))],
            [sum(1 for word in row[1]["thread_B"].lower().split() if word not in words) / (1 + len(row[1]['thread_B'].lower().split()))],
            [sum(1 for word in row[1]["thread_A"].lower().split() if word in row[1]['context'].lower().split()) / (1 + len(row[1]['thread_A'].lower().split()))],
            [sum(1 for word in row[1]["thread_B"].lower().split() if word in row[1]['context'].lower().split()) / (1 + len(row[1]['thread_B'].lower().split()))],
            row[1]['counts_A'],
            row[1]['counts_B'],
            #row[1]['counts_context'],
            row[1]['counts_start_A'],
            row[1]['counts_start_B'],
            #row[1]['tfidf_A'],
            #row[1]['tfidf_B'],
            #row[1]['tfidf_context'],
            #row[1]['tfidf_start_A'],
            #row[1]['tfidf_start_B'],
        ]
        features.append(np.concatenate(f))
        
        try:
            answers.append(row[1]["qualB"])
            train = True
        except:
            train = False


    features = np.stack(features)
    
    if train:
        answers = np.stack(answers)
    
    if train:
        return (features, answers)
    else:
        return features

In [108]:
def spearmancorr(est,X,y):
    rho, pval = spearmanr(np.reshape(y, (-1, 1)), np.reshape(est.predict(X), (-1, 1)), axis=0)
    return rho

In [109]:
def filter_(person, first=False):
    def f(row, speaker):
        messages = np.array(row['thread'])[np.array(row['speaker']) == speaker]
        if first:
            messages = messages[:1]
        
        return " ".join(messages)

    return lambda x: f(x, person)

In [110]:
data = process_data("../data/", train=True).sample(frac=1).reset_index(drop=True)

data["thread_A"] = data.apply(filter_(1), axis=1)
data['thread_B'] = data.apply(filter_(2), axis=1)

data["start_A"] = data.apply(filter_(1, True), axis=1)
data["start_B"] = data.apply(filter_(2, True), axis=1)

tfidf_thread = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), max_features=2000)
data["tfidf_all"] = tfidf_thread.fit_transform(data["thread_raw"]).toarray().tolist()
data["tfidf_A"] = tfidf_thread.transform(data["thread_A"]).toarray().tolist()
data["tfidf_B"] = tfidf_thread.transform(data["thread_B"]).toarray().tolist()
data["tfidf_start_A"] = tfidf_thread.transform(data["start_A"]).toarray().tolist()
data["tfidf_start_B"] = tfidf_thread.transform(data["start_B"]).toarray().tolist()

tfidf_context = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), max_features=2000)
data["tfidf_context"] = tfidf_context.fit_transform(data["context"]).toarray().tolist()


count_thread = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), max_features=2000)
data["counts_all"] = count_thread.fit_transform(data["thread_raw"]).toarray().tolist()
data["counts_A"] = count_thread.transform(data["thread_A"]).toarray().tolist()
data["counts_B"] = count_thread.transform(data["thread_B"]).toarray().tolist()
data["counts_start_A"] = count_thread.transform(data["start_A"]).toarray().tolist()
data["counts_start_B"] = count_thread.transform(data["start_B"]).toarray().tolist()

count_context = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), max_features=2000)
data["counts_context"] = count_context.fit_transform(data["context"]).toarray().tolist()

train_20170724.json loaded
train_20170725.json loaded
train_20170726.json loaded


In [111]:
X, y = prepare(data)

In [116]:
clf = VotingClassifier([('gbm', GradientBoostingClassifier()), ('svc', LinearSVC(tol=0.1)), ('lgbm', LGBMClassifier(n_estimators=100, num_leaves=1000))])

#gcv = GridSearchCV(clf, {"n_neighbors":[1, 5, 10, 15, 25, 40, 50, 75, 100, 150, 250]}, scoring=spearmancorr, verbose=3)
#gcv.fit(X, y)

cv = cross_val_score(LGBMRegressor(n_estimators=100, num_leaves=1000), X, y, cv=KFold(10,shuffle=True,random_state=123), verbose=3, scoring=spearmancorr)
cv.mean(), cv.std()

[CV]  ................................................................
[CV] ................................. , score=0.552220, total=   1.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ................................. , score=0.634135, total=   1.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV] ................................. , score=0.641359, total=   1.3s
[CV]  ................................................................
[CV] ................................. , score=0.632248, total=   1.4s
[CV]  ................................................................
[CV] ................................. , score=0.529679, total=   1.4s
[CV]  ................................................................
[CV] ................................. , score=0.591046, total=   1.3s
[CV]  ................................................................
[CV] ................................. , score=0.638460, total=   1.3s
[CV]  ................................................................
[CV] ................................. , score=0.691395, total=   1.3s
[CV]  ................................................................
[CV] ................................. , score=0.476927, total=   1.3s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   14.2s finished


(0.59735832417276413, 0.060270969812778144)

In [None]:
cv = cross_val_score(LGBMRegressor(n_estimators=100, num_leaves=1000), X, y, cv=10, verbose=3, scoring=spearmancorr)
cv.mean(), cv.std()

#(0.59851429398955014, 0.060098095791398666)