In [1]:
from scipy.stats import spearmanr

import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC 
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import string

import itertools

# Seed NumPy's global random number generator so NumPy-driven randomness in
# later cells is repeatable from one run to the next.
np.random.seed(1337)

In [2]:
# Vocabulary used by the typo features: every line of the system word list,
# stripped of surrounding whitespace and lower-cased, collected into a set.
with open("/usr/share/dict/words") as wordfile:
    words = {entry.strip().lower() for entry in wordfile}

In [3]:
def spearmancorr(est,X,y):
    """Return the Spearman rank correlation between ``y`` and ``est.predict(X)``.

    The signature ``(estimator, X, y)`` matches what scikit-learn expects of a
    scoring callable. Both vectors are reshaped into single columns before being
    passed to ``scipy.stats.spearmanr``; the p-value is discarded.
    """
    truth_column = np.reshape(y, (-1, 1))
    predicted_column = np.reshape(est.predict(X), (-1, 1))
    coefficient, _p_value = spearmanr(truth_column, predicted_column, axis=0)
    return coefficient

In [6]:
def process_data(path, exclude=(), train=True, vectorizers=(None,) * 4):
    """Load dialogue JSON file(s) and derive per-speaker text features.

    Parameters
    ----------
    path : str
        Either a single JSON file or a directory. For a directory, every file
        whose name starts with ``"train"`` (when ``train`` is true) or
        ``"test"`` (otherwise) is loaded, in sorted name order, and the frames
        are concatenated.
    exclude : iterable of str
        File names to skip when ``path`` is a directory.
    train : bool
        When true, the ``evaluation`` and ``users`` fields are turned into the
        ``qualA``/``qualB`` and ``botA``/``botB`` columns.
    vectorizers : sequence of length 4
        ``(tfidf_thread, tfidf_context, count_thread, count_context)``. Any
        entry that is ``None`` is created and fitted on the loaded data; pass
        the vectorizers returned by an earlier call to reuse their vocabulary.

    Returns
    -------
    (data, vectorizers)
        ``data`` is the feature-augmented DataFrame indexed by ``dialogId``;
        ``vectorizers`` is a list of the four vectorizers in the order above.

    Notes
    -----
    Reads the module-level ``words`` set for the typo features.
    """
    def process_file(path):
        """Read one JSON file into a frame indexed by ``dialogId``."""
        user = {'Alice': "A", 'Bob': "B"}

        df = pd.read_json(path).set_index("dialogId")
        df['speaker'] = df["thread"].apply(lambda x: [user[msg['userId']] for msg in x])
        # astype(object).apply(...) is the documented replacement for the
        # deprecated apply(..., convert_dtype=False).
        df['thread'] = df["thread"].astype(object).apply(lambda x: [msg['text'] for msg in x])
        df['thread_raw'] = df["thread"].apply(lambda x: " ".join(x))

        dropped = ['users']
        if train:
            # Sort once per row; position 0 / 1 after sorting by id is used as
            # speaker A / B (presumably the ids are 'Alice' and 'Bob' - verify
            # against the data).
            evaluations = df["evaluation"].apply(lambda x: sorted(x, key=lambda e: e['userId']))
            participants = df["users"].apply(lambda x: sorted(x, key=lambda u: u['id']))
            df["qualA"] = evaluations.apply(lambda e: e[0]['quality'])
            df["qualB"] = evaluations.apply(lambda e: e[1]['quality'])
            df["botA"] = participants.apply(lambda u: u[0]['userType'] == 'Bot')
            df["botB"] = participants.apply(lambda u: u[1]['userType'] == 'Bot')
            dropped.append('evaluation')

        return df.drop(dropped, axis=1)

    def add_features(data, tfidf_thread=None, tfidf_context=None, count_thread=None, count_context=None):
        """Add every derived column to ``data`` for each preprocessing mode.

        Columns are suffixed ``_<lower>_<punctuation>`` for each combination of
        lower-casing (True/False) and punctuation handling
        (exclude/separate/leave). Returns ``data`` followed by the four
        vectorizers, fitted here if they were passed as ``None``.
        """
        # The translation tables do not depend on the mode, so build them once.
        drop_punct = str.maketrans({p: None for p in string.punctuation})
        pad_punct = str.maketrans({p: " {} ".format(p) for p in string.punctuation})

        def preprocess(text, lower, punctuation):
            """Optionally lower-case ``text`` and strip or space-pad punctuation."""
            if lower:
                text = text.lower()
            if punctuation == "exclude":
                text = text.translate(drop_punct)
            elif punctuation == "separate":
                text = text.translate(pad_punct)

            return text

        def ensure_fitted(vectorizer, factory, corpus):
            """Return ``vectorizer``, creating and fitting one on ``corpus`` if it is None."""
            if vectorizer is None:
                vectorizer = factory(analyzer='word', ngram_range=(1, 2), max_features=4000)
                vectorizer.fit(corpus)
            return vectorizer

        def dense_rows(vectorizer, texts):
            """Transform ``texts`` and return one dense list of values per row."""
            return vectorizer.transform(texts).toarray().tolist()

        def run_len(target, func):
            """Build a function reducing ``target``'s consecutive-turn run lengths with ``func``."""
            return lambda speakers: [func((len(list(g)) for person, g in itertools.groupby(speakers) if person == target), default=0)]

        def typo_count(text):
            """Count whitespace-separated tokens of ``text`` absent from ``words``."""
            return [sum(1 for word in text.split() if word not in words)]

        def per_word(count_col, text_col):
            """Build a row function dividing a count feature by (1 + number of words)."""
            return lambda row: [row[count_col][0] / (1 + len(row[text_col].split()))]

        # Fitting depends only on the raw columns, never on the preprocessing
        # mode, so it is done once up front rather than re-tested every loop.
        tfidf_thread = ensure_fitted(tfidf_thread, TfidfVectorizer, data["thread_raw"])
        tfidf_context = ensure_fitted(tfidf_context, TfidfVectorizer, data["context"])
        count_thread = ensure_fitted(count_thread, CountVectorizer, data["thread_raw"])
        count_context = ensure_fitted(count_context, CountVectorizer, data["context"])

        punct_modes = ["exclude", "separate", "leave"]
        lower_modes = [True, False]
        speakers = ("A", "B")

        for preproc_mode in itertools.product(lower_modes, punct_modes):
            # Default arguments pin the current mode to these helpers.
            def preproc(text, _mode=preproc_mode):
                return preprocess(text, *_mode)

            def col(stem, _mode=preproc_mode):
                return "{}_{}_{}".format(stem, *_mode)

            def spoken_by(target):
                # Pair each message with its speaker directly; this also copes
                # with empty threads, unlike NumPy boolean masking.
                return lambda row: [preproc(msg) for msg, who in zip(row['thread'], row['speaker']) if who == target]

            def relevant_words(text_col):
                def count(row):
                    # Compare whole words against the context's word set.
                    context_vocab = set(preproc(row['context']).split())
                    return [sum(1 for word in row[text_col].split() if word in context_vocab)]
                return count

            for who in speakers:
                data[col("thread_split_" + who)] = data.apply(spoken_by(who), axis=1)
            for who in speakers:
                data[col("thread_joined_" + who)] = data[col("thread_split_" + who)].apply(lambda msgs: " ".join(msgs))
            for who in speakers:
                # First message only; empty string when the speaker never spoke.
                data[col("start_" + who)] = data[col("thread_split_" + who)].apply(lambda msgs: msgs[0] if msgs else "")

            for prefix, thread_vec, context_vec in (
                ("tfidf", tfidf_thread, tfidf_context),
                ("counts", count_thread, count_context),
            ):
                data[col(prefix + "_all")] = dense_rows(thread_vec, data["thread_raw"])
                for who in speakers:
                    data[col(prefix + "_" + who)] = dense_rows(thread_vec, data[col("thread_joined_" + who)])
                for who in speakers:
                    data[col(prefix + "_start_" + who)] = dense_rows(thread_vec, data[col("start_" + who)])
                data[col(prefix + "_context")] = dense_rows(context_vec, data["context"])

            for label, func in (("max", max), ("min", min)):
                for who in speakers:
                    data[col("f_{}_run_{}".format(label, who))] = data["speaker"].apply(run_len(who, func))

            # The joined text is already preprocessed, so it is only tokenised here.
            for who in speakers:
                data[col("f_typos_" + who)] = data[col("thread_joined_" + who)].apply(typo_count)
            for who in speakers:
                data[col("f_typos_frac_" + who)] = data.apply(per_word(col("f_typos_" + who), col("thread_joined_" + who)), axis=1)

            for who in speakers:
                data[col("f_relevant_" + who)] = data.apply(relevant_words(col("thread_joined_" + who)), axis=1)
            for who in speakers:
                data[col("f_relevant_frac_" + who)] = data.apply(per_word(col("f_relevant_" + who), col("thread_joined_" + who)), axis=1)

        return data, tfidf_thread, tfidf_context, count_thread, count_context

    if os.path.isdir(path):
        wanted_prefix = "train" if train else "test"
        data = pd.concat(
            [
                process_file(os.path.join(path, file))
                # os.listdir order is arbitrary; sorting keeps row order stable.
                for file in sorted(os.listdir(path))
                if file.startswith(wanted_prefix) and file not in exclude
            ]
        )
    else:
        data = process_file(path)

    data, *vectorizers = add_features(data, *vectorizers)

    return data, vectorizers

In [30]:
# Build the training frame. When given a directory, process_data loads the
# files whose names start with "train". The returned vectorizers are kept so
# the test set can later be transformed with the same fitted vocabulary.
data, vectorizers = process_data("../data/", train=True)



In [31]:
# Column-name templates for the model inputs; the two placeholders receive the
# lower-casing flag and the punctuation mode, in that order.
feat_templates = [
    # bag-of-words counts
    "counts_all_{}_{}",
    "counts_A_{}_{}",
    "counts_B_{}_{}",
    "counts_start_A_{}_{}",
    "counts_start_B_{}_{}",
    "counts_context_{}_{}",
    # longest / shortest run of consecutive turns per speaker
    "f_max_run_A_{}_{}",
    "f_max_run_B_{}_{}",
    "f_min_run_A_{}_{}",
    "f_min_run_B_{}_{}",
    # typo counts and per-word fractions
    "f_typos_A_{}_{}",
    "f_typos_B_{}_{}",
    "f_typos_frac_A_{}_{}",
    "f_typos_frac_B_{}_{}",
    # context-overlap counts and per-word fractions
    "f_relevant_A_{}_{}",
    "f_relevant_B_{}_{}",
    "f_relevant_frac_A_{}_{}",
    "f_relevant_frac_B_{}_{}",
]

punct_modes = ["exclude", "separate", "leave"]
lower_modes = [True, False]

# Expand each template over every (lower, punctuation) combination, with the
# template varying slowest and the punctuation mode fastest.
features = []
for template in feat_templates:
    for preproc_mode in itertools.product(lower_modes, punct_modes):
        features.append(template.format(*preproc_mode))

In [32]:
# Each selected cell holds a list; concatenating a row's lists end-to-end gives
# that dialogue's flat feature vector, and the vectors are stacked into a matrix.
X = np.stack([np.concatenate(cells) for cells in data[features].values])

# Targets: the quality score recorded for each of the two speakers.
y_A, y_B = data["qualA"].values, data["qualB"].values

In [33]:
# One independent gradient-boosted regressor per speaker, with identical
# hyper-parameters; both are fitted on the same feature matrix.
clf_A = LGBMRegressor(num_leaves=1000, n_estimators=100)
clf_B = LGBMRegressor(num_leaves=1000, n_estimators=100)
clf_A.fit(X, y_A)
clf_B.fit(X, y_B)

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
       fair_c=1.0, gaussian_eta=1.0, huber_delta=1.0, learning_rate=0.1,
       max_bin=255, max_depth=-1, max_drop=50, min_child_samples=10,
       min_child_weight=5, min_split_gain=0, n_estimators=100, nthread=-1,
       num_leaves=1000, objective='regression', poisson_max_delta_step=0.7,
       reg_alpha=0, reg_lambda=0, seed=0, silent=True, skip_drop=0.5,
       subsample=1, subsample_for_bin=50000, subsample_freq=1,
       uniform_drop=False, xgboost_dart_mode=False)

In [34]:
# Featurise the test file, passing in the vectorizers returned by the training
# call so they are reused rather than refitted. process_data returns
# (frame, vectorizers); only the frame is kept and the rest is collected into `_`.
test, *_ = process_data("../data/test_20170727.json", train=False, vectorizers=vectorizers)



In [35]:
# Flatten the test rows into a matrix by concatenating each row's per-column
# lists, taking the columns in the order given by `features`.
T = np.stack([np.concatenate(cells) for cells in test[features].values])

In [36]:
# Score the test matrix with each speaker's model, A first and then B.
pred_A, pred_B = (model.predict(T) for model in (clf_A, clf_B))

In [37]:
# Write the two prediction vectors side by side, one row per test dialogue,
# keyed by the test frame's index.
pd.DataFrame(
    np.stack([pred_A, pred_B], axis=1),
    index=test.index,
    columns=["Alice", "Bob"],
).to_csv("pred.csv")