In [52]:
from scipy.stats import spearmanr

import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC 
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import string
from nltk.tokenize import TreebankWordTokenizer

import itertools
import stop_words

np.random.seed(1337)

In [2]:
with open("/usr/share/dict/words") as wordfile:
    words = set(x.strip().lower() for x in wordfile.readlines())

In [3]:
def spearmancorr(est,X,y):
    rho, pval = spearmanr(np.reshape(y, (-1, 1)), np.reshape(est.predict(X), (-1, 1)), axis=0)
    return rho

In [66]:
def process_data(path, exclude=[], train=True, vectorizers=None):
    def process_file(path):
        user = {'Alice': "A",'Bob': "B"}
        
        df = pd.read_json(path).set_index("dialogId")
        df['speaker'] = df["thread"].apply(lambda x: [user[msg['userId']] for msg in x])
        df['thread'] = df["thread"].apply(lambda x: [msg['text'] for msg in x], convert_dtype=False)
        df['thread_raw'] = df["thread"].apply(lambda x: " ".join(x))
        if train:
            df["qualA"] = df["evaluation"].apply(lambda x: sorted(x, key=lambda x: x['userId'])[0]['quality'])
            df["qualB"] = df["evaluation"].apply(lambda x: sorted(x, key=lambda x: x['userId'])[1]['quality'])
            df["botA"] = df["users"].apply(lambda x: sorted(x, key=lambda x: x['id'])[0]['userType'] == 'Bot')
            df["botB"] = df["users"].apply(lambda x: sorted(x, key=lambda x: x['id'])[1]['userType'] == 'Bot')
        df.drop(['users'], axis=1, inplace=True)
        if train:
            df.drop(['evaluation'], axis=1, inplace=True)
            
        return df
    
    def add_features(data, vectorizers):
        def preprocess(text, lower, punctuation, stops):
            if lower:
                text = text.lower()
            if punctuation == "exclude":
                text = text.translate(str.maketrans({p: None for p in string.punctuation}))
            elif punctuation == "separate":
                text = text.translate(str.maketrans({p: " {} ".format(p) for p in string.punctuation}))
            if stops:
                stopwords = set(stop_words.get_stop_words("english"))
                punct = str.maketrans({p: None for p in string.punctuation})
                text = " ".join(word for word in text.split(" ") if word.lower().translate(punct) not in stopwords)
                
            return text

        punct_modes = ["exclude", "separate", "leave"]
        lower_modes = [True, False]
        stops_modes = [True, False]
        
        new_vectorizers = {}
        
        for preproc_mode in itertools.product(lower_modes, punct_modes, stops_modes):            
            preproc = lambda text: preprocess(text, *preproc_mode)
            
            if vectorizers:
                tfidf_thread, tfidf_context, count_thread, count_context = vectorizers["{}_{}_{}".format(*preproc_mode)]
            else:
                tfidf_thread = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=4000, tokenizer=TreebankWordTokenizer().tokenize)
                tfidf_thread.fit(data["thread_raw"])
                tfidf_context = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=4000, tokenizer=TreebankWordTokenizer().tokenize)
                tfidf_context.fit(data["context"])
                count_thread = CountVectorizer(analyzer='word', ngram_range=(1, 2), max_features=4000, tokenizer=TreebankWordTokenizer().tokenize)
                count_thread.fit(data["thread_raw"])
                count_context = CountVectorizer(analyzer='word', ngram_range=(1, 2), max_features=4000, tokenizer=TreebankWordTokenizer().tokenize)
                count_context.fit(data["context"])
                
                new_vectorizers["{}_{}_{}".format(*preproc_mode)] = (tfidf_thread, tfidf_context, count_thread, count_context)
                
            def get_speaker(speaker):
                return lambda row: [preproc(x) for x in np.array(row['thread'])[np.array(row['speaker']) == speaker]]

            data["thread_split_A_{}_{}_{}".format(*preproc_mode)] = data.apply(get_speaker("A"), axis=1)
            data['thread_split_B_{}_{}_{}'.format(*preproc_mode)] = data.apply(get_speaker("B"), axis=1)
            
            def join_speaker(speaker):
                return lambda row: " ".join(row["thread_split_{}_{}_{}_{}".format(speaker, *preproc_mode)])
            
            data["thread_joined_A_{}_{}_{}".format(*preproc_mode)] = data.apply(join_speaker("A"), axis=1)
            data["thread_joined_B_{}_{}_{}".format(*preproc_mode)] = data.apply(join_speaker("B"), axis=1)

            def get_first(speaker):
                return lambda row: " ".join(row["thread_split_{}_{}_{}_{}".format(speaker, *preproc_mode)])
            
            data["start_A_{}_{}_{}".format(*preproc_mode)] = data.apply(get_first("A"), axis=1)
            data["start_B_{}_{}_{}".format(*preproc_mode)] = data.apply(get_first("B"), axis=1)
            
            data["tfidf_all_{}_{}_{}".format(*preproc_mode)] = tfidf_thread.transform(data["thread_raw"]).toarray().tolist()
            data["tfidf_A_{}_{}_{}".format(*preproc_mode)] = tfidf_thread.transform(data["thread_joined_A_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["tfidf_B_{}_{}_{}".format(*preproc_mode)] = tfidf_thread.transform(data["thread_joined_B_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["tfidf_start_A_{}_{}_{}".format(*preproc_mode)] = tfidf_thread.transform(data["start_A_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["tfidf_start_B_{}_{}_{}".format(*preproc_mode)] = tfidf_thread.transform(data["start_B_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()

            data["tfidf_context_{}_{}_{}".format(*preproc_mode)] = tfidf_context.transform(data["context"]).toarray().tolist()

            data["counts_all_{}_{}_{}".format(*preproc_mode)] = count_thread.transform(data["thread_raw"]).toarray().tolist()
            data["counts_A_{}_{}_{}".format(*preproc_mode)] = count_thread.transform(data["thread_joined_A_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["counts_B_{}_{}_{}".format(*preproc_mode)] = count_thread.transform(data["thread_joined_B_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["counts_start_A_{}_{}_{}".format(*preproc_mode)] = count_thread.transform(data["start_A_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()
            data["counts_start_B_{}_{}_{}".format(*preproc_mode)] = count_thread.transform(data["start_B_{}_{}_{}".format(*preproc_mode)]).toarray().tolist()

            data["counts_context_{}_{}_{}".format(*preproc_mode)] = count_context.transform(data["context"]).toarray().tolist()
            
            def run_len(target, func):
                return lambda row: [func((len(list(g)) for person, g in itertools.groupby(row["speaker"]) if person == target), default=0)]

            data["f_max_run_A_{}_{}_{}".format(*preproc_mode)] = data.apply(run_len("A", max), axis=1)
            data["f_max_run_B_{}_{}_{}".format(*preproc_mode)] = data.apply(run_len("B", max), axis=1)
            data["f_min_run_A_{}_{}_{}".format(*preproc_mode)] = data.apply(run_len("A", min), axis=1)
            data["f_min_run_B_{}_{}_{}".format(*preproc_mode)] = data.apply(run_len("B", min), axis=1)

            def typo_count(target):
                return lambda row: [sum(1 for word in preproc(row["thread_joined_{}_{}_{}_{}".format(target, *preproc_mode)]) if word not in words)]

            data["f_typos_A_{}_{}_{}".format(*preproc_mode)] = data.apply(typo_count("A"), axis=1)
            data["f_typos_B_{}_{}_{}".format(*preproc_mode)] = data.apply(typo_count("B"), axis=1)
            data["f_typos_frac_A_{}_{}_{}".format(*preproc_mode)] = data.apply(lambda row: [row["f_typos_A_{}_{}_{}".format(*preproc_mode)][0] / (1 + len(preproc(row["thread_joined_A_{}_{}_{}".format(*preproc_mode)]).split()))], axis=1)
            data["f_typos_frac_B_{}_{}_{}".format(*preproc_mode)] = data.apply(lambda row: [row["f_typos_B_{}_{}_{}".format(*preproc_mode)][0] / (1 + len(preproc(row["thread_joined_B_{}_{}_{}".format(*preproc_mode)]).split()))], axis=1)

            
            def relevant_words(target):
                return lambda row: [sum(1 for word in preproc(row["thread_joined_{}_{}_{}_{}".format(target, *preproc_mode)]) if word in preproc(row['context']))]
        
            data["f_relevant_A_{}_{}_{}".format(*preproc_mode)] = data.apply(relevant_words("A"), axis=1)
            data["f_relevant_B_{}_{}_{}".format(*preproc_mode)] = data.apply(relevant_words("B"), axis=1)
            data["f_relevant_frac_A_{}_{}_{}".format(*preproc_mode)] = data.apply(lambda row: [row["f_relevant_A_{}_{}_{}".format(*preproc_mode)][0] / (1 + len(preproc(row["thread_joined_A_{}_{}_{}".format(*preproc_mode)]).split()))], axis=1)
            data["f_relevant_frac_B_{}_{}_{}".format(*preproc_mode)] = data.apply(lambda row: [row["f_relevant_B_{}_{}_{}".format(*preproc_mode)][0] / (1 + len(preproc(row["thread_joined_B_{}_{}_{}".format(*preproc_mode)]).split()))], axis=1)

            def unanswered_messages(target):
                return lambda row: [sum(len(list(g)) == 1 for person, g in itertools.groupby(row["speaker"]) if person == target)]

            data["f_unanswered_A_{}_{}_{}".format(*preproc_mode)] = data.apply(unanswered_messages("B"), axis=1)
            data["f_unanswered_B_{}_{}_{}".format(*preproc_mode)] = data.apply(unanswered_messages("A"), axis=1)

        if not vectorizers:
            vectorizers = new_vectorizers
            
        return data, vectorizers
            
    if os.path.isdir(path):
        data = pd.concat(
            [
                process_file(os.path.join(path, file))
                for file in os.listdir(path)
                if (
                    file.startswith("train") and train or
                    file.startswith("test") and not train
                ) and file not in exclude
            ]
        )
    else:            
        data = process_file(path)
        
    data, vectorizers = add_features(data, vectorizers)
    
    return data, vectorizers

In [67]:
data, vectorizers = process_data("../data/", train=True)



In [70]:
feat_templates = [
    #'counts_all_{}_{}_{}',
    'counts_A_{}_{}_{}',
    'counts_B_{}_{}_{}',
    'counts_start_A_{}_{}_{}',
    'counts_start_B_{}_{}_{}',
    #'counts_context_{}_{}_{}',
    #'tfidf_all_{}_{}_{}',
    #'tfidf_A_{}_{}_{}',
    #'tfidf_B_{}_{}_{}',
    #'tfidf_start_A_{}_{}_{}',
    #'tfidf_start_B_{}_{}_{}',
    #'tfidf_context_{}_{}_{}',
    'f_max_run_A_{}_{}_{}',
    'f_max_run_B_{}_{}_{}',
    'f_min_run_A_{}_{}_{}',
    'f_min_run_B_{}_{}_{}',
    'f_typos_A_{}_{}_{}',
    'f_typos_B_{}_{}_{}',
    'f_typos_frac_A_{}_{}_{}',
    'f_typos_frac_B_{}_{}_{}',
    'f_relevant_A_{}_{}_{}',
    'f_relevant_B_{}_{}_{}',
    'f_relevant_frac_A_{}_{}_{}',
    'f_relevant_frac_B_{}_{}_{}',
]

features = []

punct_modes = ["exclude", "separate", "leave"]
lower_modes = [True, False]
stops_modes = [False, True]

for template, *preproc_mode in itertools.product(feat_templates, lower_modes, punct_modes, stops_modes):            
    features.append(template.format(*preproc_mode))

X =((data[features]).values)
X = np.stack([np.concatenate(X[i]) for i in range(X.shape[0])])

y_A = data["qualA"].values
y_B = data["qualB"].values

In [71]:
cv = cross_val_score(LGBMRegressor(n_estimators=100, num_leaves=1000), X, y_B, scoring=spearmancorr, cv=KFold(10, True, 123), verbose=3)
cv.mean(), cv.std()

[CV]  ................................................................
[CV] ................................. , score=0.689501, total=   9.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.5s remaining:    0.0s


[CV]  ................................................................
[CV] ................................. , score=0.704769, total=   8.4s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.1s remaining:    0.0s


[CV]  ................................................................
[CV] ................................. , score=0.680559, total=   7.6s
[CV]  ................................................................
[CV] ................................. , score=0.690844, total=   7.8s
[CV]  ................................................................
[CV] ................................. , score=0.529056, total=   8.6s
[CV]  ................................................................
[CV] ................................. , score=0.608166, total=   7.8s
[CV]  ................................................................
[CV] ................................. , score=0.628400, total=   8.0s
[CV]  ................................................................
[CV] ................................. , score=0.624265, total=   8.3s
[CV]  ................................................................
[CV] ................................. , score=0.762232, total=   7.9s
[CV]  

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.5min finished


(0.6590293383150746, 0.060790919738737528)

In [33]:
clf_A = LGBMRegressor(n_estimators=100, num_leaves=1000)
clf_A.fit(X, y_A)
clf_B = LGBMRegressor(n_estimators=100, num_leaves=1000)
clf_B.fit(X, y_B)

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
       fair_c=1.0, gaussian_eta=1.0, huber_delta=1.0, learning_rate=0.1,
       max_bin=255, max_depth=-1, max_drop=50, min_child_samples=10,
       min_child_weight=5, min_split_gain=0, n_estimators=100, nthread=-1,
       num_leaves=1000, objective='regression', poisson_max_delta_step=0.7,
       reg_alpha=0, reg_lambda=0, seed=0, silent=True, skip_drop=0.5,
       subsample=1, subsample_for_bin=50000, subsample_freq=1,
       uniform_drop=False, xgboost_dart_mode=False)

In [34]:
test, *_ = process_data("../data/test_20170727.json", train=False, vectorizers=vectorizers)



In [35]:
T =((test[features]).values)
T = np.stack([np.concatenate(T[i]) for i in range(T.shape[0])])

In [36]:
pred_A = clf_A.predict(T)
pred_B = clf_B.predict(T)

In [37]:
pd.DataFrame(np.stack([pred_A, pred_B]).T, index=test.index, columns=["Alice", "Bob"]).to_csv("pred.csv")

In [50]:
test[["context", "thread"]].values

array([[ 'In 1938 the Nazis altered about one-third of the toponyms of the area, eliminating, Germanizing, or simplifying a number of Old Prussian names, as well as those Polish or Lithuanian names originating from colonists and refugees to Prussia during and after the Protestant Reformation. More than 1,500 places were ordered to be renamed by 16 July 1938 following a decree issued by Gauleiter and Oberpräsident Erich Koch and initiated by Adolf Hitler. Many who would not cooperate with the rulers of Nazi Germany were sent to concentration camps and held prisoner there until their death or liberation.',
        list(['Hello, my friend. Hope this chat will go well. I will ask you a question in a second, please wait', 'What was the number of prussian names in 1938?', '100?', 'You can do better. Hint: first 3 answer letters is "old".', 'old', 'Still incorrect :( Lets speak about something else...', 'else?', 'No.', 'no?', 'No.', 'no?', 'No.', 'no?', 'No.', 'no?', 'No.', 'no?', 'No.', 'no?