In [None]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as ssd

In [None]:
import sys, os
from datawand.parametrization import ParamHelper
# Parameter helper used by the cells below to look up run settings via ph.get(...).
# NOTE(review): presumably "../../" is the project root and "TrendApproximation"
# the pipeline/component name -- confirm against the datawand documentation.
ph = ParamHelper("../../", "TrendApproximation", sys.argv)

# 1. Store co-occurring words in dicts

In [None]:
# Run settings looked up through the parameter helper.
experiment_dir = ph.get("experiment_dir")
# Hour values used later to build the "<date>T<HH>:00" snapshot ids of the queries.
time_hour_vals = ph.get("time_hour_vals")
# File with one keyword collection per date (read in the keyword-loading cell).
keywords_for_eval_path = ph.get("keywords_for_eval_path")
# Folder the distance files are exported to.
output_dir = ph.get("distance_root_folder")
# Co-occurrence table located inside the experiment directory.
co_occur_table_file_path = "%s/occ_table.csv" % experiment_dir

In [None]:
# Create the export folder (including missing parent folders) if it is not there yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [None]:
# Load the co-occurrence table; its fields are separated by "|".
co_occur_df = pd.read_csv(co_occur_table_file_path, sep="|")

### Co-occurring words are in these columns

In [None]:
# Odd column ids 1, 3, ..., 199. get_word_co_occurance_matrix reads the word from
# column str(i) and its count from column str(i+1).
word_cols = range(1,200,2)

In [None]:
# Number of rows in the loaded co-occurrence table.
len(co_occur_df)

In [None]:
# Preview the first rows of the table.
co_occur_df.head()

# 2. Calculating Jaccard and cosine distance

In [None]:
import multiprocessing, functools

def get_word_co_occurance_matrix(df, snapshot_id, word_col_ids=None):
    """Build the keyword x co-occurring-word count matrix of one snapshot.

    Parameters
    ----------
    df : pandas.DataFrame
        Co-occurrence table with the columns "key_word" and "start_time" plus
        numbered columns named by strings: column str(i) holds a co-occurring
        word and column str(i+1) holds its count.
    snapshot_id : object
        Value of "start_time" that selects the rows of the snapshot.
    word_col_ids : iterable of int, optional
        Ids ``i`` of the word columns to scan, in order. Defaults to the
        notebook-level ``word_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per keyword that has at least one co-occurring word, one column
        per co-occurring word, counts as values and 0.0 where a word does not
        co-occur with the keyword. Empty when the snapshot has no usable rows.
    """
    if word_col_ids is None:
        # Fall back to the notebook-level column id range.
        word_col_ids = word_cols
    # Filter the table that was passed in rather than a notebook-level global,
    # so the function works for any table handed to it.
    snapshot_df = df[df["start_time"] == snapshot_id]
    dict_for_snapshot = dict()
    for _, row in snapshot_df.iterrows():
        key = row["key_word"]
        co_occ_dict = dict()
        for idx in word_col_ids:
            word = row[str(idx)]
            if pd.isnull(word):
                # The first empty word column ends the list of this row.
                break
            if word == key:
                # The keyword itself is not a co-occurring word.
                continue
            co_occ_dict[word] = row[str(idx + 1)]
        if len(co_occ_dict) > 0:
            dict_for_snapshot[key] = co_occ_dict
    # Outer dict keys become columns, hence the transpose to get one row per keyword.
    repr_df = pd.DataFrame(dict_for_snapshot).T
    return repr_df.fillna(0.0)

def get_distance(repr_df, dist_type, w1, w2, verbose=False):
    """Compute the distance between the rows of two words.

    Parameters
    ----------
    repr_df : pandas.DataFrame
        Matrix indexed by word; each row is the count vector of that word.
    dist_type : str
        "jaccard" (computed on the ``> 0`` indicator vectors) or "cosine"
        (computed on the raw count vectors).
    w1, w2 : object
        Index labels of the two rows to compare.
    verbose : bool
        When True, print the label that was not found in ``repr_df``.

    Returns
    -------
    tuple
        ``(w1, w2, dist)``; ``dist`` is None when either label is missing
        from ``repr_df``.

    Raises
    ------
    RuntimeError
        If ``dist_type`` is neither "jaccard" nor "cosine".
    """
    if dist_type not in ("jaccard", "cosine"):
        raise RuntimeError("Invalid distance type!")
    # Only the row lookup is guarded: a missing word is an expected situation,
    # any other error must reach the caller instead of turning into dist=None.
    try:
        a = repr_df.loc[w1]
        b = repr_df.loc[w2]
    except KeyError as ke:
        if verbose:
            print("KeyError: %s" % ke)
        return w1, w2, None
    if dist_type == "jaccard":
        dist = ssd.jaccard(a > 0.0, b > 0.0)
    else:
        dist = ssd.cosine(a, b)
    return w1, w2, dist

def get_distance_toplist(repr_df, dist_type, w1, top_k=100, n_threads=1):
    """Rank the other words of ``repr_df`` by their distance from ``w1``.

    Every index label of ``repr_df`` except (the first occurrence of) ``w1``
    is compared to ``w1`` with ``get_distance``. The result has the columns
    "word_1", "word_2" and "distance", is sorted by ascending distance and
    cut to the first ``top_k`` rows. An empty ``repr_df`` gives an empty
    frame with the same columns. ``n_threads`` is accepted but not used.
    """
    result_columns = ["word_1","word_2","distance"]
    candidates = list(repr_df.index)
    if len(candidates) == 0:
        return pd.DataFrame([], columns=result_columns)
    try:
        # The query word may be absent from the snapshot (e.g. @OstapenkoFC).
        if w1 in candidates:
            candidates.remove(w1)
        rows = [get_distance(repr_df, dist_type, w1, other) for other in candidates]
    except ValueError:
        # Show the offending inputs before passing the error on.
        print(w1, candidates)
        raise
    ranked = pd.DataFrame(rows, columns=result_columns).sort_values("distance")
    return ranked.head(top_k)
    
def get_toplist_for_key_words(co_occur_df, dist_type, query, top_k=100, n_threads=1):
    """Compute the distance top lists of all keywords of one query.

    ``query`` is a ``(snapshot_id, key_words)`` pair. The co-occurrence matrix
    of the snapshot is built once, then a top list is computed for every
    keyword. The top lists are concatenated and tagged with a "snapshot_id"
    column. ``pd.concat`` raises ValueError when ``key_words`` is empty.
    """
    snapshot_id, key_words = query
    representation = get_word_co_occurance_matrix(co_occur_df, snapshot_id)
    per_keyword = [
        get_distance_toplist(representation, dist_type, word, top_k=top_k, n_threads=n_threads)
        for word in key_words
    ]
    combined = pd.concat(per_keyword)
    combined["snapshot_id"] = snapshot_id
    return combined

def get_toplist_for_multiple_query(co_occur_df, dist_type, queries, max_threads):
    """Compute and concatenate the keyword top lists of several queries.

    Parameters
    ----------
    co_occur_df : pandas.DataFrame
        Co-occurrence table handed to ``get_toplist_for_key_words``.
    dist_type : str
        Distance type handed to ``get_toplist_for_key_words``.
    queries : iterable
        ``(snapshot_id, key_words)`` pairs.
    max_threads : int
        1 runs the queries sequentially in this process; any other value is
        used as the number of worker processes of a ``multiprocessing.Pool``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-query results. For an empty ``queries`` an
        empty frame with the columns "word_1", "word_2", "distance" and
        "snapshot_id" is returned.
    """
    queries = list(queries)
    if len(queries) == 0:
        # pd.concat raises on an empty list; hand back an empty result instead.
        return pd.DataFrame([], columns=["word_1","word_2","distance","snapshot_id"])
    if max_threads == 1:
        res = [get_toplist_for_key_words(co_occur_df, dist_type, q) for q in queries]
    else:
        f_partial = functools.partial(get_toplist_for_key_words, co_occur_df, dist_type)
        pool = multiprocessing.Pool(processes=max_threads)
        try:
            res = pool.map(f_partial, queries)
            pool.close()
        except BaseException:
            # Do not leave worker processes behind after a failure or an interrupt.
            pool.terminate()
            raise
        finally:
            pool.join()
    return pd.concat(res)

## a.) Load keywords for the examined days

### 1. Setting keywords for player co-occurrence

In [None]:
# Keywords per date, read from a "|"-separated file; the column names are
# supplied here, so the first line of the file is treated as data.
keywords_df_1 = pd.read_csv(keywords_for_eval_path, sep="|", names=["date","key_words"])
keywords_df_1

### 2. Setting keywords for "play" and "match" words

In [None]:
# Dates 2017-05-28 .. 2017-05-31 followed by 2017-06-01 .. 2017-06-11.
days = ["2017-05-%.2i" % i for i in range(28,32)] + ["2017-06-%.2i" % i for i in range(1,12)]
# Every date gets the same keyword set, kept as a set-literal string because the
# query-building cell eval()s the "key_words" column.
keywords_df_2 = pd.DataFrame(list(zip(days, ["{'play', 'match'}" for i in range(len(days))])), columns=["date","key_words"])
keywords_df_2

In [None]:
# Stack both keyword tables; the original row labels are kept, so they may repeat.
keywords_df = pd.concat([keywords_df_1, keywords_df_2])

## b.) Filter big table for these days

In [None]:
# Date part of the snapshot id: everything before the first "T".
# NOTE: adds the "date" column to co_occur_df in place.
co_occur_df["date"] = co_occur_df["start_time"].apply(lambda x: x.split("T")[0])

In [None]:
# Keep only the rows whose date appears in the keyword table.
# NOTE: rebinds co_occur_df, so the unfiltered table is gone after this cell runs.
co_occur_df = co_occur_df[co_occur_df["date"].isin(list(keywords_df["date"].unique()))]

## c.) Query distances (Jaccard, Cosine)

In [None]:
# One query per (keyword row, hour): ("<date>T<HH>:00", list of keywords).
queries = []
for idx, row in keywords_df.iterrows():
    # NOTE(review): eval() runs the "key_words" string as Python code. Part of these
    # strings is read from keywords_for_eval_path, so that file must be trusted;
    # ast.literal_eval would parse plain set/list literals without executing code.
    queries += [("%sT%.2i:00" % (row["date"],h),list(eval(row["key_words"]))) for h in time_hour_vals]
# Number of generated queries.
len(queries)

# Many keywords were missing. This is expected, since players tend to be mentioned less often around midnight; '@OstapenkoFC' is missing almost all the time.

### Speedup?

   * 3min 2s with 1 thread
   * 5 threads: the big data set is copied as well, so this approach is not good -- it uses a lot of memory:
      * although it finished much sooner: 1min 24s

In [None]:
%%time
# Jaccard top lists for all queries; max_threads=20 selects the multiprocessing.Pool branch with 20 worker processes.
jaccard_res = get_toplist_for_multiple_query(co_occur_df, "jaccard", queries, max_threads=20)

In [None]:
%%time
# Cosine top lists for all queries; max_threads=20 selects the multiprocessing.Pool branch with 20 worker processes.
cosine_res = get_toplist_for_multiple_query(co_occur_df, "cosine", queries, max_threads=20)

# 4. Export distances

In [None]:
def export_result(f_name, df):
    """Write the rows of ``df`` that have a distance to a "|"-separated file.

    Rows whose "distance" is null are left out. The number of exported rows
    is printed, and the file is written to ``f_name`` without the index.
    """
    has_distance = df["distance"].notnull()
    out_df = df[has_distance]
    print(len(out_df))
    out_df.to_csv(f_name, sep="|", index=False)

In [None]:
# Write the non-null Jaccard distances to <output_dir>/jaccard.dist.
export_result("%s/jaccard.dist" % output_dir, jaccard_res)

In [None]:
# Write the non-null cosine distances to <output_dir>/cosine.dist.
export_result("%s/cosine.dist" % output_dir, cosine_res)