In [1]:
import ipywidgets as widgets
import json
from ipywidgets import interact, interact_manual
from IPython.display import HTML, display
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import preprocessing
import sys
import matplotlib.pyplot as plt
import xgboost as xgb
import shap
from sklearn import metrics
import json
from datetime import date
import itertools as it
from sklearn.model_selection import KFold
# Inject CSS into the notebook that gives ipywidgets labels a minimum width of 20ex.
display(HTML('''<style>
    .widget-label { min-width: 20ex !important; }
</style>'''))

# Directory prefix that get_data() prepends to every CSV file name it loads.
DATASETS_DIR = "../datasets/"

# w = widgets.Checkbox(
#    value=False,
#    description='Check me',
#    disabled=False,
#    indent=False
# )
# display(w)
# print(w.value)

# Module-level state shared between the notebook cells.
# ranking: feature names ordered by mean absolute SHAP value; set by training().
ranking = None
# df: post feature table; set by get_data().
df = None
# params: dataset/model parameters of the most recent training() run.
params = None
# df_text: contents of id_to_text.csv; set by get_data().
df_text = None


def get_data(params):
    """Load post texts and post features and build the vote-column mapping.

    Side effects: rebinds the module-level ``df`` (feature table) and
    ``df_text`` (contents of ``id_to_text.csv``).

    Args:
        params: Parameter dict. Keys read here: ``"title_prepend"`` (choose
            ``prepend_done.csv`` over ``standalone_done.csv``), ``"norm"``
            (1 drops columns matching ``_abs``, any other value below 2 drops
            columns matching ``_norm``, 2 or more keeps both), ``"weighted"``
            and ``"topics_separate"``. The dict is also forwarded to
            ``load_wo_cols``.

    Returns:
        Tuple ``(dfs, acros)``. ``dfs`` is a list of DataFrames: one per
        ``topic_nr`` value between the minimum and maximum topic number when
        ``params["topics_separate"] > 0``, otherwise a single-element list
        holding the whole table. ``acros`` maps the lowercase judgement keys
        (``info``, ``yta``, ``nah``, ``esh``, ``nta``) to their vote column
        names (``reactions_[weighted_]<KEY>``).
    """
    global df
    global df_text

    print("1/3 Loading post texts started")
    df_text = pd.read_csv(DATASETS_DIR+"id_to_text.csv")

    prepend_csv = "prepend_done.csv"
    standalone_csv = "standalone_done.csv"

    print("2/3 Loading post features started")
    # Exactly one of the two feature files is always loaded. The previous
    # `elif df:` tested the global frame for truthiness, which skipped loading
    # while `df` was None and raises for a populated DataFrame.
    features_csv = prepend_csv if params["title_prepend"] else standalone_csv
    df = load_wo_cols(DATASETS_DIR+features_csv, params)

    if params["norm"] < 2:
        type_to_drop = "_abs" if params["norm"] == 1 else "_norm"
        df = df[df.columns.drop(list(df.filter(regex=type_to_drop)))]

    keys = ["info", "yta", "nah", "esh", "nta"]
    weight = "weighted_" if params["weighted"] else ""
    acros = {k: "reactions_"+weight+k.upper() for k in keys}

    if params["topics_separate"] > 0:
        # int() keeps range() working when pandas hands back a float dtype.
        topic_min = int(df["topic_nr"].min())
        topic_max = int(df["topic_nr"].max())
        dfs = [df.loc[df["topic_nr"] == i]
               for i in range(topic_min, topic_max+1)]
    else:
        dfs = [df]

    return dfs, acros


def load_wo_cols(path, params, remove_cols=None, verbose=False):
    """Read a feature CSV while skipping unwanted columns and filtering posts.

    Args:
        path: Path of the CSV file to read.
        params: Parameter dict. Keys read: ``"wo_metadata"`` (also skip the
            metadata columns), ``"use_liwc"`` / ``"use_mf"`` (when false, skip
            every column containing ``liwc_`` / ``foundations_``) and
            ``"requirements"`` (dict ``column -> minimum value``).
        remove_cols: Optional extra column names to skip.
        verbose: Print the full column list of the file before filtering.

    Returns:
        DataFrame with only the rows where every requirement column is
        ``>=`` its minimum. Requirement columns are always read for the
        filter and are dropped afterwards when ``params["wo_metadata"]`` is
        set.

    Raises:
        KeyError: If a requirement column is missing from the file.
    """
    extra_cols = [] if remove_cols is None else list(remove_cols)
    cols_to_remove = ["post_text", "Unnamed: 0", "Unnamed: 1", "Unnamed: 2", "Unnamed: 0.1",
                      "Unnamed: 0.1.1", "liwc_post_id", "foundations_post_id",
                      "foundations_title_post_id", "liwc_title_post_id", "post_created_utc"]+extra_cols
    # NOTE(review): an earlier comment said "post_ratio" had been taken out of
    # this list because it is used for weights, but it is still listed here.
    metadata = ["speaker_account_comment_karma", "post_num_comments", "speaker_account_age",
                "speaker_account_link_karma", "post_ups", "post_downs", "post_score", "reactions_is_devil", "reactions_is_angel", "post_ratio"]

    # Only the header is needed to learn the column names.
    cols_to_read = list(pd.read_csv(path, nrows=0).columns)

    # remove metadata
    if params["wo_metadata"]:
        cols_to_remove = cols_to_remove+metadata

    # remove liwc
    if not params["use_liwc"]:
        cols_to_remove = cols_to_remove + \
            [c for c in cols_to_read if "liwc_" in c]

    # remove moral foundations
    if not params["use_mf"]:
        cols_to_remove = cols_to_remove + \
            [c for c in cols_to_read if "foundations_" in c]

    requirements = params["requirements"]

    if verbose:
        print(cols_to_read)

    # Requirement columns must survive until the row filter below has run.
    skip = {c for c in cols_to_remove if c not in requirements}
    cols_to_read = [c for c in cols_to_read if c not in skip]

    df = pd.read_csv(path, usecols=cols_to_read)

    # delete posts that don't meet requirements
    for k, v in requirements.items():
        df = df.loc[(df[k] >= v), :]

    # remove cols that were only kept for "requirements"
    if params["wo_metadata"]:
        will_drop = [c for c in df.columns if c in requirements]
        df = df.drop(columns=will_drop)

    return df


def _downsample_ratio_buckets(X_train, y_train, verbose=False):
    """Thin out over-populated 0.1-wide buckets of a continuous target."""
    print("Downsampling")
    bucket_ranges = [x/10 for x in range(0, 11)]
    bucket_bounds = list(zip(bucket_ranges[:-1], bucket_ranges[1:]))

    y_column = y_train.reshape((len(y_train), 1))
    feat_names_to_sample = [str(i) for i in range(X_train.shape[1])]+["Y"]
    df_to_sample = pd.DataFrame(
        np.append(X_train, y_column, 1), columns=feat_names_to_sample)

    def bucket_mask(frame, low, high):
        # Both bounds are inclusive, so a value sitting exactly on a boundary
        # is counted in two neighbouring buckets.
        return (low <= frame['Y']) & (frame['Y'] <= high)

    bucket_counter = [int(bucket_mask(df_to_sample, low, high).sum())
                      for low, high in bucket_bounds]

    # Buckets holding more than 1.5x the mean bucket size are cut down to that size.
    bucket_max = int(np.mean(bucket_counter)*1.5)
    for (low, high), size in zip(bucket_bounds, bucket_counter):
        if size <= bucket_max:
            continue
        if verbose:
            print(f"Bucket {low}-{high} has {size}>{bucket_max}")
        df_bkt = df_to_sample.loc[bucket_mask(df_to_sample, low, high)]
        # min(), not max(): with max() the whole bucket was "sampled" and no
        # row was ever removed.
        df_bkt_smpl = df_bkt.sample(
            n=min(bucket_max, len(df_bkt)), replace=False, random_state=42)
        df_to_sample = df_to_sample.drop(
            index=df_bkt.index.difference(df_bkt_smpl.index))

    # Kept from the previous implementation: rows containing NaN are discarded.
    df_to_sample = df_to_sample.dropna()
    # NOTE: y is returned as a pandas Series here, as before.
    return df_to_sample.drop(columns=["Y"]).to_numpy(), df_to_sample["Y"]


def _resample_class_indices(c0_idx, c1_idx, mode, verbose=False):
    """Return the row positions of both classes after up-/down-sampling."""
    if mode not in ("up", "down"):
        return c0_idx.values, c1_idx.values

    c0_not_smaller = len(c0_idx) >= len(c1_idx)
    if mode == "up":
        # The smaller class is drawn (with replacement) up to the larger size.
        n = max(len(c0_idx), len(c1_idx))
        resize_c0 = not c0_not_smaller
        label = "Upsampling"
    else:
        # The larger class is drawn (without replacement) down to the smaller size.
        n = min(len(c0_idx), len(c1_idx))
        resize_c0 = c0_not_smaller
        label = "Downsampling"

    if resize_c0:
        c0_sel = c0_idx.sample(
            n=n, random_state=1, replace=len(c0_idx) < n).values
        c1_sel = c1_idx.values
    else:
        c1_sel = c1_idx.sample(
            n=n, random_state=1, replace=len(c1_idx) < n).values
        c0_sel = c0_idx.values

    if verbose:
        print(f"{label} Y={0 if resize_c0 else 1} with {n} samples")
    return c0_sel, c1_sel


def sampling(X_train, y_train, params, indices=None, verbose=False):
    """Re-balance a training set by up- or down-sampling.

    Args:
        X_train: 2-D numpy feature array.
        y_train: 1-D numpy target array aligned with ``X_train``.
        params: Parameter dict. Keys read: ``"sampling"`` (``"none"``,
            ``"up"`` or ``"down"``) and ``"predict"`` (``"ratio"`` or
            ``"class"``).
        indices: Only used for ``predict == "class"``: row positions of
            ``X_train`` that may be selected. ``None`` or an empty sequence
            means every row is eligible.
        verbose: Print diagnostics and plot the original target histogram.

    Returns:
        Tuple ``(X, y)``. For ``"class"`` the rows are re-ordered (class 0
        first, then class 1) even when no sampling is requested. For
        ``"ratio"`` with ``"down"``, buckets of width 0.1 that hold more than
        1.5x the mean bucket size are reduced to that size and ``y`` is a
        pandas Series. Any other combination returns the inputs unchanged.

    Raises:
        Exception: If upsampling is requested for ``predict == "ratio"``.
    """
    if verbose:
        print(f"{params['sampling']}-sampling for {params['predict']}")
        print("Original Y distribution on training set")
        _ = plt.hist(y_train, bins='auto')
        plt.show()

    # Default: hand the data back untouched.
    X_train_ret, y_train_ret = X_train, y_train

    if params["predict"] == "ratio":
        if params["sampling"] == "up":
            raise Exception("Upsampling with regression is not feasible☹️")
        if params["sampling"] == "down":
            X_train_ret, y_train_ret = _downsample_ratio_buckets(
                X_train, y_train, verbose=verbose)

    elif params["predict"] == "class":
        df_y = pd.DataFrame(data={"Y": y_train})

        c0_idx = pd.Series(df_y.loc[df_y["Y"] == 0].index.values)
        c1_idx = pd.Series(df_y.loc[df_y["Y"] == 1].index.values)

        # Restrict to the caller-supplied positions. Without any, all rows
        # stay eligible (the old fallback built an empty range and thereby
        # discarded every row).
        if indices is not None and len(indices) > 0:
            if verbose:
                print(f"Using {len(indices)} indices")
            c0_idx = c0_idx[c0_idx.isin(indices)]
            c1_idx = c1_idx[c1_idx.isin(indices)]

        if verbose:
            print(f"    Y=0: {c0_idx.shape}")
            print(f"    Y=1: {c1_idx.shape}")

        c0_idx_sampled, c1_idx_sampled = _resample_class_indices(
            c0_idx, c1_idx, params["sampling"], verbose=verbose)
        all_idx = np.concatenate((c0_idx_sampled, c1_idx_sampled), axis=0)

        if verbose:
            df_tmp = df_y.iloc[all_idx]
            print(f"   Y=0: {len(df_tmp.loc[df_tmp['Y']==0])}")
            print(f"   Y=1: {len(df_tmp.loc[df_tmp['Y']==1])}")

        X_train_ret = X_train[all_idx, :]
        y_train_ret = y_train[all_idx]

    return X_train_ret, y_train_ret

def opposite_jdgmt(judg):
    """Return the name of the negative-vote column opposing a judgement column.

    The first acronym found in ``judg`` (checked in the order NTA, NAH, YTA,
    ESH, INFO) is swapped for its opposite (NTA<->YTA, NAH<->ESH, INFO stays)
    and ``"_neg_vals"`` is appended.

    Args:
        judg: Column name containing one of the judgement acronyms.

    Returns:
        The column name with the acronym swapped and ``"_neg_vals"`` appended.

    Raises:
        ValueError: If ``judg`` contains none of the known acronyms
            (previously this surfaced as an UnboundLocalError).
    """
    opposites = (
        ("NTA", "YTA"),
        ("NAH", "ESH"),
        ("YTA", "NTA"),
        ("ESH", "NAH"),
        ("INFO", "INFO"),
    )
    for acronym, opposite in opposites:
        if acronym in judg:
            return judg.replace(acronym, opposite)+"_neg_vals"
    raise ValueError(f"No judgement acronym found in {judg!r}")


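# mapping is either "clip" (the vote columns are returned unchanged; no clipping of negative votes is performed in this function) or "opposite"/"map", meaning the negative votes are added to the opposing judgement using the mapping table in "opposite_jdgmt"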
def map_negative_values(df, acros, mapping="clip"):
    """Fold negative votes into the opposing judgement column.

    For ``mapping`` equal to ``"opposite"`` or ``"map"`` every vote column in
    ``acros`` has the values of its ``opposite_jdgmt`` column subtracted, in
    place. Any other ``mapping`` value leaves ``df`` untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    if mapping not in ("opposite", "map"):
        return df

    print("Map = opposite")
    for vote_col in acros.values():
        negative_votes = df[opposite_jdgmt(vote_col)]
        df[vote_col] = df[vote_col] + (-1*negative_votes)
    return df


def get_data_classes(df, acros, ratio=0.5, verbose=False, predict="class", judgement_weighted=True, mapping="clip", save_excerpt=False):
    """Derive the target ``Y`` from the vote columns and return scaled features.

    Side effects: writes ``y_vals.xlsx`` (targets and votes of a fixed list of
    posts) and, when ``save_excerpt`` is set, a two-row CSV excerpt of the
    final feature table. When ``judgement_weighted`` is set the vote columns
    of the passed ``df`` are modified in place by ``map_negative_values``.

    Args:
        df: Feature table containing ``post_id`` and the vote columns named
            in ``acros``.
        acros: Mapping of ``info/yta/nah/esh/nta`` to vote column names.
        ratio: Only rows with ``Y <= ratio`` or ``Y >= 1 - ratio`` are kept
            (for ``"class"`` the filter is applied to the YTA share).
        verbose: Print intermediate shapes.
        predict: ``"class"`` (Y = 1 if YTA outnumbers NTA, else 0; only posts
            whose strongest vote is YTA or NTA) or ``"ratio"``
            (Y = (YTA+ESH) / (YTA+NAH+ESH+NTA); posts without votes dropped).
        judgement_weighted: Apply ``map_negative_values`` first.
        mapping: Forwarded to ``map_negative_values``.
        save_excerpt: Also write the first two rows of the final table to
            ``../post_modification/data/prepend_done_trained_feats.csv``.

    Returns:
        Tuple ``(X_scaled, y, feat_name_lst, None)``: standard-scaled feature
        array, target array, feature names, and a placeholder for sample
        weights (always ``None``).

    Raises:
        ValueError: For an unknown ``predict`` value.
        Exception: If a computed ``Y`` lies outside ``[0, 1]``.
    """
    if verbose:
        print(f"df original shape {df.shape}")

    # Map negative judgements to opposing judgement, if we are not simply counting each comment as one vote (i.e. if judgement_weighted = True)
    # i.e. YTA<->NTA, ESH<->NAH
    if judgement_weighted:
        df = map_negative_values(df, acros, mapping=mapping)

    info = df[acros["info"]]
    yta = df[acros["yta"]]
    nah = df[acros["nah"]]
    esh = df[acros["esh"]]
    nta = df[acros["nta"]]

    if predict == "class":
        # We only look at YTA and NTA
        yta_ratio = yta / (info + yta + nah + esh + nta)

        # drop all rows where the majority is not YTA or NTA
        # (uses acros["nta"] throughout; a hard-coded "reactions_weighted_NTA"
        # broke the unweighted configuration)
        yta_leads = (yta > info) & (yta > nah) & (yta > esh)
        nta_leads = (nta > info) & (nta > nah) & (nta > esh)
        # .copy() so the column assignments below act on an independent frame.
        df = df.loc[yta_leads | nta_leads].copy()
        df["YTA_ratio"] = yta_ratio
        if verbose:
            print(f"Drop all rows where majority is not YTA or NTA {df.shape}")

        # drop all rows that are not "extreme" enough
        df = df.loc[(1-ratio <= df["YTA_ratio"]) | (df["YTA_ratio"] <= ratio)].copy()

        # YTA = Class 1, NTA = class 0
        df["Y"] = np.where(df[acros["yta"]] > df[acros["nta"]], 1,  0)
        if verbose:
            print(df.shape)

    elif predict == "ratio":
        # Y = asshole ratio(AHR) = (YTA+ESH)/(YTA+ESH+NTA+NAH)
        total_votes = yta + nah + esh + nta
        # Posts without votes are left out of the denominator, become NaN
        # through index alignment, and are dropped.
        total_votes = total_votes[total_votes != 0]
        target = (yta + esh)/total_votes
        df = df.loc[target.notna()].copy()
        df["Y"] = target

        df = df.loc[(1-ratio <= df["Y"]) | (df["Y"] <= ratio)]  # MODIFY ME

    else:
        raise ValueError(f"Unknown predict mode {predict!r}")

    if np.min(df["Y"]) < 0 or np.max(df["Y"]) > 1:
        raise Exception("Y value should be in range [0,1]")

    # get Y values for specific posts
    # change me if you want different y_values
    posts_to_get = ["eq3k7y","mj8a47","gcmhy1","kcf1e0","bs50ps","gcti52","cmkl5l","bd0ww1","b6uiz7","bt8mm5","blexov","aos6vn","hv6xro","ggyb2v","b1cbcz","dn6075","bu2kf5","hql2q4","cjudzm","j0do3l"]
    y_vals = df[df['post_id'].isin(posts_to_get)][["Y","post_id",acros["yta"], acros["nta"], acros["nah"], acros["esh"]]]
    y_vals.to_excel("y_vals.xlsx")

    # drop every column whose name contains an uppercase vote acronym
    # (this also removes the helper column "YTA_ratio"), plus the post id
    upper_acronyms = [acr.upper() for acr in acros]
    vote_columns = [col for col in df.columns
                    if any(acr in col for acr in upper_acronyms)]
    vote_columns += ["post_id"]
    df = df.drop(columns=vote_columns)

    if verbose:
        print(df.shape)

    if save_excerpt:
        df.head(2).to_csv("../post_modification/data/prepend_done_trained_feats.csv", index=False)

    X = df.drop(columns=["Y"])
    y = df["Y"].to_numpy()

    feat_name_lst = list(X.columns)

    # scalings
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    return X_scaled, y, feat_name_lst, None


def get_train_test_split(params, grid_search=False, verbose=False):
    """Load the data and return a (sampled) train set and a test set.

    Args:
        params: Parameter dict forwarded to ``get_data``,
            ``get_data_classes`` and ``sampling``. Keys read here:
            ``"ratio"``, ``"predict"``, ``"weighted"``, ``"mapping"`` and
            ``"random_y"``.
        grid_search: Return the unsplit data instead of a train/test split.
        verbose: Print the effect of the ``random_y`` shuffle.

    Returns:
        ``(X, y, feat_name_lst)`` when ``grid_search`` is set, otherwise
        ``(X_train, y_train, X_test, y_test, feat_name_lst, test)`` where
        ``test`` holds the row positions of the test rows within ``X``.
    """
    dfs, acros = get_data(params)

    df = dfs[0]
    if len(dfs) > 1:
        print("MORE THAN 1 df")

    # Work on a copy: get_data_classes modifies vote columns in place.
    df_cpy = df.copy()
    X, y, feat_name_lst, smp_weights = get_data_classes(df_cpy, ratio=params["ratio"], acros=acros, predict=params["predict"], judgement_weighted=params["weighted"],
                                                        mapping=params["mapping"], verbose=False)
    if grid_search:
        print("YOU SURE YOU WANT TO BE DOING THIS?")
        return X, y, feat_name_lst

    train, test = train_test_split(
        range(len(X)), test_size=0.33,)  # use random_state=42 for repeatable values

    # X[train] is renumbered from 0, so the eligible rows must be given as
    # positions within that subset, not as positions within the full X.
    train_positions = list(range(len(train))) if params["predict"] == "class" else []
    X_train, y_train = sampling(
        X[train], y[train], params, indices=train_positions, verbose=False)

    X_test = X[test, :]
    y_test = y[test]

    if params["random_y"]:
        # Sanity check, i.e. get results for random prediction.
        # Compare the sum of the first half before and after shuffling;
        # len(y_test*0.5) equalled len(y_test), so the old check summed the
        # whole array and could never change.
        half = int(len(y_test)*0.5)
        y_test_sum_old = np.sum(y_test[:half])
        np.random.shuffle(y_test)
        y_test_sum_new = np.sum(y_test[:half])
        if verbose:
            print(f"USING RANDOM Y\n Was {y_test_sum_old} Is {y_test_sum_new}")

    return X_train, y_train, X_test, y_test, feat_name_lst, test


def get_clf_name(params, clf_type):
    """Build a descriptive model name from the model type and its parameters.

    Every parameter contributes one ``_``-prefixed part: boolean ``True``
    values appear as the bare key, everything else as ``key=value``.
    """
    parts = [clf_type]
    for key, value in params.items():
        is_enabled_flag = isinstance(value, bool) and value
        parts.append(f"_{key}" if is_enabled_flag else f"_{key}={value}")
    return "".join(parts)


def get_metrics(y_test, y_pred, params, verbose=True):
    """Score predictions and return the headline metric.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        params: Parameter dict; ``params["predict"]`` selects the metrics.
        verbose: Print all computed metrics.

    Returns:
        The weighted F1 score for ``"class"``, the mean absolute error for
        ``"ratio"``, and ``None`` for any other ``params["predict"]`` value.
    """
    if params["predict"] == "class":
        # testing score
        f1_test = metrics.f1_score(y_test, y_pred, average="weighted")
        acc_test = metrics.accuracy_score(y_test, y_pred)

        if verbose:
            print(f"    Accuracy: {acc_test}\n    F1: {f1_test}")
            print(classification_report(y_test, y_pred, target_names=[
                "Class 0: low AH", "Class 1: high AH"]))
        return f1_test

    elif params["predict"] == "ratio":
        mean_abs = metrics.mean_absolute_error(y_test, y_pred)
        mean_sqr = metrics.mean_squared_error(y_test, y_pred)
        # Derived from the MSE instead of a second call with squared=False,
        # a keyword that newer scikit-learn releases no longer accept.
        rmse = np.sqrt(mean_sqr)

        if verbose:
            print(
                f"    Mean absolute: {mean_abs}\n    Mean squared: {mean_sqr}\n    Root Mean Squared: {rmse}")
        return mean_abs


@interact(show_data_params=False, )
def show_params(show_data_params):
    """Print the dataset parameters of the last training run when the box is ticked.

    Prints nothing while no training has happened yet (module-level
    ``params`` is still ``None``) or while ``show_data_params`` is false.
    """
    if params is None or not show_data_params:
        return
    clf_type = "Regression" if params["predict"] == "ratio" else "Classification"
    print(
        f"We are using these dataset parameters for {clf_type}:\n{json.dumps(params, indent=4)}")


@interact_manual(Prediction_Type=['Regression', "Classification"], Nr_features_to_show=(10, 169, 1), ignore_certain_feats=[True, False])
def training(Prediction_Type, Nr_features_to_show, ignore_certain_feats,):
    """Train cross-validated XGBoost models and rank features by SHAP value.

    Trains one model per fold of a 4-fold split, collects the SHAP values of
    every held-out fold, saves the SHAP summary plot to ``summary.png`` and
    stores the feature names, ordered by mean absolute SHAP value, in the
    module-level ``ranking``. The module-level ``params`` is set to the
    parameter dict used.

    Args:
        Prediction_Type: ``"Regression"`` selects the regression parameters
            and an ``XGBRegressor``; any other value selects the
            classification parameters and an ``XGBClassifier``.
        Nr_features_to_show: ``max_display`` of the SHAP summary plot.
        ignore_certain_feats: Drop the columns in ``features_to_ignore``
            before training and save the trained model as JSON under
            ``../post_modification/data/``.
    """
    global ranking
    global params

    print("Changing Prediction_Type or Nr_features_to_show will retrain the model & generate new feature importance. This will take approx: 1-4min.")

    regression_params = {
        "norm": 1,
        "weighted": True,
        "title_prepend": True,
        "sampling": "none",  # this param no longer does anything
        "topics_separate": False,
        "predict": "ratio",
        "mapping": "opposite", "ratio": 0.5,
        "wo_metadata": True,
        "use_liwc": True,
        "use_mf": True,
        "requirements": True,
        "random_y": False
    }

    classification_params = {
        "norm": 1,
        "weighted": True,
        "title_prepend": True,
        "sampling": "none",
        "topics_separate": False,
        "predict": "class",
        "mapping": "opposite",
        "ratio": 0.3,
        "wo_metadata": True,
        "use_liwc": True,
        "use_mf": True,
        "requirements": True,
        "random_y": False
    }

    post_requirements = {  # requirement: key >= value in post
        "post_num_comments": 10,
        "post_score": 10,
        "post_ratio": 0.7,
    }

    features_to_ignore = ['foundations_SemiC', 'foundations_Quote', 'foundations_Colon', 'foundations_Sixltr', 'foundations_Parenth', 'foundations_Dash', "topic_nr"]
    params = regression_params if Prediction_Type == "Regression" else classification_params
    # The boolean switch is replaced by the actual thresholds (all 0 = disabled).
    if params["requirements"]:
        params["requirements"] = post_requirements
    else:
        params["requirements"] = dict.fromkeys(post_requirements, 0)

    # Setup Model
    hyper_params_reg = {'learning_rate': 0.2257949690293526,
                        'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 180}
    hyper_params_clf = {'learning_rate': 1.0835557461256857, 'max_depth': 5,
                        'min_child_weight': 3, 'n_estimators': 420, }  # 'scale_pos_weight':18372/5581}
    if params["predict"] == "class":
        clf = xgb.XGBClassifier(
            verbosity=0, random_state=42, use_label_encoder=False, **hyper_params_clf)
    else:
        clf = xgb.XGBRegressor(verbosity=0, random_state=42, **hyper_params_reg)

    # Random-forest alternative that was tried previously:
    #hyper_params_reg_rf = {'max_depth': 5, 'min_samples_leaf': 1,
    #                       'min_samples_split': 2, 'n_estimators': 180}
    #clf = RandomForestClassifier(random_state=42) if params["predict"] == "class" else RandomForestRegressor(
    #    random_state=42, **hyper_params_reg_rf)
    classifiers = (clf, "xgboost")
    #classifiers = (clf, "rf")

    dfs, acros = get_data(params)
    # Copy: get_data_classes rewrites vote columns in place, and dfs[0] can be
    # the module-level `df` that other cells read.
    cur_df_0 = dfs[0].copy()

    if ignore_certain_feats:
        print("DROPPING GENERATED FEATURES:")
        print(features_to_ignore)
        cur_df_0 = cur_df_0.drop(columns=features_to_ignore)

    X, y, feat_name_lst, _ = get_data_classes(cur_df_0, ratio=params["ratio"], acros=acros, predict=params["predict"], judgement_weighted=params["weighted"],
                                              mapping=params["mapping"], verbose=False)

    print(f"X SHAPE {X.shape}")

    list_shap_values = list()
    list_test_sets = list()
    list_scores = list()
    splits = 4

    if params["predict"] == "class":
        # Drop the last sample; y is trimmed as well so both stay aligned.
        X = X[:-1]
        y = y[:-1]
    if len(X) % splits != 0:
        print("NOT DIVISIBLE BY SPLIT. WILL GET ERROR IN SHAP."+str(len(X)))

    kf = KFold(n_splits=splits, shuffle=True)
    print(f"Model Training started. We will train {splits} models")
    for fold_nr, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"Training model {fold_nr}/{splits}")
        X_train = pd.DataFrame(X[train_index], columns=feat_name_lst)
        X_test = pd.DataFrame(X[test_index], columns=feat_name_lst)
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        list_scores.append(get_metrics(y_test, y_pred, params, verbose=False))

        explainer = shap.TreeExplainer(clf)
        # for each iteration we save the test_set index and the shap_values
        list_shap_values.append(explainer.shap_values(X_test))
        list_test_sets.append(test_index)

    if ignore_certain_feats:
        # Saved once after the loop: every fold wrote to the same path, so
        # only the model of the last fold ever remained on disk.
        d1 = date.today().strftime("%d.%m.%Y")
        clf.save_model(f"../post_modification/data/{classifiers[1]}{d1}.json")

    # combining results from all iterations
    test_set = np.concatenate(list_test_sets, axis=0)
    shap_values = np.concatenate(
        [np.array(fold_values) for fold_values in list_shap_values], axis=0)

    # bringing back variable names
    X_test = pd.DataFrame(X[test_set], columns=feat_name_lst)
    print(f"Mean score: {np.mean(np.array(list_scores))}")
    if ignore_certain_feats:
        # NOTE(review): the message mentions a feature csv extract, but
        # get_data_classes is called without save_excerpt=True - confirm.
        print("Updated model and feature csv extract in post modification tools")
    shap.summary_plot(shap_values, X_test, max_display=Nr_features_to_show,show=False)
    plt.savefig('summary.png', bbox_inches='tight')

    # Rank the features by their mean absolute SHAP value (most important first).
    shap_df = pd.DataFrame(shap_values, columns=feat_name_lst)
    vals = np.abs(shap_df.values).mean(0)
    shap_importance = pd.DataFrame(list(zip(feat_name_lst, vals)), columns=[
                                   'col_name', 'feature_importance_vals'])
    shap_importance.sort_values(
        by=['feature_importance_vals'], ascending=False, inplace=True)
    ranking = shap_importance["col_name"].to_list()


interactive(children=(Checkbox(value=False, description='show_data_params'), Output()), _dom_classes=('widget-…

interactive(children=(Dropdown(description='Prediction_Type', options=('Regression', 'Classification'), value=…

In [1]:
import json
import ipywidgets as widgets
# Initial number of histogram bins and upper limit for the number of post samples.
default_hist_bins = 50
max_samples = 20

# Load the feature explanations; the context manager closes the file again
# (the handle was previously left open).
with open('../../feature_explanation.json') as f:
    data = json.load(f)
data['writing_sty_"_count'] = data.pop("writing_sty_'_count") #quaotation mark hack

# Slider: index of the histogram bin to draw post samples from.
Post_sample_from_bin_widget = widgets.IntSlider(
    min=0, max=default_hist_bins-1, step=1, value=5,
    style={'description_width': 'initial'}, layout = widgets.Layout(width='500px'))

# Slider: number of histogram bins.
Nr_histogram_bins_widget = widgets.IntSlider(
    min=10, max=500, step=10, value=default_hist_bins, 
    style={'description_width': 'initial'}, layout = widgets.Layout(width='500px'))

# Slider: number of post samples to fetch.
Nr_samples_to_get_widget = widgets.IntSlider(
    min=1, max=max_samples, step=1, value=1, 
    style={'description_width': 'initial'}, layout = widgets.Layout(width='500px'))

def update_max_sample_idx(*args):
    """Set the bin-index slider's maximum to the last bin of the current bin count."""
    last_bin_index = Nr_histogram_bins_widget.value-1
    Post_sample_from_bin_widget.max = last_bin_index

    
@interact(Show_param_explanation=True, )
def show_param_explanation(Show_param_explanation):
    """Print the explanation of the visualiser parameters, or a note that it is hidden."""
    if not Show_param_explanation:
        print("Parameter explantion hidden")
        return

    explanation_lines = (
        "Parameter explantion:",
        "    feature_to_analyse: Which feature we want to get a post_text sample from",
        "    Amount_histo_bins: How many bins the histogram should use. The more bins the more fine grained.",
        "    Bin_index_to_sample: From which histogram bin we want to get a post sample.",
        "    Nr_samples_to_get: How many post samples we want to get. Max is 20.",
        "    Reproducible_random_state: If we always want to get completely random samples within a bin or if they should be reproducible.",
    )
    for line in explanation_lines:
        print(line)

def visualiser(feature_to_analyse, Amount_histo_bins, Bin_index_to_sample, Nr_samples_to_get, Reproducible_random_state):
    """Plot a feature histogram, highlight one bin and print posts from it.

    Reads the module-level `df` (features), `df_text` (post texts) and `data`
    (feature explanations).

    Args:
        feature_to_analyse: Name of the `df` column to inspect.
        Amount_histo_bins: Number of histogram bins.
        Bin_index_to_sample: Zero-based index of the bin to highlight and to
            draw posts from.
        Nr_samples_to_get: Number of posts to print from that bin.
        Reproducible_random_state: If truthy, sampling uses `random_state=42`;
            otherwise `random_state=None`.

    Side effects:
        Shows the plot, prints the explanation and the sampled posts, and sets
        `Nr_samples_to_get_widget.max` when the bin holds enough posts.
    """
    feature_values = df[feature_to_analyse]

    p = feature_values.plot(kind='hist', bins=Amount_histo_bins, color='blue')
    p.patches[Bin_index_to_sample].set_color('orange')
    plt.show()

    # The explanation lookup uses the feature name without "_norm"/"_abs".
    truncated = feature_to_analyse.replace("_norm", "").replace("_abs", "")

    print("Feature Explanation:")
    if "liwc_" in feature_to_analyse:
        print(f"  {feature_to_analyse} = See https://drive.google.com/file/d/1EHrlt6KcL3jZ5gFAA1vjKd5GdXKLRWmk/view?usp=sharing for a feature explanation")
    elif "foundations_" in feature_to_analyse:
        print(f"  {feature_to_analyse} = See https://moralfoundations.org/other-materials/ for a feature explanation")
    elif truncated in data:
        print(f"  {feature_to_analyse} = {data[truncated]}")
    else:
        print(f"  {feature_to_analyse} = Feature explanation not found")

    min_v = feature_values.min()
    max_v = feature_values.max()

    # Lower edges of the equally wide bins spanning [min_v, max_v].
    bin_mins = np.linspace(min_v, max_v, num=Amount_histo_bins, endpoint=False)
    sample_bin_min = bin_mins[Bin_index_to_sample]
    is_last_bin = Bin_index_to_sample + 1 >= Amount_histo_bins
    sample_bin_max = max_v if is_last_bin else bin_mins[Bin_index_to_sample + 1]

    # Histogram bins are half-open [lower, upper) except the last one, which
    # also includes its upper edge (numpy.histogram convention; presumably what
    # the pandas hist plot uses). Mirroring that keeps the bucket in line with
    # the highlighted bar and stops a value that sits exactly on a shared edge
    # from being counted in two neighbouring bins.
    if is_last_bin:
        within_upper_edge = feature_values <= sample_bin_max
    else:
        within_upper_edge = feature_values < sample_bin_max

    df_sample = df.loc[(sample_bin_min <= feature_values) & within_upper_edge]
    nr_samples = len(df_sample)
    print(f"   Inspecting values from {sample_bin_min} to {sample_bin_max}. Bucket has {nr_samples} samples")

    if nr_samples >= Nr_samples_to_get:
        Nr_samples_to_get_widget.max = min(nr_samples, max_samples)
        smpl = df_sample['post_id'].sample(n=Nr_samples_to_get, random_state=42 if Reproducible_random_state else None)
        df_text_tmp = df_text.loc[df_text["post_id"].isin(smpl)]

        for post_id, post_text in zip(df_text_tmp["post_id"], df_text_tmp["post_text"]):
            print(f"\nPOST ID: {post_id}")
            print(f"POST TEXT:\n{post_text}")
            print("-----------------------")

    else:
        # Reached whenever the selected bin holds fewer posts than requested.
        print(f"Not enough samples in this bucket. Wanted {Nr_samples_to_get}, but there are only {nr_samples}")
        
# Whenever the bin-count slider changes, adjust the bin-index slider's maximum.
Nr_histogram_bins_widget.observe(update_max_sample_idx, 'value')

# Build the interactive controls for `visualiser`. `interact` is reached through
# the `widgets` module imported by this cell; the bare name `interact` is not
# imported here. The trailing semicolon keeps the notebook from echoing the
# call's return value.
widgets.interact(visualiser, feature_to_analyse=ranking,
                 Amount_histo_bins=Nr_histogram_bins_widget,
                 Bin_index_to_sample=Post_sample_from_bin_widget,
                 Nr_samples_to_get=Nr_samples_to_get_widget,
                 Reproducible_random_state=False);
                                             


NameError: name 'interact' is not defined

# Feature explanations
# Title features
When using the **standalone** dataset all features that start with "title_" are generated by ONLY looking at the post title.
All these features exist also without the "title_" prefix which means they were generated by ONLY looking at the post text.

When using the **prepend** dataset we never have the "title_" prefix since in this case we always prepend the post title to the post text. 

# LIWC feature
All features containing "liwc_" were generated by the LIWC software. See [here](https://drive.google.com/file/d/1EHrlt6KcL3jZ5gFAA1vjKd5GdXKLRWmk/view?usp=sharing) for a list of all LIWC features and their explanations.

# Moral Foundations features
All features containing "foundations_" were generated by using the moral foundations dictionary found [here](https://moralfoundations.org/other-materials/) in the LIWC software. It contains 11 features (plus some features that are always automatically appended by LIWC), namely: HarmVirtue, HarmVice, FairnessVirtue, FairnessVice, IngroupVirtue, IngroupVice, AuthorityVirtue, AuthorityVice, PurityVirtue, PurityVice, MoralityGeneral.

# Speaker features
All features containing "speaker_":
'speaker_author_age' = Age of the speaker/poster. Extracted by looking for expressions like "My (25, F) boyfriend (33, M) went to get..."

'speaker_author_gender' = Gender of the speaker/poster. Extracted by looking for expressions like "My (25, F) boyfriend (33, M) went to get..."

'speaker_account_age' = Reddit account age of the poster in days.

'speaker_account_comment_karma' = Reddit account comment karma. See [here](https://www.reddit.com/r/NoStupidQuestions/comments/2idfhk/what_is_link_karma/)

'speaker_account_link_karma' = Reddit account link karma. See [here](https://www.reddit.com/r/NoStupidQuestions/comments/2idfhk/what_is_link_karma/)

# Topic number:
Which topic number was assigned to the post. The value is nominal and does not have any meaning except for the -1 value, which indicates that the post was not assigned a topic because it did not fit well enough into one.

# Post features
All features containing "post_": 
'post_id' = id of the post as a string.

'post_num_comments' = Number of comments this post got.

'post_score' = Score of a post. 

'post_ratio' = Upvote ratio of a post

'post_ups' = Amount of post upvotes (approximated using post_score and post_ratio)

'post_downs' = Amount of post downvotes (approximated using post_score and post_ratio)

# Reaction features
All features containing "reactions_". Generally means features that describe how people on reddit reacted to the post (excluding up/down votes).

'reactions_is_angel' = Whether the post was crossposted to r/AmITheAngel, which would indicate that the poster is acting morally. 1=True, 0=False

'reactions_is_devil' = Whether the post was crossposted to r/AmITheDevil, which would indicate that the poster is acting immorally. 1=True, 0=False

'reactions_YTA' = Amount of comments that wrote YTA.

'reactions_NTA' = Amount of comments that wrote NTA.


'reactions_INFO' = Amount of comments that wrote INFO.

'reactions_ESH' = Amount of comments that wrote ESH.

'reactions_NAH' = Amount of comments that wrote NAH.

'reactions_weighted_YTA' = Amount of comments that wrote YTA multiplied by the comment score.

'reactions_weighted_NTA' = Amount of comments that wrote NTA multiplied by the comment score.

'reactions_weighted_INFO' = Amount of comments that wrote INFO multiplied by the comment score.

'reactions_weighted_ESH' = Amount of comments that wrote ESH multiplied by the comment score.

'reactions_weighted_NAH' = Amount of comments that wrote NAH multiplied by the comment score.


# Writing Style features
## Normalised and absolute values
Normalised values are marked with the "norm" suffix whereas absolute values have the "abs" suffix. (LIWC values are always normalised between 0-100. Ours are normalised between 0-1)

## Symbol counting
'writing_sty_!_count' = How often the character ! occurs in the post.

'writing_sty_"_count' = How often the character " occurs in the post.

'writing_sty_?_count' = How often the character ? occurs in the post.

## Would I be the asshole (hypothetical)
'writing_sty_is_wibta' = Whether the post was a hypothetical and posted with the text "would I be the asshole". Extracted with string matching.

## Emotions contained within a post.
Which emotions + negativity & positivity were used in the post. Uses the EmoLex dictionary from [here](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). Thus uses string matching like LIWC. Is divided by the number of words in the text if normalised.


'writing_sty_negative' = words associated with negative emotions

'writing_sty_trust' = words associated with the emotion trust

'writing_sty_joy' = words associated with the emotion joy

'writing_sty_positive' = words associated with positive emotions

'writing_sty_fear' = words associated with the emotion fear

'writing_sty_anger' = words associated with the emotion anger

'writing_sty_disgust' = words associated with the emotion disgust

'writing_sty_surprise' = words associated with the emotion surprise

'writing_sty_sadness' = words associated with the emotion sadness

'writing_sty_anticipation' = words associated with the emotion anticipation

## AITA location
Where the question "Am I the asshole" appears within the text as a percentage of the text, i.e. 0 at the beginning of the text and 1 at the end.
Also matches WIBTA and a few different spellings. 


'writing_sty_aita_count' = How often the question AITA/WIBTA appears in the post

'writing_sty_aita_avg_location' = The average location of AITA/WIBTA [0,1]. Is -1 if it never appears.

'writing_sty_aita_fst_location' = The first location of AITA/WIBTA [0,1]. Is -1 if it never appears.

'writing_sty_aita_lst_location' = The last location of AITA/WIBTA [0,1]. Is -1 if it never appears.

## Profanity
How many profane words appear in the text. Is divided by the number of words in the text if normalised.

'writing_sty_profanity' = How many profane words there are 

## Tense
In what tense the text is written. We only distinguish past, present and future. Divided by number of sentences if normalised.
ML based.

'writing_sty_past' = Amount of past tense.

'writing_sty_present' = Amount of present tense.

'writing_sty_future' = Amount of future tense.

## Active & Passive voice
In what voice the text is written. Divided by number of sentences if normalised. ML based.

'writing_sty_active' = Amount of active voice.

'writing_sty_passive' = Amount of passive voice. 

## Positivity & Subjectivity
How positive/negative a sentence was written + how subjective/objective it was. ML based.

'writing_sty_sent_polarity' = How positive/negative the post is. -1 = very negative, 0 = neutral, 1 = very positive

'writing_sty_sent_subjectivity' = How subjective/objective the post is. 0 = very objective, 1 = very subjective

## Focus Pronoun
Check various types of pronouns and count how often they appeared as the subject or object. Also counts possessive pronouns.
Divided by number of words in the text if normalised. ML based. 

'writing_sty_focus_i_subj' = How often the 1. person singular pronoun appeared as the subject.

'writing_sty_focus_you_sg_subj' = How often the 2. person singular pronoun appeared as the subject.

'writing_sty_focus_he_subj' = How often the 3. person singular pronoun appeared as the subject.

'writing_sty_focus_we_subj' = How often the 1. person plural pronoun appeared as the subject.

'writing_sty_focus_you_pl_subj' = How often the 2. person plural pronoun appeared as the subject.

'writing_sty_focus_they_subj' = How often the 3. person plural pronoun appeared as the subject.

'writing_sty_focus_i_obj'= How often the 1. person singular pronoun appeared as the object.

'writing_sty_focus_you_sg_obj'= How often the 2. person singular pronoun appeared as the object.

'writing_sty_focus_he_obj'= How often the 3. person singular pronoun appeared as the object.

'writing_sty_focus_we_obj'= How often the 1. person plural pronoun appeared as the object.

'writing_sty_focus_you_pl_obj'= How often the 2. person plural pronoun appeared as the object.

'writing_sty_focus_they_obj'= How often the 3. person plural pronoun appeared as the object.

'writing_sty_focus_i_poss'= How often 1. person singular possessive pronoun appeared.

'writing_sty_focus_you_sg_poss'= How often 2. person singular possessive pronoun appeared.

'writing_sty_focus_he_poss'= How often 3. person singular possessive pronoun appeared.

'writing_sty_focus_we_poss'= How often 1. person plural possessive pronoun appeared.

'writing_sty_focus_you_pl_poss'= How often 2. person plural possessive pronoun appeared.

'writing_sty_focus_they_poss'= How often 3. person plural possessive pronoun appeared.



## Self vs other emotions
Checks the emotions that are either about the self or other people involved. Self focused sentences are defined as sentences where a 1st person pronoun is the subject of a sentence. Other focused sentences are sentences with any other pronoun as subject. Sentences without a pronoun as a subject are ignored (I think). ML based, Dictionary based for emotions.


'writing_sty_self_fear_norm' = How often the emotion fear is in sentences about the self

'writing_sty_self_anger_norm' = How often the emotion anger is in sentences about the self

'writing_sty_self_trust_norm' = How often the emotion trust is in sentences about the self

'writing_sty_self_surprise_norm' = How often the emotion surprise is in sentences about the self

'writing_sty_self_sadness_norm' = How often the emotion sadness is in sentences about the self

'writing_sty_self_disgust_norm' = How often the emotion disgust is in sentences about the self

'writing_sty_self_joy_norm' = How often the emotion joy is in sentences about the self

'writing_sty_self_anticipation_norm' = How often the emotion anticipation is in sentences about the self

'writing_sty_self_positive_norm' = How positive sentences about the self are

'writing_sty_self_negative_norm' = How negative sentences about the self are

'writing_sty_other_fear_norm' = How often the emotion fear is in sentences about others

'writing_sty_other_anger_norm' = How often the emotion anger is in sentences about others

'writing_sty_other_trust_norm' = How often the emotion trust is in sentences about others

'writing_sty_other_surprise_norm' = How often the emotion surprise is in sentences about others

'writing_sty_other_sadness_norm' = How often the emotion sadness is in sentences about others

'writing_sty_other_disgust_norm' = How often the emotion disgust is in sentences about others

'writing_sty_other_joy_norm' = How often the emotion joy is in sentences about others

'writing_sty_other_anticipation_norm' = How often the emotion anticipation is in sentences about others

'writing_sty_other_positive_norm' = How positive sentences about others are

'writing_sty_other_negative_norm' = How negative sentences about others are

## Self vs other profanity
Checks the profanity amount in sentences either about the self or other people involved. Self focused sentences are defined as sentences where a 1st person pronoun is the subject of a sentence. Other focused sentences are sentences with any other pronoun as subject. Sentences without a pronoun as a subject are ignored (I think). ML based, Dictionary based for profanity.


'writing_sty_self_prof' = Profanity in sentences about the self

'writing_sty_other_prof' = Profanity in sentences about others