In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv("data.csv", parse_dates=['CreationDate'])
data = data.sort_values("CreationDate")

In [3]:
data.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
2,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
3,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
4,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ


In [4]:
new_data = []
for ts, tags, y in data[['CreationDate','Tags', 'Y']].values:
    tags_ = re.findall("<([\w\d\-]+)", tags)
    y_ = 1 if 'HQ' in y else 0 
    
    data_pt = [ts, tags_, y_]
    new_data.append(data_pt)

In [5]:
like = new_data[:20000]
train = new_data[20000:40000]
test = new_data[40000:]

In [20]:
train[1]

[Timestamp('2016-12-25 10:55:47'), ['javascript', 'html'], 0]

# Likelihood


In [6]:
global_like = 0

tag_like_y = dict()
tag_like_num = dict()

for ts, tags, y  in like:
    global_like += y
    
    for t in tags:
        tag_like_y[t] = tag_like_y.get(t, 0) + y
        tag_like_num[t] = tag_like_num.get(t, 0) + 1

In [7]:
tag_like = dict()

for tag in tag_like_num.keys():
    tag_like[tag] = tag_like_y[tag] / tag_like_num[tag]

# Bias factor

In [8]:
global_bias = global_like / 20000
    
tag_bias_factor = dict()
for tag, bias in tag_like.items():
    tag_bias_factor[tag] = bias / global_bias

# Bias factor weighted

## P-valor

In [9]:
from scipy.stats import binom

In [10]:
tag_pvals = dict()
global_bias = 0.4093

for tag in tag_like_num.keys():
    k = tag_like_y[tag]
    n = tag_like_num[tag]
    tag_pvals[tag] = np.min([binom.cdf(k=k, n=n, p=global_bias), binom.sf(k=k-1, n=n, p=global_bias)])

In [11]:
def logit(pval, bias_denominator=12): 
    pval = np.max([pval, 1e-5])
    inverse_pval = 1 - pval
    logit_ = np.log(inverse_pval) - np.log(1 - inverse_pval)
    
    return np.max([0, logit_]) / bias_denominator #logit of 1-pval, menor pval = mais peso

tag_pvals_logit = dict()
for tag, pval in tag_pvals.items():
    tag_pvals_logit[tag] = logit(pval)

In [12]:
tag_bias_factor_weighted = dict()

for tag, bias in tag_bias_factor.items():
    tag_bias_factor_weighted[tag] = tag_bias_factor[tag] ** tag_pvals_logit[tag]

# Models

In [13]:
def gen_features(data, tag_dict, return_y=True):
    feature_col = []
    Y = []

    for ts, tags, y in data:

        feature_row = []

        for tag in tags:
            if tag not in tag_dict:
                feature_row.append(1)
                continue
            feature_row.append(tag_dict[tag])
            
        feature_col.append(np.mean(feature_row))
        Y.append(y)
        
    feature_col = np.array(feature_col)
    feature_col[np.isnan(feature_col)] = 1. # nan = 1.
    feature_col = feature_col.reshape(-1,1) #sklearn
    
    Y = np.array(Y)
        
    if return_y:
        return feature_col, Y
    return feature_col

In [14]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

## Tag Like

In [15]:
feature_col_tag_like_tr, Y_tr = gen_features(train, tag_like)
feature_col_tag_like_ts, Y_ts = gen_features(test, tag_like)

mdl = LGBMClassifier(random_state=0)
mdl.fit(feature_col_tag_like_tr, Y_tr)

raw_roc = roc_auc_score(Y_ts, feature_col_tag_like_ts)
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(feature_col_tag_like_ts)[:,1])
print("ROC AUC - Feature = {} - Model = {}".format(raw_roc, model_roc))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ROC AUC - Feature = 0.8828576104225871 - Model = 0.8856311290653981


## Tag Bias Factor

In [16]:
feature_col_tag_bias_factor_tr, Y_tr = gen_features(train, tag_bias_factor)
feature_col_tag_bias_factor_ts, Y_ts = gen_features(test, tag_bias_factor)

mdl = LGBMClassifier(random_state=0)
mdl.fit(feature_col_tag_bias_factor_tr, Y_tr)

raw_roc = roc_auc_score(Y_ts, feature_col_tag_bias_factor_ts)
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(feature_col_tag_bias_factor_ts)[:,1])
print("ROC AUC - Feature = {} - Model = {}".format(raw_roc, model_roc))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ROC AUC - Feature = 0.8605241884273186 - Model = 0.8725653372975385


## Tag P-Vals

In [17]:
feature_col_tag_pvals_tr, Y_tr = gen_features(train, tag_pvals)
feature_col_tag_pvals_ts, Y_ts = gen_features(test, tag_pvals)

mdl = LGBMClassifier(random_state=0)
mdl.fit(feature_col_tag_pvals_tr, Y_tr)

raw_roc = roc_auc_score(Y_ts, feature_col_tag_pvals_ts)
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(feature_col_tag_pvals_ts)[:,1])
print("ROC AUC - Feature = {} - Model = {}".format(raw_roc, model_roc)) ## maior p-val, mais chance de ser de qualidade .... sample size? popularidade da tag?

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ROC AUC - Feature = 0.6775924316381104 - Model = 0.7288903811961154


## Tag Weighted Bias 

In [18]:
feature_col_tag_bias_factor_weighted_tr, Y_tr = gen_features(train, tag_bias_factor_weighted)
feature_col_tag_bias_factor_weighted_ts, Y_ts = gen_features(test, tag_bias_factor_weighted)

mdl = LGBMClassifier(random_state=0)
mdl.fit(feature_col_tag_bias_factor_weighted_tr, Y_tr)

raw_roc = roc_auc_score(Y_ts, feature_col_tag_bias_factor_weighted_ts)
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(feature_col_tag_bias_factor_weighted_ts)[:,1])
print("ROC AUC - Feature = {} - Model = {}".format(raw_roc, model_roc))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


ROC AUC - Feature = 0.8441389961082729 - Model = 0.8625036086374923


## Tags Like and P-Vals

In [19]:
mx_tr = np.hstack([feature_col_tag_like_tr, feature_col_tag_pvals_tr])
mx_ts = np.hstack([feature_col_tag_like_ts, feature_col_tag_pvals_ts])

mdl = LGBMClassifier(random_state=0)
mdl.fit(mx_tr, Y_tr)

raw_roc = roc_auc_score(Y_ts, mx_ts.mean(axis=1))
model_roc = roc_auc_score(Y_ts, mdl.predict_proba(mx_ts)[:,1])
print("ROC AUC - Feature = {} - Model = {}".format(raw_roc, model_roc))

ROC AUC - Feature = 0.8639511544581207 - Model = 0.8895922280392354
