In [48]:
import random
import os
from tqdm import tqdm
from typing import *
from collections import defaultdict
from math import log2
import string
from nltk.stem import PorterStemmer
import numpy as np

from sklearn.linear_model import LogisticRegression

from predeal_dataset import *

DICTIONARY_SIZE = 1000

puncts = string.punctuation

stemmer = PorterStemmer()

def tokenize(sentence:str,use_stemmer:bool=True)->List[str]:
    sentence = sentence.replace("(","( ").replace("[","[ ").replace("{","{ ")
    res = list()
    tmp = sentence.split()
    for word in tmp:
        if len(word)==0:
            continue
        if word[-1] in puncts:
            p = word[-1]
            word = word[:-1]
            if len(word)>0:
                word = word.lower()
                if use_stemmer:
                    word = stemmer.stem(word)
                res.append(word)
            res.append(p)
        else:
            word = word.lower()
            if use_stemmer:
                word = stemmer.stem(word)
            res.append(word)
    return res


tokenize("I love eatting bananas!")

['i', 'love', 'eat', 'banana', '!']

In [2]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
spoiler_dataset = sample_sub_spoiler_set(SUBSET_SENTENCE_CNT)

In [4]:
word_cnt = defaultdict(int)

for datum in tqdm(spoiler_dataset):
    sentence = datum['review_sentence']
    for word in tokenize(sentence):
        word_cnt[word]+=1

100%|██████████| 50000/50000 [01:46<00:00, 468.74it/s]


In [5]:
word_with_freq = list(word_cnt.items())
word_with_freq.sort(key=lambda tup:tup[1],reverse=True)

In [6]:
dictionary = word_with_freq[:DICTIONARY_SIZE]
dictionary = list(map(lambda tup:tup[0],dictionary))
word2id = {word:i for i,word in enumerate(dictionary)}

In [77]:
def features(datum):
    res = [0]*DICTIONARY_SIZE
    words = tokenize(datum['review_sentence'])
    for word in words:
        if not word in dictionary:
            continue
        res[word2id[word]]+=1
    res.append(len(words)+1)
    return res

In [78]:
def getXsAndYs(spoiler_dataset:List[dict])->Tuple[np.ndarray,np.ndarray]:
    resX = list()
    resY = list()
    for datum in tqdm(spoiler_dataset):
        resX.append(features(datum))
        resY.append(datum['label'])
    return np.array(resX,dtype=float),np.array(resY)

In [79]:
random.seed(42)
random.shuffle(spoiler_dataset)
trainset,validset,testset = spoiler_dataset[:TRAIN_SET_CNT],spoiler_dataset[TRAIN_SET_CNT:-TEST_SET_CNT],spoiler_dataset[-TEST_SET_CNT:]


In [80]:
trainX,trainY = getXsAndYs(trainset)
validX,validY = getXsAndYs(validset)
testX,testY = getXsAndYs(testset)

  0%|          | 0/40000 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [01:40<00:00, 399.70it/s]
100%|██████████| 5000/5000 [00:13<00:00, 363.32it/s]
100%|██████████| 5000/5000 [00:13<00:00, 367.17it/s]


In [81]:
class_weights = dict(zip(np.unique(trainY), len(trainY) / (len(np.unique(trainY)) * np.bincount(trainY))))
model = LogisticRegression(penalty='l2', C=1.0, class_weight=class_weights)
model.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
def get_best_ths_with_ber(pred_prop_with_label): 
    pred_prop_with_label.sort(reverse=True)
    valid_set_pos_cnt = sum(tup[1] for tup in pred_prop_with_label)
    valid_set_neg_cnt = len(pred_prop_with_label)-valid_set_pos_cnt
    best_ths = 1.0
    best_ber = 0.5
    curr_false_positive = 0
    curr_false_negative = valid_set_pos_cnt
    for (prob,label) in pred_prop_with_label:
        ths = prob-0.00001
        if label==1:
            curr_false_negative-=1
        else:
            curr_false_positive+=1
        ber = 0.5*(curr_false_negative/valid_set_pos_cnt+curr_false_positive/valid_set_neg_cnt)
        if ber<best_ber:
            best_ber = ber
            best_ths = ths
    return best_ths,best_ber
get_best_ths_with_ber(list(zip(map(lambda tup:tup[1],model.predict_proba(validX)),validY)))

(0.4276618023428124, 0.2963662752137095)

In [82]:
validYPred = model.predict(validX)

In [83]:
def get_performance_info(y_actual,y_predict):
    y_actual = np.array(y_actual)
    y_predict = np.array(y_predict)
    y_actual = y_actual.reshape((-1,))
    y_predict = y_predict.reshape((-1,))
    TP = np.sum((y_actual == 1) & (y_predict == 1))
    FP = np.sum((y_actual == 0) & (y_predict == 1))
    TN = np.sum((y_actual == 0) & (y_predict == 0))
    FN = np.sum((y_actual == 1) & (y_predict == 0))
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    accu = np.sum(y_actual==y_predict)/len(y_actual)
    return accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

In [84]:
get_performance_info(validY,validYPred)

(0.7648,
 196,
 1041,
 3628,
 135,
 0.5921450151057401,
 0.22295994859713,
 0.77704005140287,
 0.4078549848942598,
 0.315407466745695)