In [1]:
import numpy as np
from model.helper import calculate_metrics
from sklearn.linear_model import LinearRegression
from data.preprocess import load_preprocessed_data
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


In [2]:
# Load the "sexism_binary" dataset and show the shape of every split.
data1 = load_preprocessed_data("sexism_binary")

print(
    *(data1[key].shape
      for key in ("x_train", "y_train", "x_valid", "y_valid", "x_test", "y_test")),
    sep="\n"
)

preprocessed file already exists in /home/homes/jhpark/hate-speech/data/preprocessed/
(11401,)
(11401,)
(2445,)
(2445,)
(2446,)
(2446,)


In [3]:
# Load the "racism_binary" dataset and show the shape of every split.
data2 = load_preprocessed_data("racism_binary")

print(
    *(data2[key].shape
      for key in ("x_train", "y_train", "x_valid", "y_valid", "x_test", "y_test")),
    sep="\n"
)

preprocessed file already exists in /home/homes/jhpark/hate-speech/data/preprocessed/
(10138,)
(10138,)
(2174,)
(2174,)
(2175,)
(2175,)


In [17]:
def evaluate(pred_scores, target, threshold=0.5):
    """Binarise real-valued scores and report precision / recall / F1.

    Args:
        pred_scores: Iterable of real-valued scores, one per example.
        target: Ground-truth labels, forwarded unchanged to
            ``calculate_metrics``.
        threshold: Scores greater than or equal to this value are mapped to
            class 1, everything else to class 0. Defaults to 0.5.

    Returns:
        The F1 value produced by ``calculate_metrics``.
    """
    pred = [1 if score >= threshold else 0 for score in pred_scores]
    precision, recall, f1 = calculate_metrics(target, pred)
    # No split name in the message: callers print their own "Training:" /
    # "Validation:" header, and a hard-coded "Training:" prefix mislabelled
    # the validation metrics.
    print("Precision=%.2f Recall=%.2f, F1=%.2f" % (precision, recall, f1))
    return f1

In [5]:
def ngram_test(ngram_type, data):
    """Sweep n-gram ranges for a count-based linear model and report the best.

    For every candidate ``ngram_range`` a ``CountVectorizer`` is fitted on
    ``data["x_train"]``, a ``LinearRegression`` is fitted on the resulting
    count matrix, and ``evaluate`` is called on both the training and the
    validation split. The range with the highest validation F1 is printed
    at the end.

    Args:
        ngram_type: Passed to ``CountVectorizer`` as ``analyzer``.
        data: Mapping providing the "x_train", "y_train", "x_valid" and
            "y_valid" entries.

    Returns:
        None. All results are printed.
    """
    lr = LinearRegression()

    best_param = None
    # Start below any attainable score so the first candidate is always kept.
    best_f1 = float("-inf")

    for param in [(1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5)]:
        print("ngram-counts=" + str(param))

        print("\nTraining:")
        vect = CountVectorizer(ngram_range=param, analyzer=ngram_type)
        X_train_counts = vect.fit_transform(data["x_train"])
        print(X_train_counts.shape)

        lr.fit(X_train_counts, data["y_train"])
        evaluate(lr.predict(X_train_counts), data["y_train"])

        print("\nValidation:")
        # Reuse the vocabulary learned on the training split.
        X_valid_counts = vect.transform(data["x_valid"])
        f1 = evaluate(lr.predict(X_valid_counts), data["y_valid"])

        # `evaluate` returns only F1, so model selection tracks that scalar.
        if f1 > best_f1:
            best_f1 = f1
            best_param = param

    print("\n\nBEST parameter for %s is %s" % (ngram_type, str(best_param)))
    print("Validation F1 score=%.4f" % best_f1)

## Word-ngram

### Sexism

In [6]:
# Sweep word-level n-gram ranges on the sexism dataset.
ngram_test(ngram_type="word", data=data1)

ngram-counts=(1, 2)

Training:
(11401, 94151)
Training: Precision=0.99 Recall=0.98, F1=0.99

Validation:
Training: Precision=0.65 Recall=0.54, F1=0.59
ngram-counts=(1, 3)

Training:
(11401, 206414)
Training: Precision=0.99 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.66 Recall=0.53, F1=0.59
ngram-counts=(1, 4)

Training:
(11401, 317519)
Training: Precision=0.98 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.67 Recall=0.52, F1=0.59
ngram-counts=(1, 5)

Training:
(11401, 420433)
Training: Precision=0.98 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.67 Recall=0.51, F1=0.58
ngram-counts=(2, 3)

Training:
(11401, 192342)
Training: Precision=0.98 Recall=0.99, F1=0.98

Validation:
Training: Precision=0.70 Recall=0.31, F1=0.43
ngram-counts=(2, 4)

Training:
(11401, 303447)
Training: Precision=0.99 Recall=0.98, F1=0.98

Validation:
Training: Precision=0.71 Recall=0.28, F1=0.40
ngram-counts=(2, 5)

Training:
(11401, 406361)
Training: Precision=0.99 Recall=0.98, F1=0

### Racism

In [7]:
# Sweep word-level n-gram ranges on the racism dataset.
ngram_test(ngram_type="word", data=data2)

ngram-counts=(1, 2)

Training:
(10138, 85960)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.79 Recall=0.57, F1=0.66
ngram-counts=(1, 3)

Training:
(10138, 187925)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.81 Recall=0.55, F1=0.66
ngram-counts=(1, 4)

Training:
(10138, 288810)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.83 Recall=0.53, F1=0.65
ngram-counts=(1, 5)

Training:
(10138, 382131)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.83 Recall=0.51, F1=0.63
ngram-counts=(2, 3)

Training:
(10138, 175018)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.79 Recall=0.31, F1=0.45
ngram-counts=(2, 4)

Training:
(10138, 275903)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.82 Recall=0.29, F1=0.43
ngram-counts=(2, 5)

Training:
(10138, 369224)
Training: Precision=1.00 Recall=1.00, F1=1

## Char-ngram

### Sexism

In [9]:
# Sweep character-level n-gram ranges on the sexism dataset.
ngram_test(ngram_type="char", data=data1)

ngram-counts=(1, 2)

Training:
(11401, 2643)
Training: Precision=0.88 Recall=0.63, F1=0.73

Validation:
Training: Precision=0.71 Recall=0.53, F1=0.60
ngram-counts=(1, 3)

Training:
(11401, 17602)
Training: Precision=0.99 Recall=0.97, F1=0.98

Validation:
Training: Precision=0.35 Recall=0.55, F1=0.43
ngram-counts=(1, 4)

Training:
(11401, 70131)
Training: Precision=0.99 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.53 Recall=0.61, F1=0.57
ngram-counts=(1, 5)

Training:
(11401, 195501)
Training: Precision=0.98 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.66 Recall=0.62, F1=0.64
ngram-counts=(2, 3)

Training:
(11401, 17375)
Training: Precision=0.99 Recall=0.97, F1=0.98

Validation:
Training: Precision=0.35 Recall=0.55, F1=0.43
ngram-counts=(2, 4)

Training:
(11401, 69904)
Training: Precision=0.99 Recall=0.98, F1=0.99

Validation:
Training: Precision=0.54 Recall=0.62, F1=0.57
ngram-counts=(2, 5)

Training:
(11401, 195274)
Training: Precision=0.98 Recall=0.99, F1=0.99



### Racism

In [10]:
# Sweep character-level n-gram ranges on the racism dataset.
ngram_test(ngram_type="char", data=data2)

ngram-counts=(1, 2)

Training:
(10138, 2577)
Training: Precision=0.87 Recall=0.67, F1=0.76

Validation:
Training: Precision=0.71 Recall=0.56, F1=0.63
ngram-counts=(1, 3)

Training:
(10138, 16615)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.24 Recall=0.58, F1=0.34
ngram-counts=(1, 4)

Training:
(10138, 64916)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.51 Recall=0.60, F1=0.55
ngram-counts=(1, 5)

Training:
(10138, 179045)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.71 Recall=0.63, F1=0.67
ngram-counts=(2, 3)

Training:
(10138, 16378)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.24 Recall=0.57, F1=0.34
ngram-counts=(2, 4)

Training:
(10138, 64679)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.51 Recall=0.60, F1=0.55
ngram-counts=(2, 5)

Training:
(10138, 178808)
Training: Precision=1.00 Recall=1.00, F1=1.00



## Combining results

In [19]:
def get_best(data, ngram_range=(2, 5), analyzer="char"):
    """Fit the selected n-gram model and return it with its validation scores.

    A ``CountVectorizer`` is fitted on ``data["x_train"]`` and a
    ``LinearRegression`` on the resulting counts. Training and validation
    metrics are printed through ``evaluate``.

    Args:
        data: Mapping providing the "x_train", "y_train", "x_valid" and
            "y_valid" entries.
        ngram_range: ``ngram_range`` passed to ``CountVectorizer``.
            Defaults to ``(2, 5)``.
        analyzer: ``analyzer`` passed to ``CountVectorizer``.
            Defaults to ``"char"``.

    Returns:
        Tuple ``(lr, pred_scores)``: the fitted regressor and its raw,
        un-thresholded predictions on the validation split.
    """
    lr = LinearRegression()
    vect = CountVectorizer(ngram_range=ngram_range, analyzer=analyzer)
    X_train_counts = vect.fit_transform(data["x_train"])
    print(X_train_counts.shape)

    lr.fit(X_train_counts, data["y_train"])
    evaluate(lr.predict(X_train_counts), data["y_train"])

    print("\nValidation:")
    # Reuse the vocabulary learned on the training split.
    X_valid_counts = vect.transform(data["x_valid"])
    pred_scores = lr.predict(X_valid_counts)
    evaluate(pred_scores, data["y_valid"])
    return lr, pred_scores

In [20]:
# One model per dataset; keep the fitted regressor and its validation scores.
s_lr, s_pred = get_best(data=data1)
r_lr, r_pred = get_best(data=data2)

(11401, 195274)
Training: Precision=0.98 Recall=0.99, F1=0.99

Validation:
Training: Precision=0.65 Recall=0.62, F1=0.63
(10138, 178808)
Training: Precision=1.00 Recall=1.00, F1=1.00

Validation:
Training: Precision=0.71 Recall=0.63, F1=0.67


In [21]:
# Binarise the validation scores with the same >= 0.5 rule used in `evaluate`.
s_pred = [1 if score >= 0.5 else 0 for score in s_pred]
r_pred = [1 if score >= 0.5 else 0 for score in r_pred]


In [45]:
def count(data, pred, truth):
    """Tally true positives, false positives and false negatives.

    Every false positive also has its text, looked up in
    ``data["x_valid"]`` at the same position, printed followed by a blank
    line.

    Args:
        data: Mapping whose "x_valid" entry holds the validation texts.
        pred: Predicted labels (0 or 1), indexable by position.
        truth: Ground-truth labels, same length as ``pred``.

    Returns:
        Tuple ``(true_positive, false_positive, false_negative)``.
    """
    assert len(pred) == len(truth)
    hits, false_alarms, misses = 0, 0, 0

    for idx in range(len(pred)):
        guess = pred[idx]

        if truth[idx] == 1:
            # Positive example: it was either found or missed.
            if guess == 1:
                hits += 1
            elif guess == 0:
                misses += 1
            continue

        # Negative example: only a positive prediction is counted.
        if guess == 1:
            print(data["x_valid"][idx] + "\n")
            false_alarms += 1

    return hits, false_alarms, misses


In [46]:
# Error counts for the sexism classifier on its validation split.
tp_1, fp_1, fn_1 = count(data=data1, pred=s_pred, truth=data1["y_valid"])

yesallwomen are going to shout over wastes of carbon material like until they are heard. you are a drop in the ocean, pal.

stan got this a+ last word before blocking me: ever heard of caustic sarcasm? rename yourself thanks awfully *mwah!* xxx

apparently walking a catwalk involves putting one foot in front of the other...you know, like walking mkr

really a cat walk that's so ridiculous mkr

i believe so. should be a client-side option (blocks processing happens client side as well)

calls for a march of a million tomorrow to remove jordan from coalition, but some mourners say this is not the time or p…

my latest re patricia arquette. | the road to structural erasure is paved with good-intentioned white ladies ablc http…

yay. i went last year and had to make new friends. i hate making new friends. i didn't know anyone else going.

i hate racism comment on blameonenotall: do men who don't hurt women actually need to be congratulated?

watching "india's daughter", horrifying. lawyer 

In [32]:
# Error counts for the racism classifier on its validation split.
# `count` takes (data, pred, truth); the dataset must be passed first so the
# false-positive texts can be looked up.
tp_2, fp_2, fn_2 = count(data2, r_pred, data2["y_valid"])

In [34]:
# Pooled precision: true positives of both classifiers over all their
# positive predictions.
precision = sum((tp_1, tp_2)) / sum((tp_1, tp_2, fp_1, fp_2))
print(precision)

0.6727493917274939


In [35]:
# Pooled recall: true positives of both classifiers over all their
# actual positives.
recall = sum((tp_1, tp_2)) / sum((tp_1, tp_2, fn_1, fn_2))
print(recall)

0.6220472440944882


In [36]:
# F1: harmonic mean of the pooled precision and recall computed above.
print(2*precision*recall/(precision+recall))

0.6464056107539452
