In [2]:
import numpy as np
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import ast
from sklearn.utils import resample

In [3]:
def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Argument order matters: the predicted labels come first and the true
    labels second. Each score is printed on its own line as ``Name:value``
    and the four scores are returned as the tuple
    ``(accuracy, precision, recall, f1)``.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )

    # Compute every score before printing, so a failing metric leaves no partial report.
    scores = [(label, scorer(y_real, y_pred)) for label, scorer in scorers]

    for label, value in scores:
        print(f"{label}:{value}")
    return tuple(value for _, value in scores)

In [4]:
# Load the raw dataset and preview the first five rows (head() defaults to 5).
df = pd.read_csv('data_raw.csv')
df.head()

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
0,I hate thugs.,hateful,racist,yes,1
1,I really can't stand thugs like you.,hateful,racist,yes,1
2,I despise thugs.,hateful,racist,yes,1
3,I detest thugs.,hateful,racist,yes,1
4,I absolutely loathe thugs.,hateful,racist,yes,1


In [5]:
# Features are the text column; labels are the `target` column.
X, y = df['case_templ'], df['target']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
)

# Re-join the training features and labels so the class balance can be inspected.
Xy_train = pd.concat([X_train, y_train], axis=1)
Xy_train['target'].value_counts()

1    5833
0    2150
Name: target, dtype: int64

In [6]:
# Balance the training set by upsampling class 0 (with replacement) until it
# matches the number of class-1 rows.
majority = Xy_train[Xy_train['target'] == 1]
minority = Xy_train[Xy_train['target'] == 0]

# Derive the sample count from the data instead of hard-coding it, so the
# classes stay balanced if the dataset or the train/test split changes.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=14,
)

upsampled = pd.concat([majority, minority_upsampled])
upsampled['target'].value_counts()

1    5833
0    5833
Name: target, dtype: int64

In [7]:
# Split the balanced training frame back into text features and labels.
X_train_up, y_train_up = upsampled['case_templ'], upsampled['target']

In [8]:
# TF-IDF features: lowercase the text, drop English stop words, L1-normalise rows.
vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    norm='l1',
)

# Fit the vocabulary on the (upsampled) training text only, then reuse the
# fitted vectorizer to transform the held-out test text.
X_train_up_vec = vectorizer.fit_transform(X_train_up)
X_test_vec = vectorizer.transform(X_test)

In [9]:
# MLP with a single hidden layer of 5 ReLU units. No solver is passed, so
# scikit-learn's default solver is used.
# The former `learning_rate='invscaling'` argument was removed: scikit-learn
# documents that option as only used when solver='sgd', so it had no effect
# here and only suggested a learning-rate schedule that was never applied.
classifier = MLPClassifier(
    hidden_layer_sizes=(5,),  # tuple form; same single 5-unit layer as before
    max_iter=200,
    activation='relu',
    verbose=10,               # prints the loss at every iteration
    random_state=14,
)
classifier.fit(X_train_up_vec, y_train_up)

Iteration 1, loss = 0.78758231
Iteration 2, loss = 0.72924825
Iteration 3, loss = 0.66834375
Iteration 4, loss = 0.61016830
Iteration 5, loss = 0.55766835
Iteration 6, loss = 0.50419484
Iteration 7, loss = 0.45335726
Iteration 8, loss = 0.40755089
Iteration 9, loss = 0.36695053
Iteration 10, loss = 0.33163238
Iteration 11, loss = 0.30106739
Iteration 12, loss = 0.27452752
Iteration 13, loss = 0.25160281
Iteration 14, loss = 0.23172702
Iteration 15, loss = 0.21438019
Iteration 16, loss = 0.19913051
Iteration 17, loss = 0.18568258
Iteration 18, loss = 0.17379738
Iteration 19, loss = 0.16314073
Iteration 20, loss = 0.15355134
Iteration 21, loss = 0.14488313
Iteration 22, loss = 0.13701511
Iteration 23, loss = 0.12984813
Iteration 24, loss = 0.12334920
Iteration 25, loss = 0.11735998
Iteration 26, loss = 0.11184729
Iteration 27, loss = 0.10679931
Iteration 28, loss = 0.10213576
Iteration 29, loss = 0.09775391
Iteration 30, loss = 0.09370452
Iteration 31, loss = 0.08985153
Iteration 32, los

# Testing on custom data

#### Creating Data

In [10]:
# Custom examples: keep only the text and label columns.
half_test_df = pd.read_csv('raw_half_test.csv')
half_test = half_test_df.loc[:, ['case_templ', 'target']].copy()
half_test_X, half_test_y = half_test['case_templ'], half_test['target']

# Held-out split from the original data with the custom examples appended.
Xy_test = pd.concat([X_test, y_test], axis=1)
half_test_combined = pd.concat([Xy_test, half_test])
half_test_combined_X, half_test_combined_y = (
    half_test_combined['case_templ'],
    half_test_combined['target'],
)

#### Vectorizing

In [11]:
# Transform both evaluation sets with the vectorizer fitted on the training text.
half_test_combined_X_vec, half_test_X_vec = (
    vectorizer.transform(texts)
    for texts in (half_test_combined_X, half_test_X)
)

#### Predictions

In [12]:
# Predict labels for the combined set and for the custom examples alone.
half_test_combined_pred, half_test_pred = (
    classifier.predict(features)
    for features in (half_test_combined_X_vec, half_test_X_vec)
)

#### Results

In [13]:
# First metric block printed: held-out split combined with the custom examples.
apr(half_test_combined_pred, half_test_combined_y)
# Second metric block printed: the custom examples alone. Because this call is
# the cell's last expression, its returned tuple is also echoed after the
# printed lines.
apr(half_test_pred, half_test_y)

Accuracy:0.9720837487537388
Precision:0.969553450608931
Recall:0.9923822714681441
F1:0.9808350444900753
Accuracy:0.7
Precision:0.75
Recall:0.6
F1:0.6666666666666665


(0.7, 0.75, 0.6, 0.6666666666666665)

#### Results Table

In [14]:
# Vectorize the custom examples and get hard label predictions for them.
half_test_X_vec = vectorizer.transform(half_test_X)
half_test_y_pred = classifier.predict(half_test_X_vec)

# predict_proba returns one column per class; keep column index 1
# (presumably the probability of label 1 - confirm against classifier.classes_).
half_test_class_proba = classifier.predict_proba(half_test_X_vec)
half_test_y_proba = half_test_class_proba[:, 1]

In [15]:
# Build the results table on a copy of the custom data (assign returns a new
# frame), adding the rounded probability and the predicted label per row.
half_test_results = half_test_df.assign(
    probability=np.round(half_test_y_proba, 5),
    prediction=half_test_y_pred,
)
half_test_results

Unnamed: 0,case_templ,target,probability,prediction
0,obama is another deadbeat dad,1,1.0,1
1,another example of the dangerous impact of the...,1,9e-05,0
2,our streets are now overrun with dindus,1,1.0,1
3,the demographic of my small town is now what y...,1,0.63762,1
4,this fits right into the cosmopolitan agenda,1,0.00454,0
5,politicians are right scumbags,0,0.0,0
6,"ah, a cup of tea, the true great british pasttime",0,0.0,0
7,I really hate what the council is doing to our...,0,0.00087,0
8,Modern media like netflix and hulu are so bori...,0,0.99999,1
9,people do not believe me when I say turkey is ...,0,0.0,0


In [16]:
# Save the results table; the row index is written too (to_csv's default).
half_test_results.to_csv('MLP.csv', index=True)