In [1]:
import numpy as np
from sklearn import metrics
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import ast
from sklearn.utils import resample
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

In [2]:
def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for predictions.

    Note the argument order: the predicted labels come first and the true
    labels second, which is the reverse of the scikit-learn metric
    functions called below.

    Returns the tuple ``(accuracy, precision, recall, f1)``.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )

    # Compute every score before printing so that a failing metric leaves
    # no partial report behind.
    values = [score_fn(y_real, y_pred) for _, score_fn in scorers]

    for (label, _), value in zip(scorers, values):
        print(f"{label}:{value}")
    return tuple(values)

In [3]:
# Load the labelled sentences and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data_raw.csv')
df.head(n=5)

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
0,I hate thugs.,hateful,racist,yes,1
1,I really can't stand thugs like you.,hateful,racist,yes,1
2,I despise thugs.,hateful,racist,yes,1
3,I detest thugs.,hateful,racist,yes,1
4,I absolutely loathe thugs.,hateful,racist,yes,1


In [4]:
# Features are the raw sentences; labels come from the `target` column.
X = df['case_templ']
y = df['target']

# Hold out 20% of the rows for testing, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=14
)

# Re-join training features and labels so rows can be filtered by class,
# then show the class balance of the training split.
Xy_train = pd.concat([X_train, y_train], axis='columns')
Xy_train['target'].value_counts()

1    5833
0    2150
Name: target, dtype: int64

# Upsampling

In [5]:
# Split the training frame by class; label 1 is the larger class in this
# split (see the value counts printed above).
majority = Xy_train[Xy_train['target'] == 1]
minority = Xy_train[Xy_train['target'] == 0]

# Sample the minority class with replacement until it matches the majority
# class. The target size is derived from the data instead of being
# hard-coded, so the classes stay balanced if the data or split changes.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=14,
)

# Recombine and confirm that both classes now have the same row count.
upsampled = pd.concat([majority, minority_upsampled])
upsampled['target'].value_counts()

1    5833
0    5833
Name: target, dtype: int64

In [6]:
# Separate the balanced training frame back into features and labels.
X_train_up, y_train_up = upsampled['case_templ'], upsampled['target']

# TF-IDF Array

In [7]:
# TF-IDF features: lowercase the text, drop English stop words and
# L1-normalise each document vector.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    norm='l1',
)

In [8]:
# Fit the vocabulary and IDF weights on the upsampled training text only...
X_train_up_vec = vectorizer.fit_transform(raw_documents=X_train_up)
# ...and reuse the fitted vectorizer for the held-out test text.
X_test_vec = vectorizer.transform(raw_documents=X_test)

# Train Perceptron

In [9]:
# Train a Perceptron on the balanced TF-IDF features, with a fixed seed.
classifier = Perceptron(random_state=14)
classifier.fit(X=X_train_up_vec, y=y_train_up)

In [10]:
# Predict labels for the held-out test split.
predictions = classifier.predict(X=X_test_vec)

# Headline accuracy, rounded to two decimals for display.
score = np.round(metrics.accuracy_score(y_test, predictions), decimals=2)
print('Mean accuracy of predictions: ' + f'{score}')

# Full metric report; apr takes the predictions first, then the true labels.
apr(y_pred=predictions, y_real=y_test)

Mean accuracy of predictions: 0.97
Accuracy:0.9679358717434869
Precision:0.9725085910652921
Recall:0.9833217512161223
F1:0.9778852798894264


(0.9679358717434869,
 0.9725085910652921,
 0.9833217512161223,
 0.9778852798894264)

# Testing on New Data

#### Creating New Datasets

In [11]:
# Load the additional, separately collected test sentences.
half_test_df = pd.read_csv(filepath_or_buffer='raw_half_test.csv')

In [12]:
# Put the held-out features and labels side by side in one frame.
Xy_test = pd.concat([X_test, y_test], axis='columns')

In [13]:
# Keep only the text and label columns of the new data (as an independent copy).
half_test = half_test_df.loc[:, ['case_templ', 'target']].copy()

# Stack the original held-out rows and the new rows into one evaluation set.
half_test_combined = pd.concat([Xy_test, half_test], axis=0)

In [14]:
# Features and labels for the combined set (original test split + new rows).
half_test_combined_X, half_test_combined_y = (
    half_test_combined['case_templ'],
    half_test_combined['target'],
)

# Features and labels for the new rows on their own.
half_test_X, half_test_y = half_test['case_templ'], half_test['target']

#### Vectorize

In [15]:
# Vectorize both evaluation sets with the already-fitted vectorizer
# (transform only, no re-fitting).
half_test_combined_X_vec, half_test_X_vec = (
    vectorizer.transform(texts)
    for texts in (half_test_combined_X, half_test_X)
)

#### Predictions

In [16]:
# Predict labels for both evaluation sets with the trained classifier.
half_test_combined_pred, half_test_pred = (
    classifier.predict(features)
    for features in (half_test_combined_X_vec, half_test_X_vec)
)

#### Results

In [17]:
# First report: combined set (original test split plus the new rows).
apr(y_pred=half_test_combined_pred, y_real=half_test_combined_y)
# Second report: the new rows only.
apr(y_pred=half_test_pred, y_real=half_test_y)

Accuracy:0.967098703888335
Precision:0.9719178082191781
Recall:0.9826869806094183
F1:0.9772727272727272
Accuracy:0.8
Precision:0.8
Recall:0.8
F1:0.8000000000000002


(0.8, 0.8, 0.8, 0.8000000000000002)

#### Results Table

In [18]:
# Per-sentence results table for the new rows.
# NOTE: despite its name, the `probability` column holds the raw
# decision_function output (negative values appear in the output below),
# not a probability in [0, 1].
half_test_results = half_test_df.assign(
    probability=classifier.decision_function(half_test_X_vec),
    prediction=classifier.predict(half_test_X_vec),
)
half_test_results

Unnamed: 0,case_templ,target,probability,prediction
0,obama is another deadbeat dad,1,0.773074,1
1,another example of the dangerous impact of the...,1,-0.057616,0
2,our streets are now overrun with dindus,1,0.577932,1
3,the demographic of my small town is now what y...,1,0.305509,1
4,this fits right into the cosmopolitan agenda,1,0.3458,1
5,politicians are right scumbags,0,-0.371002,0
6,"ah, a cup of tea, the true great british pasttime",0,-0.567485,0
7,I really hate what the council is doing to our...,0,-0.186486,0
8,Modern media like netflix and hulu are so bori...,0,0.496008,1
9,people do not believe me when I say turkey is ...,0,-0.744219,0


In [19]:
# Save the per-sentence results table next to the notebook.
half_test_results.to_csv(path_or_buf='Perceptron.csv')