In [64]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.utils import resample

In [65]:
# Load the first dataset; the cells below read its 'lemmatized' and 'target' columns.
df = pd.read_csv('data_lem.csv')

In [66]:
import ast ## This module can be used to evaluate literals, eg: transform string-lists back into lists
def extract_genres(x):
    """Parse a stringified Python literal (e.g. "['a', 'b']") back into an object.

    Despite its name, this notebook only uses it to turn the stringified token
    lists stored in the CSV 'lemmatized' column back into real lists.

    Parameters
    ----------
    x : str or list or tuple
        Literal source text, or a value that has already been parsed.

    Returns
    -------
    object
        The evaluated literal. A list/tuple input is returned unchanged, so
        re-running the conversion cell on already-parsed data does not fail.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``x`` is not a valid literal.
    """
    if isinstance(x, (list, tuple)):
        # Already parsed (for example the cell was executed twice): nothing to do.
        return x
    return ast.literal_eval(x)

In [67]:
# Turn each stringified token list from the CSV back into a real Python list.
df['lemmatized'] = df['lemmatized'].map(extract_genres)

In [68]:
# Collapse every token list into a single space-separated string for the vectorizer.
df['lemmatized'] = df['lemmatized'].map(' '.join)

In [69]:
# Features are the joined lemma strings; labels are the 'target' column.
X_a, y_a = df['lemmatized'], df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X_a,
    y_a,
    test_size=0.2,
    random_state=14,
)

In [70]:
# Put training texts and labels side by side so rows can be filtered by class.
Xy_train_a = pd.concat([X_train_a, y_train_a], axis='columns')
Xy_train_a

Unnamed: 0,lemmatized,target
1839,nothing common ghetto trahs,1
7092,never support politician treat autoandrophiles...,0
9024,I'd rather cancre anywhere near Aiden,1
5473,banker c n c e r,1
9764,Saying make bit bastard,0
...,...,...
7526,clownfishs beautiful,0
6471,hurt autogynephiles like,1
2454,bings belong zoo,1
9484,obvious troons opposite stupid,0


In [71]:
# Class balance of the training split before any resampling.
Xy_train_a['target'].value_counts()

1    5833
0    2150
Name: target, dtype: int64

In [72]:
# Balance the training set: oversample the smaller class (target == 0) with
# replacement until it has as many rows as the larger class (target == 1).
majority = Xy_train_a[Xy_train_a['target'] == 1]
minority = Xy_train_a[Xy_train_a['target'] == 0]
minority_upsampled = resample(
    minority,
    replace=True,
    # Derived from the data rather than hard-coded (was 5833), so the classes
    # stay balanced if the dataset or the split ever changes.
    n_samples=len(majority),
    random_state=14,
)
upsampled = pd.concat([majority, minority_upsampled])
upsampled['target'].value_counts()

1    5833
0    5833
Name: target, dtype: int64

In [73]:
# Split the balanced frame back into model inputs and labels.
X_train_up_a, y_train_up_a = upsampled['lemmatized'], upsampled['target']

# Ngrams Main Dataset

In [74]:
# ngram_list = [(1,1), (1,2), (1,3), (2,2)]
# train_accuracy = []
# test_accuracy = []
# for x, y in ngram_list:
#     vectorizer = CountVectorizer(analyzer = "word",
#                                  lowercase=True,
#                                  tokenizer = None,
#                                  preprocessor = None,
#                                  stop_words = None,
#                                  ngram_range=(x, y),
#                                  max_features = 340)
#
#     X_train_up_vec = vectorizer.fit_transform(X_train_up_a)
#     X_test_vec = vectorizer.transform(X_test_a)
#
#     classifier = LogisticRegression(max_iter=1000)
#     classifier.fit(X_train_up_vec, y_train_up_a)
#
#     y_pred_train = classifier.predict(X_train_up_vec)
#     acc = accuracy_score(y_train_up_a, y_pred_train)
#     train_accuracy.append(acc)
#
#     y_pred_test = classifier.predict(X_test_vec)
#     acc2 = accuracy_score(y_test_a, y_pred_test)
#     test_accuracy.append(acc2)
#
# data = {'ngram_range': ngram_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}
#
# ngram_df = pd.DataFrame(data)

In [75]:
# ngram_df

##### Model with ngrams 1,2 and normal data

In [76]:
# Bag-of-words over unigrams and bigrams, with the vocabulary capped at 300 features.
vectorizer_a = CountVectorizer(
    analyzer="word",
    lowercase=True,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=300,
    ngram_range=(1, 2),
)

# Fit the vocabulary on the balanced training texts only, then reuse it for the test texts.
X_train_up_vec_a = vectorizer_a.fit_transform(X_train_up_a)
X_test_vec_a = vectorizer_a.transform(X_test_a)

# fit() returns the estimator itself, so it can be bound and displayed in one go.
classifier_a = LogisticRegression(max_iter=1000).fit(X_train_up_vec_a, y_train_up_a)
classifier_a

## N-grams on the Dataset With Stop Words Retained

In [77]:
# Load the second dataset; judging by the preview below, its 'lemmatized'
# column keeps stop words (unlike 'case_templ_no_stopwords').
new = pd.read_csv('data_ngrams.csv')
new

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target,case_templ_token,case_templ_no_stopwords,case_templ_clean,lemmatized
0,I hate thugs.,hateful,racist,yes,1,"['I', 'hate', 'thugs', '.']","['hate', 'thugs', '.']","['I', 'hate', 'thugs']","['I', 'hate', 'thug']"
1,I really can't stand thugs like you.,hateful,racist,yes,1,"['I', 'really', ""can't"", 'stand', 'thugs', 'li...","['really', ""can't"", 'stand', 'thugs', 'like', ...","['I', 'really', ""can't"", 'stand', 'thugs', 'li...","['I', 'really', ""can't"", 'stand', 'thug', 'lik..."
2,I despise thugs.,hateful,racist,yes,1,"['I', 'despise', 'thugs', '.']","['despise', 'thugs', '.']","['I', 'despise', 'thugs']","['I', 'despise', 'thug']"
3,I detest thugs.,hateful,racist,yes,1,"['I', 'detest', 'thugs', '.']","['detest', 'thugs', '.']","['I', 'detest', 'thugs']","['I', 'detest', 'thug']"
4,I absolutely loathe thugs.,hateful,racist,yes,1,"['I', 'absolutely', 'loathe', 'thugs', '.']","['absolutely', 'loathe', 'thugs', '.']","['I', 'absolutely', 'loathe', 'thugs']","['I', 'absolutely', 'loathe', 'thug']"
...,...,...,...,...,...,...,...,...,...
9974,Pro lifers should get the fuck out of our coun...,non-hateful,non-template,no,0,"['Pro', 'lifers', 'should', 'get', 'the', 'fuc...","['Pro', 'lifers', 'get', 'fuck', 'country', '.']","['Pro', 'lifers', 'should', 'get', 'the', 'fuc...","['Pro', 'lifer', 'should', 'get', 'the', 'fuck..."
9975,Most gun dealers are seriously fucked in the h...,non-hateful,non-template,no,0,"['Most', 'gun', 'dealers', 'are', 'seriously',...","['gun', 'dealers', 'seriously', 'fucked', 'hea...","['Most', 'gun', 'dealers', 'are', 'seriously',...","['Most', 'gun', 'dealer', 'are', 'seriously', ..."
9976,"By nature, corporate accountants are just shit...",non-hateful,non-template,no,0,"['By', 'nature', ',', 'corporate', 'accountant...","['nature', ',', 'corporate', 'accountants', 's...","['By', 'nature', 'corporate', 'accountants', '...","['By', 'nature', 'corporate', 'accountant', 'a..."
9977,Most real estate agents are absolute bastards.,non-hateful,non-template,no,0,"['Most', 'real', 'estate', 'agents', 'are', 'a...","['real', 'estate', 'agents', 'absolute', 'bast...","['Most', 'real', 'estate', 'agents', 'are', 'a...","['Most', 'real', 'estate', 'agent', 'are', 'ab..."


In [78]:
# Parse the stringified token lists, then collapse each list into one
# space-separated string.
new['lemmatized'] = new['lemmatized'].map(extract_genres).map(' '.join)

In [79]:
# Same split recipe as for the first dataset: 20% held out, seed 14.
X_b, y_b = new['lemmatized'], new['target']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b,
    y_b,
    test_size=0.2,
    random_state=14,
)

In [80]:
# Put training texts and labels side by side so rows can be filtered by class.
Xy_train_b = pd.concat([X_train_b, y_train_b], axis='columns')
Xy_train_b

Unnamed: 0,lemmatized,target
1839,You are nothing but common ghetto trahs,1
7092,We should never support a politician that trea...,0
9024,I'd rather have cancre than be anywhere near a...,1
5473,banker are a c a n c e r,1
9764,Saying that doe make me a bit of a bastard,0
...,...,...
7526,clownfishs are beautiful,0
6471,I will hurt you and any other autogynephiles l...,1
2454,bings belong in a zoo,1
9484,It's obvious that troons are the very opposite...,0


In [81]:
# Balance the second training set the same way as the first. Note that
# `majority`, `minority` and `upsampled` are rebound here, replacing the
# frames built for the first dataset.
majority = Xy_train_b[Xy_train_b['target'] == 1]
minority = Xy_train_b[Xy_train_b['target'] == 0]
minority_upsampled = resample(
    minority,
    replace=True,
    # Derived from the data rather than hard-coded (was 5833), so the classes
    # stay balanced if the dataset or the split ever changes.
    n_samples=len(majority),
    random_state=14,
)
upsampled = pd.concat([majority, minority_upsampled])
upsampled['target'].value_counts()

1    5833
0    5833
Name: target, dtype: int64

In [82]:
# Split the balanced frame back into model inputs and labels.
X_train_up_b, y_train_up_b = upsampled['lemmatized'], upsampled['target']

In [83]:
# ngram_list = [(1,1), (1,2), (1,3), (2,2)]
# train_accuracy = []
# test_accuracy = []
# for x, y in ngram_list:
#     vectorizer = CountVectorizer(analyzer = "word",
#                                  lowercase=True,
#                                  tokenizer = None,
#                                  preprocessor = None,
#                                  stop_words = None,
#                                  ngram_range=(x, y),
#                                  max_features = 340)
#
#     X_train_up_vec = vectorizer.fit_transform(X_train_up_b)
#     X_test_vec = vectorizer.transform(X_test_b)
#
#     classifier = LogisticRegression(max_iter=1000)
#     classifier.fit(X_train_up_vec, y_train_up_b)
#
#     y_pred_train = classifier.predict(X_train_up_vec)
#     acc = accuracy_score(y_train_up_b, y_pred_train)
#     train_accuracy.append(acc)
#
#     y_pred_test = classifier.predict(X_test_vec)
#     acc2 = accuracy_score(y_test_b, y_pred_test)
#     test_accuracy.append(acc2)
#
# data = {'ngram_range': ngram_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}
#
# ngram_df = pd.DataFrame(data)

In [84]:
# ngram_df

In [85]:
# Same vectorizer settings as model a: unigrams and bigrams, 300 features at most.
vectorizer_b = CountVectorizer(
    analyzer="word",
    lowercase=True,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=300,
    ngram_range=(1, 2),
)

# Fit the vocabulary on the balanced training texts only, then reuse it for the test texts.
X_train_up_vec_b = vectorizer_b.fit_transform(X_train_up_b)
X_test_vec_b = vectorizer_b.transform(X_test_b)

# fit() returns the estimator itself, so it can be bound and displayed in one go.
classifier_b = LogisticRegression(max_iter=1000).fit(X_train_up_vec_b, y_train_up_b)
classifier_b

## Testing Both Against New Data

#### Creating Brand New Data

In [86]:
# Hand-written evaluation sentences labelled as containing a dog whistle (target 1).
sentences_dogwhistles = [
    'our streets are now overrun with dindus',
    'hate to see these welfare queens with their new iphones',
    'if we are not careful they will turn your child into a clownfish',
    'obama is another deadbeat dad',
    'this fits right into the cosmopolitan agenda',
    'the demographic of my small town is now what you might call urban',
    'these cultural marxists will have us mating with thugs',
    'imagine finding out your child is a durden',
    'we have to take down the coastal elite',
    'another example of the dangerous impact of the genderist agenda on our children',
]

# Hand-written control sentences without a dog whistle (target 0).
sentences_not_dogwhistles = [
    'I really hate what the council is doing to our streets',
    'I love indian food, give me all the spice you have',
    'why do the crows look at me so funny?',
    'politicians are right scumbags',
    'people do not believe me when I say turkey is a great holiday destination',
    'Modern media like netflix and hulu are so boring and melting our brains',
    'we all have a responsibility to preserve our town centers in the best condition possible',
    'hated what man u did last week, dreadful defending',
    'the weatehr is always so shit in this country',
    'ah, a cup of tea, the true great british pasttime',
]

# Sanity check: both groups should be the same size.
print(len(sentences_dogwhistles), len(sentences_not_dogwhistles), sep='\n')

10
10


In [87]:
# Wrap the positive sentences in a frame with the raw-text column name that the
# preprocessing functions expect ('case_templ'), labelled target = 1.
test_dw_df = pd.DataFrame({'case_templ': sentences_dogwhistles}).assign(target=1)
test_dw_df

Unnamed: 0,case_templ,target
0,our streets are now overrun with dindus,1
1,hate to see these welfare queens with their ne...,1
2,if we are not careful they will turn your chil...,1
3,obama is another deadbeat dad,1
4,this fits right into the cosmopolitan agenda,1
5,the demographic of my small town is now what y...,1
6,these cultural marxists will have us mating wi...,1
7,imagine finding out your child is a durden,1
8,we have to take down the coastal elite,1
9,another example of the dangerous impact of the...,1


In [88]:
# Wrap the control sentences in a frame with the raw-text column name that the
# preprocessing functions expect ('case_templ'), labelled target = 0.
test_ndw_df = pd.DataFrame({'case_templ': sentences_not_dogwhistles}).assign(target=0)
test_ndw_df

Unnamed: 0,case_templ,target
0,I really hate what the council is doing to our...,0
1,"I love indian food, give me all the spice you ...",0
2,why do the crows look at me so funny?,0
3,politicians are right scumbags,0
4,people do not believe me when I say turkey is ...,0
5,Modern media like netflix and hulu are so bori...,0
6,we all have a responsibility to preserve our t...,0
7,"hated what man u did last week, dreadful defen...",0
8,the weatehr is always so shit in this country,0
9,"ah, a cup of tea, the true great british pasttime",0


#### Processing Original

In [89]:
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Module-level helpers read by the preprocessing function defined below.
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)  # rebinds the imported string as a list of single characters
wordnet_lemmatizer = WordNetLemmatizer()

def data_prep(data):
    """Tokenize, drop stop words and punctuation, then lemmatize ``data['case_templ']``.

    The work is done on a copy, so the frame passed in is left untouched. The
    same raw frames are also fed to ``ngram_prep`` later in the notebook, and
    mutating them in place would make their contents depend on which cell ran
    last.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'case_templ' column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with four list-valued columns added (or overwritten):
        'case_templ_token', 'case_templ_no_stopwords', 'case_templ_clean' and
        'lemmatized'.
    """
    data = data.copy()

    def remove_stopwords(tokens):
        # Case-insensitive match against the module-level stop-word set.
        return [token for token in tokens if token.lower() not in stop_words]

    def remove_punctuation(tokens):
        # Only tokens that are themselves an entry of `punctuation` are dropped.
        return [token for token in tokens if token not in punctuation]

    def lemmatize_tokens(tokens):
        return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

    # Each stage reads the column written by the previous one.
    data['case_templ_token'] = data['case_templ'].apply(tokenizer.tokenize)
    data['case_templ_no_stopwords'] = data['case_templ_token'].apply(remove_stopwords)
    data['case_templ_clean'] = data['case_templ_no_stopwords'].apply(remove_punctuation)
    data['lemmatized'] = data['case_templ_clean'].apply(lemmatize_tokens)

    return data

In [90]:
# Preprocess both hand-written sets with stop-word removal (model a pipeline).
# The *_full frames keep the raw sentence next to its lemmas; the slim frames
# carry only what the model needs.
a_test_dw_df_full = data_prep(test_dw_df)[['case_templ', 'lemmatized', 'target']]
a_test_dw_df = a_test_dw_df_full[['lemmatized', 'target']]
a_test_ndw_df_full = data_prep(test_ndw_df)[['case_templ', 'lemmatized', 'target']]
a_test_ndw_df = a_test_ndw_df_full[['lemmatized', 'target']]

In [91]:
# Draw five sentences per class with a fixed seed, stack them, and join each
# token list into the space-separated string format used for training.
a_half_test_dw_df, a_half_test_ndw_df = (
    frame.sample(n=5, random_state=14)
    for frame in (a_test_dw_df, a_test_ndw_df)
)
a_half_test = pd.concat([a_half_test_dw_df, a_half_test_ndw_df])
# Assign a plain list: the stacked index contains duplicate labels, so the
# values must be matched by position rather than by label.
a_half_test['lemmatized'] = list(map(' '.join, a_half_test['lemmatized']))
a_half_test.head()

Unnamed: 0,lemmatized,target
3,obama another deadbeat dad,1
9,another example dangerous impact genderist age...,1
0,street overrun dindus,1
5,demographic small town might call urban,1
4,fit right cosmopolitan agenda,1


In [92]:
# "Large" test set for model a: the held-out split plus the sampled hand-written sentences.
a_Xy_test = pd.concat([X_test_a, y_test_a], axis='columns')
a_Xy_test_new = pd.concat([a_Xy_test, a_half_test])
a_new_X_test, a_new_y_test = a_Xy_test_new['lemmatized'], a_Xy_test_new['target']

#### Processing in Ngrams Style:

In [93]:
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Rebuilds the same helper objects as the earlier preprocessing setup cell;
# the preprocessing function defined below reads them as module-level names.
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)  # rebinds the imported string as a list of single characters
wordnet_lemmatizer = WordNetLemmatizer()

def ngram_prep(data):
    """Tokenize, drop punctuation, then lemmatize ``data['case_templ']``, keeping stop words.

    The work is done on a copy, so the frame passed in is left untouched. The
    same raw frames are also processed by ``data_prep`` earlier in the
    notebook, and mutating them in place would overwrite columns that function
    wrote.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'case_templ' column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with three list-valued columns added (or
        overwritten): 'case_templ_token', 'case_templ_clean' and 'lemmatized'.
    """
    data = data.copy()

    def remove_punctuation(tokens):
        # Only tokens that are themselves an entry of `punctuation` are dropped.
        return [token for token in tokens if token not in punctuation]

    def lemmatize_tokens(tokens):
        return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

    # Each stage reads the column written by the previous one; there is
    # deliberately no stop-word stage in this variant.
    data['case_templ_token'] = data['case_templ'].apply(tokenizer.tokenize)
    data['case_templ_clean'] = data['case_templ_token'].apply(remove_punctuation)
    data['lemmatized'] = data['case_templ_clean'].apply(lemmatize_tokens)

    return data

In [94]:
# Preprocess both hand-written sets while keeping stop words (model b pipeline).
# The *_full frames keep the raw sentence next to its lemmas; the slim frames
# carry only what the model needs.
b_test_dw_df_full = ngram_prep(test_dw_df)[['case_templ', 'lemmatized', 'target']]
b_test_dw_df = b_test_dw_df_full[['lemmatized', 'target']]
b_test_ndw_df_full = ngram_prep(test_ndw_df)[['case_templ', 'lemmatized', 'target']]
b_test_ndw_df = b_test_ndw_df_full[['lemmatized', 'target']]

In [95]:
# Same sampling recipe as for model a (n=5 per class, seed 14), applied to the
# stop-word-retaining frames.
b_half_test_dw_df, b_half_test_ndw_df = (
    frame.sample(n=5, random_state=14)
    for frame in (b_test_dw_df, b_test_ndw_df)
)
b_half_test = pd.concat([b_half_test_dw_df, b_half_test_ndw_df])
# Assign a plain list: the stacked index contains duplicate labels, so the
# values must be matched by position rather than by label.
b_half_test['lemmatized'] = list(map(' '.join, b_half_test['lemmatized']))
b_half_test.head()

Unnamed: 0,lemmatized,target
3,obama is another deadbeat dad,1
9,another example of the dangerous impact of the...,1
0,our street are now overrun with dindus,1
5,the demographic of my small town is now what y...,1
4,this fit right into the cosmopolitan agenda,1


In [96]:
# "Large" test set for model b: the held-out split plus the sampled hand-written sentences.
b_Xy_test = pd.concat([X_test_b, y_test_b], axis='columns')
b_Xy_test_new = pd.concat([b_Xy_test, b_half_test])
b_new_X_test, b_new_y_test = b_Xy_test_new['lemmatized'], b_Xy_test_new['target']

## Performance Comparison

###### apr scores function

In [97]:
from sklearn import metrics

def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Mind the argument order: predictions come first and the true labels
    second, the reverse of the scikit-learn scorers this function wraps.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    scorers = {
        "Accuracy": metrics.accuracy_score,
        "Precision": metrics.precision_score,
        "Recall": metrics.recall_score,
        "F1": metrics.f1_score,
    }

    # Score everything before printing anything, so a failing scorer leaves no
    # partial report behind.
    results = {label: scorer(y_real, y_pred) for label, scorer in scorers.items()}

    for label, value in results.items():
        print(f"{label}:{value}")

    return tuple(results.values())

#### a predictions

In [98]:
# "Large" set: held-out split plus the sampled hand-written sentences.
a_new_X_test_vec = vectorizer_a.transform(a_new_X_test)
a_new_y_pred_test = classifier_a.predict(a_new_X_test_vec)
a_new_acc = accuracy_score(a_new_y_test, a_new_y_pred_test)

# "Small" set: the sampled hand-written sentences on their own.
a_nnew_X_test, a_nnew_y_test = a_half_test['lemmatized'], a_half_test['target']
a_nnew_X_test_vec = vectorizer_a.transform(a_nnew_X_test)
a_nnew_y_pred_test = classifier_a.predict(a_nnew_X_test_vec)
a_nnew_acc = accuracy_score(a_nnew_y_test, a_nnew_y_pred_test)


#### b predictions

In [99]:
# "Large" set: held-out split plus the sampled hand-written sentences.
b_new_X_test_vec = vectorizer_b.transform(b_new_X_test)
b_new_y_pred_test = classifier_b.predict(b_new_X_test_vec)
b_new_acc = accuracy_score(b_new_y_test, b_new_y_pred_test)

# "Small" set: the sampled hand-written sentences on their own.
b_nnew_X_test, b_nnew_y_test = b_half_test['lemmatized'], b_half_test['target']
b_nnew_X_test_vec = vectorizer_b.transform(b_nnew_X_test)
b_nnew_y_pred_test = classifier_b.predict(b_nnew_X_test_vec)
b_nnew_acc = accuracy_score(b_nnew_y_test, b_nnew_y_pred_test)

#### Summary

In [100]:
# One line per (model, test set) pair; the ' ' argument only widens the gap
# between label and number.
for _label, _accuracy in (
    ('Accuracy of large a test set is:', a_new_acc),
    ('Accuracy of small a test set is:', a_nnew_acc),
    ('Accuracy of large b test set is:', b_new_acc),
    ('Accuracy of small b test set is:', b_nnew_acc),
):
    print(_label, ' ', _accuracy)

Accuracy of large a test set is:   0.8998005982053838
Accuracy of small a test set is:   0.9
Accuracy of large b test set is:   0.9127617148554337
Accuracy of small b test set is:   0.7


#### Sentence by Sentence breakdown

In [101]:
# Re-draw the sample with the same n and seed that produced the model inputs,
# so these raw sentences come out in the same row order as a_half_test and
# b_half_test.
half_test_dw_df_full, half_test_ndw_df_full = (
    frame.sample(n=5, random_state=14)
    for frame in (a_test_dw_df_full, a_test_ndw_df_full)
)
half_test_full = pd.concat(
    [half_test_dw_df_full, half_test_ndw_df_full]
).drop(columns='lemmatized')
half_test_full

Unnamed: 0,case_templ,target
3,obama is another deadbeat dad,1
9,another example of the dangerous impact of the...,1
0,our streets are now overrun with dindus,1
5,the demographic of my small town is now what y...,1
4,this fits right into the cosmopolitan agenda,1
3,politicians are right scumbags,0
9,"ah, a cup of tea, the true great british pasttime",0
0,I really hate what the council is doing to our...,0
5,Modern media like netflix and hulu are so bori...,0
4,people do not believe me when I say turkey is ...,0


In [102]:
# Attach each model's outputs to the raw sentences. The values are NumPy
# arrays, so they are matched to rows by position; this relies on
# half_test_full having the same row order as the vectorized inputs.
half_test_results = half_test_full.assign(
    probability_a=classifier_a.predict_proba(a_nnew_X_test_vec)[:, 1],
    prediction_a=classifier_a.predict(a_nnew_X_test_vec),
    probability_b=classifier_b.predict_proba(b_nnew_X_test_vec)[:, 1],
    prediction_b=classifier_b.predict(b_nnew_X_test_vec),
    # NOTE: 'confidence' is the decision function of classifier_b only.
    confidence=classifier_b.decision_function(b_nnew_X_test_vec),
)
half_test_results

Unnamed: 0,case_templ,target,probability_a,prediction_a,probability_b,prediction_b,confidence
3,obama is another deadbeat dad,1,0.658373,1,0.256187,0,-1.065882
9,another example of the dangerous impact of the...,1,0.503509,1,0.314979,0,-0.776942
0,our streets are now overrun with dindus,1,0.650972,1,0.901958,1,2.219168
5,the demographic of my small town is now what y...,1,0.005071,0,0.000532,0,-7.538271
4,this fits right into the cosmopolitan agenda,1,0.754338,1,0.861105,1,1.824495
3,politicians are right scumbags,0,0.006082,0,0.126864,0,-1.928973
9,"ah, a cup of tea, the true great british pasttime",0,0.058045,0,0.031259,0,-3.433693
0,I really hate what the council is doing to our...,0,0.048381,0,0.245147,0,-1.124664
5,Modern media like netflix and hulu are so bori...,0,0.045789,0,0.002197,0,-6.118538
4,people do not believe me when I say turkey is ...,0,0.029317,0,0.00047,0,-7.662238


In [103]:
# Save the per-sentence comparison table to a CSV file in the working directory.
half_test_results.to_csv('LogReg_ngrams.csv')