# lets try to train a model!
using sklearn to evaluate the preformance of basic models

In [1]:
import csv
import sklearn
import random
import numpy as np
import json
import os

import torch
from torch import nn
import torch.nn.functional as F

from skorch import NeuralNetBinaryClassifier

from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

from sklearn.svm import  SVC, LinearSVC , NuSVC
#from sklearn.svm.sparse import SVC
from sklearn.linear_model import LogisticRegression

from bert_embedding import BertEmbedding
import mxnet as mx

In [2]:
# function to load data
def load_data(filename):
    data = []
    with open(filename, "r") as fp:
        csv_reader = csv.reader(fp)
        for line in csv_reader:
            data.append(line)
    return data

In [3]:
data_dir = "data_3"
output_dir = os.path.join(data_dir, "results_1.json")

In [4]:
# create datasets
train_data = load_data(os.path.join(data_dir, "train.csv"))
val_data = load_data(os.path.join(data_dir, "val.csv"))

In [5]:
# shuffle train data
random.shuffle(train_data)

# create train, val features
X_train = [ex[2] for ex in train_data]
X_test = [ex[2] for ex in val_data]
y_train = [int(ex[3]) for ex in train_data]
y_test = [int(ex[3]) for ex in val_data]

# get a list of all the text
text = X_train + X_test


The next couple of cells are for Tf-idf 

In [6]:
# initialize tf-idf vecotrizer
Tfidf_vect = TfidfVectorizer(analyzer='word', 
                             min_df = 0, 
                             stop_words = 'english', 
                             sublinear_tf=True,
                             max_features = 50000)

In [7]:
Tfidf_vect.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=50000,
                min_df=0, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
tfidf_matrix = Tfidf_vect.transform(text)
feature_names = Tfidf_vect.get_feature_names()
scores = zip(Tfidf_vect.get_feature_names(),
             np.asarray(tfidf_matrix.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
for item in sorted_scores:
    print("{0:50} Score: {1}".format(item[0], item[1]))

people                                             Score: 22.025489297019988
world                                              Score: 17.356644598455222
like                                               Score: 14.946570451496445
don                                                Score: 14.362663446933666
just                                               Score: 14.34688994995725
state                                              Score: 14.171040300976246
live                                               Score: 13.407570517478696
know                                               Score: 13.139222484812205
life                                               Score: 13.05926957391031
muslims                                            Score: 12.420645841095801
allah                                              Score: 11.816428307054245
time                                               Score: 11.760869265429669
country                                            Score: 11.442150241677437
i

available                                          Score: 0.5974448586642037
recognition                                        Score: 0.5968488218617162
improve                                            Score: 0.5968202020722322
emotional                                          Score: 0.5967588404538671
burning                                            Score: 0.5965767538518265
sunni                                              Score: 0.5963405061513198
liberal                                            Score: 0.5956861420164965
pinko                                              Score: 0.5956861420164965
punahou                                            Score: 0.5951423873330258
appear                                             Score: 0.5947777228786829
eye                                                Score: 0.5944566089754115
commerce                                           Score: 0.5939166106758078
payment                                            Score: 0.593169036191523


blocker                                            Score: 0.3689864277370985
ip                                                 Score: 0.3689864277370985
rm                                                 Score: 0.3689864277370985
safely                                             Score: 0.3689864277370985
faced                                              Score: 0.3685754475844079
payed                                              Score: 0.3685228233403732
paper                                              Score: 0.3682246109919727
opinions                                           Score: 0.36804008892413687
murtad                                             Score: 0.3678911366266609
abrupt                                             Score: 0.3677383616029437
educationally                                      Score: 0.3674466416691752
socially                                           Score: 0.3674466416691752
educations                                         Score: 0.367086140101066

moguls                                             Score: 0.19609228629489947
mombasa                                            Score: 0.19609228629489947
reverence                                          Score: 0.19609228629489947
ueg                                                Score: 0.19609228629489947
wreckage                                           Score: 0.19609228629489947
arabism                                            Score: 0.19600676517746826
hafez                                              Score: 0.19600676517746826
inherited                                          Score: 0.19600676517746826
overly                                             Score: 0.19600676517746826
sharks                                             Score: 0.19600676517746826
swam                                               Score: 0.19600676517746826
unenviable                                         Score: 0.19600676517746826
waters                                             Score: 0.1960

# Get TF-IDF scores for violent

In [None]:
# find tf-idf for violent people
Tfidf_vect_violent = TfidfVectorizer(analyzer='word', 
                             min_df = 0, 
                             stop_words = 'english', 
                             sublinear_tf=True,
                             max_features = 50000)
violent_text = [ex[2] for ex in train_data if int(ex[3]) == 1] + [ex[2] for ex in val_data if int(ex[3]) == 1]
tfidf_matrix = Tfidf_vect_violent.fit_transform(violent_text)
feature_names = Tfidf_vect_violent.get_feature_names()
scores = zip(Tfidf_vect_violent.get_feature_names(),
             np.asarray(tfidf_matrix.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
for item in sorted_scores[:50]:
    print("{0:50} Score: {1}".format(item[0], item[1]))

# Get TF-IDF scores for violent

In [None]:
# find tf-idf for nonviolent
Tfidf_vect_nonviolent = TfidfVectorizer(analyzer='word', 
                             min_df = 0, 
                             stop_words = 'english', 
                             sublinear_tf=True,
                             max_features = 50000)
nonviolent_text = [ex[2] for ex in train_data if int(ex[3]) == 0] + [ex[2] for ex in val_data if int(ex[3]) == 0]
tfidf_matrix = Tfidf_vect_nonviolent.fit_transform(nonviolent_text)
feature_names = Tfidf_vect_nonviolent.get_feature_names()
scores = zip(Tfidf_vect_nonviolent.get_feature_names(),
             np.asarray(tfidf_matrix.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
for item in sorted_scores[:50]:
    print("{0:50} Score: {1}".format(item[0], item[1]))

# Create input features

In [9]:
# get tf-idf input features
rep = "TF_IDF"
X_train_features = Tfidf_vect.transform(X_train)
X_test_features = Tfidf_vect.transform(X_test)

In [None]:
# get BERT embeddings
rep = "BERT"
ctx = mx.gpu(1)
bert_embedding = BertEmbedding()
X_train_bert = bert_embedding(X_train)
print('almost there!')
X_test_bert = bert_embedding(X_test)
print('done!')

In [None]:
len(X_test_bert[0][1][0])

In [None]:
X_train_features = np.array([np.mean(ex[1], axis=0) for ex in X_train_bert])
X_test_features = np.array([np.mean(ex[1], axis=0) for ex in X_test_bert])

In [None]:
X_test_features.shape[1]

# run models

In [11]:
# initilaize results
results = dict()

In [10]:
# logistic regression
model = "LR"
log_classifier = LogisticRegression(solver='liblinear')
log_classifier.fit(X_train_features,y_train)
y_pred_log = log_classifier.predict(X_test_features)

In [13]:
# svc- sigmoid
model = "svc-sigmoid"
svc_classifer = SVC(kernel='sigmoid', 
                    gamma='scale',
                    coef0=0,
                    C=1,
                    tol=.01,
                    random_state=None)
svc_classifer.fit(X_train_features,y_train)
y_pred_log = svc_classifer.predict(X_test_features)

In [16]:
# svc- rbf
model = "svc-rbf"
svc_classifer = SVC(kernel='rbf', 
                    gamma='scale',
                    C=1,
                    tol=.01,
                    random_state=None)
svc_classifer.fit(X_train_features,y_train)
y_pred_log = svc_classifer.predict(X_test_features)

In [19]:
# svc-linear
model = "svc-linear"
svc_classifer = SVC(kernel='linear', 
                    gamma='scale',
                    C=1,
                    tol=.01,
                    random_state=None)
svc_classifer.fit(X_train_features,y_train)
y_pred_log = svc_classifer.predict(X_test_features)

Scorch nns now!!

In [None]:
# nn for bert embeddings
model = "NN"
class MyModule(nn.Module):
    def __init__(self, num_units=1000):
        super(MyModule, self).__init__()
        self.dense1 = nn.Linear(768, 768)
        self.dense2 = nn.Linear(768, 100)
        self.dense0 = nn.Linear(100, 1)
        self.relu1 = nn.ReLU()

    def forward(self, X, **kwargs):
        X = self.dense1(X)
        X = self.relu1(X)
        X = self.dense2(X)
        X = self.relu1(X)
        X = self.dense0(X)
        return X.squeeze(1)

In [22]:
# nn for tf-idf vectors
model = "NN"
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense1 = nn.Linear(X_test_features.shape[1], 768)
        self.dense2 = nn.Linear(768, 100)
        self.dense0 = nn.Linear(100, 1)
        self.relu1 = nn.ReLU()

    def forward(self, X, **kwargs):
        X = self.dense1(X)
        X = self.relu1(X)
        X = self.dense2(X)
        X = self.relu1(X)
        X = self.dense0(X)
        return X.squeeze(1)

In [23]:
# initialize nn
net = NeuralNetBinaryClassifier(
    MyModule,
    max_epochs=220,
    lr=.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

In [24]:
# for tf-idf
net.fit(torch.from_numpy(X_train_features.toarray()).float(), torch.tensor(y_train, dtype=torch.float))
y_pred_log = net.predict(torch.from_numpy(X_test_features.toarray()).float())


  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.6923[0m       [32m0.5429[0m        [35m0.6913[0m  0.3077
      2        [36m0.6909[0m       0.5429        [35m0.6905[0m  0.3096
      3        [36m0.6902[0m       0.5429        [35m0.6901[0m  0.3178
      4        [36m0.6898[0m       0.5429        [35m0.6898[0m  0.3144
      5        [36m0.6896[0m       0.5429        [35m0.6897[0m  0.3125
      6        [36m0.6896[0m       0.5429        [35m0.6895[0m  0.2916
      7        [36m0.6893[0m       0.5429        [35m0.6894[0m  0.2693
      8        [36m0.6893[0m       0.5429        0.6894  0.2928
      9        [36m0.6892[0m       0.5429        [35m0.6894[0m  0.2142
     10        [36m0.6891[0m       0.5429        [35m0.6893[0m  0.2104
     11        [36m0.6890[0m       0.5429        [35m0.6893[0m  0.2347
     12        [36m0.6890[0m       0.5429        [35m0.689

    112        [36m0.6804[0m       0.5429        [35m0.6833[0m  0.2956
    113        [36m0.6802[0m       0.5429        [35m0.6831[0m  0.2016
    114        [36m0.6797[0m       0.5429        [35m0.6828[0m  0.2013
    115        [36m0.6792[0m       0.5429        [35m0.6825[0m  0.2177
    116        [36m0.6789[0m       0.5429        [35m0.6823[0m  0.2037
    117        [36m0.6784[0m       0.5429        [35m0.6819[0m  0.2193
    118        [36m0.6779[0m       0.5429        [35m0.6816[0m  0.2132
    119        [36m0.6774[0m       0.5429        [35m0.6812[0m  0.2340
    120        [36m0.6771[0m       0.5429        [35m0.6809[0m  0.2410
    121        [36m0.6764[0m       0.5429        [35m0.6805[0m  0.2217
    122        [36m0.6758[0m       0.5429        [35m0.6801[0m  0.2553
    123        [36m0.6751[0m       0.5429        [35m0.6796[0m  0.2079
    124        [36m0.6746[0m       0.5429        [35m0.6792[0m  0.2034
    125        [36m0.673

In [None]:
# for bert
net.fit(torch.from_numpy(X_train_features).float(), torch.tensor(y_train, dtype=torch.float))
y_pred_log = net.predict(torch.from_numpy(X_test_features).float())

# Evaluate models

In [None]:
y_pred_log 

In [None]:
# print results
print(classification_report(y_test, y_pred_log))

In [20]:
# save results
label = "{}_{}".format(rep, model)
accuracy = accuracy_score(y_test, list(y_pred_log))
precision = precision_score(y_test, list(y_pred_log))
recall = recall_score(y_test, list(y_pred_log))
f1 = f1_score(y_test, list(y_pred_log))
                    
results[label] = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1":f1}

with open(output_dir, "w") as f:
    json.dump(results, f, indent=4)

In [21]:
results

{'TF_IDF_LR': {'accuracy': 0.6866952789699571,
  'precision': 0.5970149253731343,
  'recall': 0.8080808080808081,
  'f1': 0.6866952789699571},
 'TF_IDF_svc-sigmoid': {'accuracy': 0.6952789699570815,
  'precision': 0.6014492753623188,
  'recall': 0.8383838383838383,
  'f1': 0.7004219409282701},
 'TF_IDF_svc-rbf': {'accuracy': 0.6909871244635193,
  'precision': 0.5944055944055944,
  'recall': 0.8585858585858586,
  'f1': 0.7024793388429751},
 'TF_IDF_svc-linear': {'accuracy': 0.6866952789699571,
  'precision': 0.5955882352941176,
  'recall': 0.8181818181818182,
  'f1': 0.6893617021276596}}