# SVM 2

In [None]:
# !pip install transformers

In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
import logging
import transformers as ppb
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
np.random.RandomState(123)

RandomState(MT19937) at 0x7FF70AFE16B0

## load data

In [3]:
df = pd.read_csv('../data/cleandata/twitter-c.csv')
df = df.drop(labels=['Unnamed: 0'], axis=1)

print(df.head())
print(df.shape, end='\n\n')
print(df.iat[0, 1])

   class                                               text
0      0   As a woman you shouldn't complain about clean...
1      0       momma said no pussy cats inside my doghouse 
2      0      SimplyAddictedToGuys woof woof hot scally lad
3      0                            woof woof and hot soles
4      0     Lemmie eat a Oreo do these dishes One oreo Lol
(5593, 2)

 As a woman you shouldn't complain about cleaning up your house as a man you should always take the trash out 


### reduce corpus size for convenience

In [4]:
df = df[:2500]

print(df.shape)

(2500, 2)


## load pretrained BERT model

In [5]:
bert_tokenizer = ppb.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = ppb.DistilBertModel.from_pretrained('distilbert-base-uncased')

## use BERT

### BERT preprocessing

In [6]:
def bert_preprocess(text):
    '''Preprocess steps for BERT: tokenize and pad sentences.
    
    Arguments:
        text (pandas.Series): 1-D array of text to classify.
        
    Returns:
        numpy.ndarray: A 2-D array of sentences, each sentence is
            broken into an array of IDs for BERT.
        numpy.ndarray: A 2-D array to mask padded IDs in a
            sentence.
    '''
    
    # tokenize
    features = text.apply(
        lambda x: bert_tokenizer.encode(x, add_special_tokens=True)
    )

    # pad sentences to make them the same length
    max_len = 0
    for s in features.values:
        max_len = max(len(s), max_len)
    features = np.array(
        [s + [0] * (max_len - len(s)) for s in features.values]
    )
    
    # mask
    attention_mask = np.where(features != 0, 1, 0)
    
    return features, attention_mask

### BERT classification

BERT can only classify sentences of up to 512 tokens (roughly 200 words). Longer sentences would have to be broken up.

In [7]:
def bert_classify(text):
    '''Use BERT to classify sentences. I think classes are
        pretty much just numbers in a linear output space.
        
    Arguments:
        text (pandas.Series): 1-D array of text to classify.
            
    Returns:
        pandas.DataFrame: Embeddings for each sentence.
    '''
    
    features, attention_mask = bert_preprocess(text)
    features = torch.tensor(features)
    attention_mask = torch.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = bert_model(features, attention_mask=attention_mask)

    # return classes for each sentence
    return pd.DataFrame(data=last_hidden_states[0][:, 0,:])

### this step takes a while

In [8]:
bert_classes = bert_classify(df['text'])
print(bert_classes.shape)

(2500, 768)


## Split data

In [9]:
x_, x_test, y_, y_test = train_test_split(
    bert_classes,
    df['class'],
    test_size=0.1,
    stratify=df['class']
)

x_train, x_val, y_train, y_val = train_test_split(
    x_,
    y_,
    test_size=1/9,
    stratify=y_
)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

(2000, 768) (2000,)
(250, 768) (250,)
(250, 768) (250,)


## SVM

todo

1. normalize data
2. improve hyperparameters

In [10]:
model = svm.SVC(kernel='linear')

### train

In [11]:
model.fit(x_train, y_train)
model.score(x_train, y_train)

0.937

In [12]:
def assess_model(model, x, y):
    '''Predict labels with a model and see the classification report.
    
    Arguments:
        model: a model.
        x (pandas.DataFrame): Embeddings for each sentence.
        y (pandas.Series): Classes for each sentence.
        
    Returns:
        pandas.DataFrame: Sentences, predictions, and their true classes.
    '''
    
    y_pred = model.predict(x)

    df_pred = df[df.index.isin(y.index)]
    df_pred = df_pred.assign(pred=y_pred)

    report = classification_report(y, y_pred, output_dict=True)

    print('safe\n', report['0'], end='\n\n')
    print('hate\n', report['1'], end='\n\n')

    return df_pred

### validate

In [13]:
df_val = assess_model(model, x_val, y_val)
print(df_val.head())

safe
 {'precision': 0.875, 'recall': 0.8953488372093024, 'f1-score': 0.8850574712643678, 'support': 172}

hate
 {'precision': 0.7567567567567568, 'recall': 0.717948717948718, 'f1-score': 0.736842105263158, 'support': 78}

    class                                               text  pred
11      0   I'm an early bird and I'm a night owl so I'm ...     1
15      0   this the I play soccer cheat on girls and wea...     0
27      0   10 birds your grandkids may never see thanks ...     0
64      0   Fit lads Nice gear these scally lads n traine...     1
67      0   RAWR My sexy French scally I love him frenchs...     0


### check some misclassifications

In [14]:
shown = 0
print('1 is hate', end='\n\n')
for _, row in df_val.iterrows():
    p = row['pred']
    a = row['class']
    if p != a:
        print('text:', row['text'])
        print(f'predicted {p}, actual {a}', end='\n\n')
        shown += 1
        if shown == 5:
            break

1 is hate

text:  I'm an early bird and I'm a night owl so I'm wise and have worms 
predicted 1, actual 0

text:  Fit lads Nice gear these scally lads n trainers would get it
predicted 1, actual 0

text:  hick and raver is a venn diagram that has a very large intersection 
predicted 1, actual 0

text:  California is full of white trash who moved from Oklahoma
predicted 0, actual 1

text:  FireCashman Why Because I am having to root for the Royals in October Yankees
predicted 1, actual 0



## try more models

In [25]:
model_lin = svm.SVC(kernel='linear', max_iter=1200)
model_lin.fit(x_train, y_train)
print('train accuracy:', model_lin.score(x_train, y_train))
model_lin.score(x_val, y_val)



train accuracy: 0.9445


0.856

## play with robots

In [24]:
def use_model(model):
    s = input()
    while s != 'exit':
        input_bert_class = bert_classify(pd.Series(data=[s]))
        input_pred = model.predict(input_bert_class)
        print('Predicted:', 'safe' if input_pred == 0 else 'hate', end='\n\n')
        s = input()

In [27]:
use_model(model_lin)

if your best pitch for your revolutionary new technology is please don’t criticize it maybe it just sucks
Predicted: safe

it’s not clever, it’s not actually that interesting, and it’s only original because no one was stupid enough to do it before. it’s not some special application of anything, it’s just a demonstration of what happens when you throw impractical, industrial levels of energy at the tiniest problem
Predicted: safe

exit


## pickle

In [None]:
pickle.dump(model, open('svm_model_3.sav', 'wb'))

## test model

In [None]:
df_test = assess_model(model, x_test, y_test)
print(df_test.head())