# SVM 2

In [None]:
# !pip install transformers

In [None]:
import torch
import pickle
import numpy as np
import pandas as pd
import transformers as ppb
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Seed NumPy's global random generator.  Merely constructing a
# RandomState object (and discarding it) leaves the global generator
# untouched, so code that draws from it would differ between runs.
np.random.seed(228)

## load data

In [None]:
# Read the tab-separated SST-2 training file (it has no header row) and
# give the two positional columns descriptive names.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
).rename(columns={0: "text", 1: "class"})

# Quick sanity check: first rows, overall shape, and the first sentence.
print(df.head())
print(df.shape, end="\n\n")
print(df.iloc[0, 0])

### reduce corpus size for convenience

In [None]:
# Keep only the first 2500 rows (positional slice) to shrink the corpus.
df = df.iloc[:2500]

print(df.shape)

## load pretrained BERT model

In [None]:
# Tokenizer and model are loaded from the same pretrained checkpoint name.
pretrained_weights = 'distilbert-base-uncased'
bert_tokenizer = ppb.DistilBertTokenizer.from_pretrained(pretrained_weights)
bert_model = ppb.DistilBertModel.from_pretrained(pretrained_weights)

## use BERT

### BERT preprocessing

In [None]:
def bert_preprocess(text):
    """Preprocess steps for BERT: tokenize and pad sentences.

    Arguments:
        text (pandas.Series): 1-D array of text to classify.

    Returns:
        numpy.ndarray: A 2-D integer array of shape
            (n_sentences, max_len). Each row holds the token IDs of
            one sentence, right-padded with the tokenizer's pad ID.
        numpy.ndarray: A 2-D integer array of the same shape that is
            1 at real token positions and 0 at padded positions.
    """

    # tokenize (special tokens are added by the tokenizer)
    token_ids = [
        bert_tokenizer.encode(sentence, add_special_tokens=True)
        for sentence in text
    ]
    lengths = np.array([len(ids) for ids in token_ids], dtype=np.int64)
    max_len = max((len(ids) for ids in token_ids), default=0)

    # use the tokenizer's own pad ID; fall back to 0 when it has none
    pad_id = getattr(bert_tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # pad sentences to make them the same length; pre-allocating keeps
    # the result 2-D with an integer dtype even when there is no input
    features = np.full((len(token_ids), max_len), pad_id, dtype=np.int64)
    for row, ids in enumerate(token_ids):
        features[row, :len(ids)] = ids

    # mask: built from the true sentence lengths rather than from the
    # padded values, so it does not depend on which ID is used for padding
    positions = np.arange(max_len)
    attention_mask = (positions < lengths[:, np.newaxis]).astype(np.int64)

    return features, attention_mask

### BERT classification

BERT can only process inputs of up to 512 tokens (roughly 200 words), counting the special tokens added by the tokenizer. Longer texts would have to be truncated or broken up into smaller chunks.

In [None]:
def bert_classify(text, batch_size=None):
    """Encode sentences with BERT and return one feature vector each.

    Despite the name, no class labels are produced here. For every
    sentence the function returns the model's last hidden state at the
    first token position (the special token the tokenizer puts in
    front of the sentence). These vectors are meant to be fed to a
    separate classifier.

    Arguments:
        text (pandas.Series): 1-D array of text to encode.
        batch_size (int, optional): Number of sentences sent through
            the model at once. ``None`` (the default) processes all
            sentences in a single batch. A smaller value lowers peak
            memory use; each batch is padded independently.

    Returns:
        numpy.ndarray: A 2-D array with one row of features per
            sentence, in the same order as ``text``.

    Raises:
        ValueError: If ``batch_size`` is given but is less than 1.
    """

    if batch_size is None:
        batches = [text]
    else:
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        # an empty input still goes through as one (empty) batch
        batches = [
            text.iloc[start:start + batch_size]
            for start in range(0, len(text), batch_size)
        ] or [text]

    outputs = []
    for batch in batches:
        features, attention_mask = bert_preprocess(batch)
        features = torch.tensor(features)
        attention_mask = torch.tensor(attention_mask)

        # inference only: no gradients are needed
        with torch.no_grad():
            last_hidden_states = bert_model(
                features, attention_mask=attention_mask
            )

        # element 0 of the model output is the per-token hidden states;
        # keep only the vector at the first token of each sentence
        outputs.append(last_hidden_states[0][:, 0, :].numpy())

    if len(outputs) == 1:
        return outputs[0]
    return np.concatenate(outputs, axis=0)

### this step takes a while

In [None]:
# Encode the whole text column, then report the shape of the result.
bert_classes = bert_classify(df.loc[:, "text"])
print(np.shape(bert_classes))

## SVM

In [None]:
# Hold out 20% of the sentences for testing.  The split is seeded
# explicitly so the train/test partition, and every score derived from
# it, is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    bert_classes,
    df["class"],
    test_size=0.2,
    random_state=228,
)

svm_model = svm.SVC(kernel="linear")

### train SVM

In [None]:
# Fit the SVM on the training part of the split (features and labels).
svm_model.fit(x_train, y_train)

### test SVM

In [None]:
y_hat = svm_model.predict(x_test)

# Pick the test rows by label, in the order of y_test, so that every
# prediction in y_hat is attached to the sentence it was computed for.
# (A boolean isin() filter returns the rows in the frame's own order,
# which does not match the shuffled order of the test split.)
df_test = df.loc[y_test.index].assign(pred=y_hat)

report = classification_report(y_test, y_hat, output_dict=True)

print("positive\n", report["1"], end="\n\n")
print("negative\n", report["0"], end="\n\n")
print(df_test.head())

### check some misclassifications

In [None]:
# Print at most five test rows whose prediction differs from the label.
print("1 is positive", end="\n\n")
wrong = df_test[df_test["pred"] != df_test["class"]]
for _, row in wrong.head(5).iterrows():
    print("text:", row["text"])
    print(f"predicted {row['pred']}, actual {row['class']}", end="\n\n")

## play with robots

In [None]:
# Read sentences from the user and print the predicted sentiment for
# each one until the user types "exit".
while True:
    s = input()
    if s == "exit":
        break
    input_bert_class = bert_classify(pd.Series(data=[s]))
    # predict() returns an array with one entry for the single sentence;
    # take that entry explicitly instead of relying on the truth value
    # of an array comparison
    input_pred = svm_model.predict(input_bert_class)[0]
    print("Predicted: ", "positive" if input_pred == 1 else "negative", end="\n\n")

## pickle

In [None]:
# Save the trained SVM to disk.  The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("svm_model_2.sav", "wb") as model_file:
    pickle.dump(svm_model, model_file)