# Train SVMs

In [None]:
import pickle
from gensim.models.wrappers import FastText
from tools import processing,wemb_utils
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import pathlib
import numpy as np

# spaCy English pipeline; lemmatize() below runs texts through it to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")
# Pre-trained fastText binary model; handed to wemb_utils.sent_embedding() to embed each text.
# NOTE(review): gensim.models.wrappers appears to have been removed in gensim 4.x —
# confirm the pinned gensim version before upgrading.
wemb_model = FastText.load_fasttext_format('../models/language_models/fastai/cc.en.300.bin')

In [None]:
# Dataset variant to train on. The value is concatenated directly into the
# "../data/" and "../models/classifiers/" paths, so the trailing slash is required.
corpus = "combined/" # Options: "combined/" or "stories/"

In [None]:
# Training split of the selected corpus, stored as a pickled pandas object.
dataset_df = pd.read_pickle(f"../data/{corpus}train.pkl")

In [None]:
def test_classifier(X, y, SVM):
    """Score ``SVM`` on a held-out 30% split of ``(X, y)``.

    The data is split with a fixed seed (``random_state=42``), the estimator
    is fitted on the 70% training part and used to predict the remaining 30%.

    Args:
        X: Feature matrix or sequence of feature vectors.
        y: Labels aligned with ``X``.
        SVM: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so the caller's object is left trained on the 70% split.

    Returns:
        The value of ``f1_score`` (default averaging) for the held-out
        labels against the predictions, with ``labels`` set to the labels
        that actually occur in the predictions.
    """
    train_features, heldout_features, train_labels, heldout_labels = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    predictions = SVM.fit(train_features, train_labels).predict(heldout_features)

    predicted_labels = np.unique(predictions)
    return f1_score(heldout_labels, predictions, labels=predicted_labels)

In [None]:
def lemmatize(text):
    """Return ``text`` as a space-separated string of its tokens' lemmas.

    The text is processed with the module-level spaCy pipeline ``nlp`` and
    each token's ``lemma_`` is joined with a single space.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
# Output directory for each text column that gets its own pair of classifiers
# (one on TF-IDF features, one on word-embedding features).
dFolders = {"targetExpression": "../models/classifiers/" + corpus + "targetExpression/",
            "context3wmasked": "../models/classifiers/" + corpus + "context3wmasked/",
            "context3w": "../models/classifiers/" + corpus + "context3w/"}

# The labels do not depend on the text column, so build them once up front.
y = [int(y_val) for y_val in dataset_df["animated"].tolist()]

for col in dFolders:

    print(col, dataset_df.shape)
    pathlib.Path(dFolders[col]).mkdir(parents=True, exist_ok=True)

    X = dataset_df[col].tolist()

    # SVM: a single estimator object is reused; every fit() call below retrains it.
    SVM = svm.SVC(kernel = "linear", C=1, probability=True)

    # TFIDF
    vectorizer = TfidfVectorizer(min_df=1)

    X_lemma = [lemmatize(x) for x in X]
    X_tfidf = vectorizer.fit_transform(X_lemma)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open(dFolders[col] + "tfidf.pkl", "wb") as f:
        pickle.dump(vectorizer, f)

    # Held-out F1 on a 70/30 split, printed for inspection only.
    print(test_classifier(X_tfidf,y,SVM))

    # Final TF-IDF model: refit on all rows and persist before SVM is reused.
    classifier = SVM.fit(X_tfidf, y)
    with open(dFolders[col] + "tfidf_svm.pkl", 'wb') as f:
        pickle.dump(classifier, f)

    # WEMB: embed the raw (non-lemmatized) texts with the fastText model.
    X_emb = [wemb_utils.sent_embedding(x,wemb_model) for x in X]

    print(test_classifier(X_emb,y,SVM))

    # Final embedding model: refit on all rows and persist.
    classifier = SVM.fit(X_emb, y)
    with open(dFolders[col] + "wemb_svm.pkl", 'wb') as f:
        pickle.dump(classifier, f)
    print()