# Train SVMs

You will need the following additional resources to run this script:
* **FASTTEXT WEMB MODEL:** https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz (unzip it and store the resulting `cc.en.300.bin` in `../models/language_models/fastai/`)
* **GLOVE MODEL:** http://nlp.stanford.edu/data/glove.6B.zip

In [1]:
import pickle
from gensim.models.wrappers import FastText
from tools import processing,wemb_utils
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import pathlib
import numpy as np

# Shared TF-IDF vectorizer with default settings; the training loop further
# down calls fit_transform on it again for every input column.
vectorizer = TfidfVectorizer()

# spaCy pipeline used by lemmatize() to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the FastText word-embedding model unless a truthy `wemb_model` is
# already bound (e.g. from an earlier run of this cell), so that re-running
# the cell does not load the model from disk again.
try:
    wemb_model
except NameError:
    # First run: the name does not exist yet, so force the load below.
    wemb_model = None

if not wemb_model:
    wemb_model = FastText.load_fasttext_format('../models/language_models/fastai/cc.en.300.bin')

In [21]:
# Corpus sub-directory. The value is concatenated directly into paths under
# ../data/ and ../models/classifiers/, so the trailing slash is required.
corpus = "stories/"
# Prefix of the dataset file name (and of the classifier output folders).
scenario = "unique_"
# Options for `scenario`:
# * "all_"
# * "unique_"

In [22]:
# Load the pickled training set selected by `corpus` and `scenario`.
dataset_df = pd.read_pickle(f"../data/{corpus}{scenario}train.pkl")

# Add two context columns built by processing.determine_context(..., "both"):
# one from the masked sentence and one from the unmasked sentence.
# (Judging by the sample printed further down, the result looks like
# "<previous> [SEP] <sentence> [SEP] <next>" -- confirm in tools.processing.)
for new_col, sentence_col in (("both_masked", "maskedSentence"),
                              ("both_unmasked", "currentSentence")):
    dataset_df[new_col] = dataset_df.apply(
        # Bind the column name as a default so each lambda keeps its own value.
        lambda row, src=sentence_col: processing.determine_context(src, row, "both"),
        axis=1,
    )

# Suffixes of the two training runs: "_without_prp" keeps only the rows where
# targetIsPRP is False, "_with_prp" keeps every row.
prp_scenarios = ["_with_prp", "_without_prp"]

In [23]:
# Preview the first rows, including the two derived context columns.
dataset_df.head()

Unnamed: 0,storyNumber,prevSentence,currentSentence,maskedSentence,nextSentence,targetExpression,animated,targetIsPRP,context3wmasked,context3w,both_masked,both_unmasked
15083,story12,"`` Well , brother , '' said the serpent , `` i...","Pick out the fattest , grab it by the tail and...","Pick out the fattest , grab it by the [MASK] a...",There was nothing to be done - the gypsy went ...,tail,1,False,it by the [MASK] and drag it,it by the tail and drag it,"`` Well , brother , '' said the serpent , `` i...","`` Well , brother , '' said the serpent , `` i..."
4246,story36,Ivan the peasant 's son took the dragon 's hor...,Night passed and morning came ; the brave yout...,[MASK] passed and morning came ; the brave you...,Ivan rose from the merrymaking and said to his...,Night,0,False,[MASK] passed and morning,Night passed and morning,Ivan the peasant 's son took the dragon 's hor...,Ivan the peasant 's son took the dragon 's hor...
16145,story39,He had not gone very far when he met an ancien...,"`` Good-day , young fellow , '' says the ancie...","`` [MASK] , young fellow , '' says the ancient...","`` Good-day , grandfather , '' says the Fool o...",Good - day,0,False,"`` [MASK] , young fellow","`` Good - day , young fellow",He had not gone very far when he met an ancien...,He had not gone very far when he met an ancien...
4949,story8,`` She is living with a seven-headed dragon . '',"`` Never mind , '' said Frolka , `` we shall g...","`` Never mind , '' said Frolka , `` we shall g...",They said farewell and went on .,twelve - headed dragon,0,False,deal with a [MASK] . '',deal with a twelve - headed dragon . '',`` She is living with a seven-headed dragon . ...,`` She is living with a seven-headed dragon . ...
9681,story98,"-LRB- 3.144 -RRB- The people wept loudly , and...","They said -LRB- to the emigrants -RRB- , `` Th...","They said -LRB- to the [MASK] RRB- , `` There ...","Then Abu Bakr , Umar bin Al-Khattab and Abu 'U...",emigrants,1,False,"-LRB- to the [MASK] RRB- , ``","-LRB- to the emigrants RRB- , ``","-LRB- 3.144 -RRB- The people wept loudly , and...","-LRB- 3.144 -RRB- The people wept loudly , and..."


In [24]:
# Sanity check: print every text field (and the PRP flag) of one example,
# the row at positional index 252, one value per line.
sample_row = dataset_df.iloc[252]
for field in ("targetExpression", "context3wmasked", "context3w",
              "both_masked", "both_unmasked",
              "currentSentence", "maskedSentence", "targetIsPRP"):
    print(sample_row[field])

what was that
: Grandmother , [MASK] ?
: Grandmother , what was that ?
When Allah bestowed victory of Khaybar on him , he allotted shares to us from spoils that he allotted to the men . [SEP] He -LRB- Hashraj ibn Ziyad -RRB- said : I said to her : Grandmother , [MASK] ? [SEP] She replied : Dates .
When Allah bestowed victory of Khaybar on him , he allotted shares to us from spoils that he allotted to the men . [SEP] He -LRB- Hashraj ibn Ziyad -RRB- said : I said to her : Grandmother , what was that ? [SEP] She replied : Dates .
He -LRB- Hashraj ibn Ziyad -RRB- said : I said to her : Grandmother , what was that ?
He -LRB- Hashraj ibn Ziyad -RRB- said : I said to her : Grandmother , [MASK] ?
False


In [25]:
def test_classifier(X, y, SVM):
    """Score ``SVM`` with F1 on a fixed 70/30 hold-out split of ``X`` / ``y``.

    Args:
        X: Feature rows accepted by ``train_test_split`` and ``SVM.fit``.
        y: Labels aligned with ``X``.
        SVM: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place on the training portion of the split.

    Returns:
        The ``f1_score`` of the predictions on the held-out 30 %.
    """
    # random_state is pinned so every call evaluates on the same split.
    split = train_test_split(X, y, test_size=0.3, random_state=42)
    train_features, heldout_features, train_labels, heldout_labels = split

    predictions = SVM.fit(train_features, train_labels).predict(heldout_features)

    # NOTE(review): with f1_score's default average='binary' the `labels`
    # argument appears to have no effect -- confirm against the sklearn docs.
    return f1_score(heldout_labels, predictions, labels=np.unique(predictions))

In [26]:
def lemmatize(text):
    """Return ``text`` as a single string of its token lemmas joined by spaces.

    Tokenisation and lemmatisation come from the module-level spaCy
    pipeline ``nlp``.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [27]:
# Train and persist two SVMs (TF-IDF features and averaged word-embedding
# features) for every input column, once per entry of `prp_scenarios`.
# For each column the loop prints the column name with the data shape, then the
# hold-out F1 of the TF-IDF model, then the hold-out F1 of the embedding model.
columns = ["targetExpression", "context3wmasked", "context3w", "both_masked",
           "both_unmasked", "currentSentence", "maskedSentence"]

for prp_scenario in prp_scenarios:

    # One output folder per input column:
    # ../models/classifiers/<corpus><scenario><column><prp_scenario>/
    dFolders = {col: "../models/classifiers/" + corpus + scenario + col + prp_scenario + "/"
                for col in columns}

    # Filter into a scenario-local frame instead of rebinding `dataset_df`:
    # overwriting the global made the result depend on the order of
    # `prp_scenarios` and made re-running this cell train "_with_prp" on the
    # already-filtered data.
    if prp_scenario == "_without_prp":
        scenario_df = dataset_df[dataset_df["targetIsPRP"] == False]  # noqa: E712
    else:
        scenario_df = dataset_df

    for col in columns:

        print(col, scenario_df.shape)
        pathlib.Path(dFolders[col]).mkdir(parents=True, exist_ok=True)

        X = scenario_df[col].tolist()
        y = [int(y_val) for y_val in scenario_df["animated"].tolist()]

        # SVM: a fresh estimator per column; it is re-fitted several times below.
        SVM = svm.SVC(kernel="linear", C=1, probability=True)

        # TFIDF
        X_lemma = [lemmatize(x) for x in X]
        X_tfidf = vectorizer.fit_transform(X_lemma)

        # Persist the vectorizer right away (the shared instance is re-fitted
        # on the next column); `with` guarantees the file handle is closed.
        with open(dFolders[col] + "tfidf.pkl", "wb") as f:
            pickle.dump(vectorizer, f)

        # NOTE(review): the vectorizer was fitted on all rows before
        # test_classifier makes its hold-out split, so this score is computed
        # with TF-IDF statistics that include the held-out rows.
        print(test_classifier(X_tfidf, y, SVM))

        # The final model is trained on all rows, not only the training split.
        classifier = SVM.fit(X_tfidf, y)
        with open(dFolders[col] + "tfidf_svm.pkl", "wb") as f:
            pickle.dump(classifier, f)

        # WEMB
        X_emb = [wemb_utils.sent_embedding(x, wemb_model) for x in X]

        print(test_classifier(X_emb, y, SVM))

        classifier = SVM.fit(X_emb, y)
        with open(dFolders[col] + "wemb_svm.pkl", "wb") as f:
            pickle.dump(classifier, f)
        print()

targetExpression (8324, 12)
0.9025291275930662
0.9223665223665224

context3wmasked (8324, 12)
0.8177868642984923
0.8289473684210527

context3w (8324, 12)
0.8407880724174654
0.8640595903165734

both_masked (8324, 12)
0.8179089544772387
0.8252997883846697

both_unmasked (8324, 12)
0.8128342245989305
0.8252997883846697

currentSentence (8324, 12)
0.8083333333333333
0.8303592671901023

maskedSentence (8324, 12)
0.8147586555471317
0.8250235183443084

targetExpression (3899, 12)
0.7547568710359407
0.7824267782426778

context3wmasked (3899, 12)
0.5798045602605864
0.45287958115183247

context3w (3899, 12)
0.7259565667011377
0.6832971800433839

both_masked (3899, 12)
0.5438972162740899
0.12802768166089964

both_unmasked (3899, 12)
0.5771248688352572
0.2952815829528158

currentSentence (3899, 12)
0.6102403343782654
0.4974619289340102

maskedSentence (3899, 12)
0.5331882480957562
0.26153846153846155

