# Training an SVM model without pre-training
for Policy vs. Politics prediction

In [1]:
import os

# Move the working directory one level up; the relative paths used later in
# this notebook (e.g. './train/svm_scores.csv') are resolved from there.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir('../')

Make sure that your current working directory (cwd) is `ReproducingAugSS/AugmentedSocialScientist/`

In [3]:
#os.getcwd() 

In [4]:
from PATHS import ENDOEXO_ASS, ENDOEXO_GS

In [5]:
import ast
from random import randint

import pandas as pd
import statsmodels.stats.api as sms
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm

## Preprocessing 

In [6]:
def process(path):
    """Load an annotated corpus and flatten it into one row per labelled span.

    The CSV at ``path`` must provide a ``text`` column and a ``labels`` column.
    Each ``labels`` cell is the string form of a Python list of spans; a span
    is indexed as ``span[0]`` (start offset), ``span[1]`` (end offset) and
    ``span[2]`` (category name), so that ``text[start:end]`` is the annotated
    passage.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``sentence`` (the text of the span) and ``label``
        (0 for 'autre', 1 for 'endogène', 2 for 'exogène'). The frame is
        empty, but still has both columns, when no span is found.

    Raises
    ------
    ValueError, SyntaxError
        If a ``labels`` cell is not a plain Python literal.
    KeyError
        If a span carries a category name other than the three known ones.
    """
    dict_label = {'autre': 0, 'endogène': 1, 'exogène': 2}

    df = pd.read_csv(path)
    # ast.literal_eval only accepts Python literals, so a crafted CSV cell
    # cannot execute arbitrary code the way the built-in eval() would.
    parsed_labels = df['labels'].apply(ast.literal_eval)

    records = [
        {'sentence': text[span[0]:span[1]], 'label': dict_label[span[2]]}
        for text, spans in zip(df['text'], parsed_labels)
        for span in spans
    ]

    # Explicit columns keep the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=['sentence', 'label'])

In [10]:
#train = process('../../datasets/AugmentedSocialScientist/all_train_and_gs/endoexo/train/endoexo_train_ass.csv')
# Training corpus and gold-standard evaluation set, one row per labelled span.
train = process(ENDOEXO_ASS)
gs = process(ENDOEXO_GS)

# Repeat each gold label once per character of its sentence so that the
# evaluation can be carried out at character level.
gs['label_per_char'] = [
    [gold_label] * len(sentence)
    for gold_label, sentence in zip(gs['label'], gs['sentence'])
]

## Run 100 randomly seeded trainings of an SVM model

In [300]:
# Experiment settings.
N_RUNS = 100            # number of independent trainings
MAX_SEED = 10000        # each run draws its seed uniformly from [0, MAX_SEED]
CLASS_IDS = [0, 1, 2]   # integer labels produced by `process`

score_type = ['precision', 'recall', 'f1']
cat = ['other', 'politics', 'policy']  # display names, aligned with CLASS_IDS

# The gold labels do not change between runs: flatten them once, in linear
# time, instead of concatenating lists with Series.sum() at every iteration.
gold_chars = [label for labels in gs['label_per_char'] for label in labels]

list_scores = []

for n_exp in tqdm(range(N_RUNS)):
    # The seed is stored with the scores so that any single run can be replayed.
    seed = randint(0, MAX_SEED)

    # Bag-of-words counts -> TF-IDF weights -> linear SVM trained with SGD
    # (hinge loss with an L2 penalty).
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=seed))
    ])

    text_clf.fit(train.sentence.values, train.label.values)
    pred = text_clf.predict(gs.sentence.values)

    # Keep the latest predictions on `gs`, one label per sentence and one
    # label per character.
    gs['pred'] = pred
    gs['pred_per_char'] = gs.apply(lambda row: [row['pred']] * len(row['sentence']), axis=1)
    pred_chars = [label for labels in gs['pred_per_char'] for label in labels]

    # Character-level scores. `labels` pins the order of the per-class arrays
    # so that position j always refers to cat[j], even if a class happens to
    # be absent from both the gold labels and the predictions.
    scores = precision_recall_fscore_support(gold_chars, pred_chars, labels=CLASS_IDS)

    # `scores` is (precision, recall, f-score, support); zip stops after the
    # three metrics named in `score_type`, so support is not recorded.
    dict_scores = {'random_state': seed}
    for metric_name, per_class in zip(score_type, scores):
        for class_name, value in zip(cat, per_class):
            dict_scores[metric_name + '_' + class_name] = value

    list_scores.append(dict_scores)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [301]:
# One row per run, plus the headline metric: the unweighted mean of the
# politics and policy F1 scores.
df_scores = pd.DataFrame(list_scores).assign(
    **{'f1 policy vs. politics': lambda runs: runs[['f1_politics', 'f1_policy']].mean(axis=1)}
)

In [304]:
# Write the per-run scores to disk (row index omitted); the path is relative
# to the current working directory.
df_scores.to_csv('./train/svm_scores.csv', index=False)

## Computing performance scores

In [12]:
# Reload the per-run scores from the CSV written by the training section, so
# this section can run without retraining the models.
df_scores = pd.read_csv('./train/svm_scores.csv')

In [14]:
# Mean of every score over all runs, as a single-row frame.
mean_row = df_scores.mean().to_frame().T

# Lower and upper bounds of the t-based confidence interval of the mean
# (statsmodels' default level), computed column by column.
ci_rows = df_scores.agg(lambda column: sms.DescrStatsW(column).tconfint_mean())

# Stack the three rows; the seed column is not a score, so drop it.
recap = pd.concat([mean_row, ci_rows]).drop(columns='random_state')
recap.index = ['mean', '2.5% CI', '97.5% CI']

recap

Unnamed: 0,precision_other,precision_politics,precision_policy,recall_other,recall_politics,recall_policy,f1_other,f1_politics,f1_policy,f1 policy vs. politics
mean,1.0,0.603744,0.708652,0.047232,0.59813,0.781423,0.090184,0.600852,0.743229,0.67204
2.5% CI,1.0,0.60295,0.707854,0.046582,0.595681,0.780021,0.088992,0.599574,0.742708,0.671314
97.5% CI,1.0,0.604538,0.70945,0.047881,0.600578,0.782826,0.091376,0.602131,0.74375,0.672767
