In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import hstack
import spacy
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_curve, auc

In [4]:
# Load the pre-computed feature tables for the train / dev / test splits.
datafolder = '../../data/'
train, test, dev = (
    datafolder+'train_with_features.csv',
    datafolder+'test_with_features.csv',
    datafolder+'dev_with_features.csv',
)

# keep_default_na=False (with no na_values given) stops pandas from parsing any string as NaN,
# so empty cells are read as '' rather than as missing values.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path, keep_default_na=False) for csv_path in (train, dev, test)
)

# advanced SVM 
#### - pos_fw_emo = representation of the text as POS tags, function words, and emotion words; n-grams (n=1-3) are built from this representation (see the vectorizers below)
#### - count = number of emotion words in the text
#### - emotion_associations = emotion associations from the NRC emotion lexicon
#### - sentiment_score = sentiment score of the whole sentence, retrieved with siebert/sentiment-roberta-large-english from Hugging Face
#### - intent = intent of the sentence, retrieved with mrm8488/t5-base-finetuned-e2m-intent
#### - tokens = the tokens of the text, used as bag-of-words (BoW) unigrams

In [None]:
def metrics(gold_label, predicted, scores=None):
    """Summarise binary-classification quality as a one-row DataFrame.

    Parameters
    ----------
    gold_label : array-like
        True labels. ``roc_curve`` is called without ``pos_label``, so the labels
        must be binary in a form it accepts.
    predicted : array-like
        Hard label predictions, used for accuracy and the macro-averaged
        precision / recall / F1.
    scores : array-like, optional
        Continuous decision scores for the positive class, e.g.
        ``clf_svc.decision_function(X_dev)``. When given, the ROC curve is built
        from these scores. When omitted, ``predicted`` is used instead, which
        reproduces the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``F1-score``, ``Precision``,
        ``Recall``, ``Accuracy`` and ``AUROC``.

    Notes
    -----
    A ROC curve built from hard labels has only one operating point, so the
    resulting "AUROC" is just the balanced accuracy (identical to macro recall
    for two classes). Pass ``scores`` to obtain a genuine ranking-based AUROC.
    """
    accuracy = accuracy_score(gold_label, predicted)

    # Rank by real-valued scores when available; otherwise fall back to the labels.
    ranking = predicted if scores is None else scores
    fpr, tpr, thresholds = roc_curve(gold_label, ranking)
    AUROC = auc(fpr, tpr)

    # The fourth element (support) is None under macro averaging and is dropped.
    p, r, f1 = precision_recall_fscore_support(gold_label, predicted, average='macro')[:3]
    return pd.DataFrame({'F1-score': f1,
                        'Precision': p,
                        'Recall': r,
                        'Accuracy': accuracy,
                        'AUROC': AUROC}, index=[0])

In [23]:
# Count vectorizers, one per text column. Each tokenises by splitting on whitespace instead of
# using CountVectorizer's default token pattern. `str.split` yields the same tokens as the
# former `lambda x: x.split()` while keeping the fitted vectorizers picklable.
vectorizer1 = CountVectorizer(tokenizer=str.split, analyzer='word', ngram_range=(1, 3))  # 1- to 3-grams over the pos_fw_emo representation
vectorizer2 = CountVectorizer(tokenizer=str.split, analyzer='word', ngram_range=(1, 1))  # unigrams of emotion associations
vectorizer3 = CountVectorizer(tokenizer=str.split, analyzer='word', ngram_range=(1, 1))  # unigrams of tokens (BoW)
vectorizer4 = CountVectorizer(tokenizer=str.split, analyzer='word', ngram_range=(1, 4))  # 1- to 4-grams of intent (not unigrams only)


def build_features(df, fit=False, stylo_emotion_only=False):
    """Assemble the sparse feature matrix for one data split.

    Parameters
    ----------
    df : pandas.DataFrame
        Split providing the columns ``pos_fw_emo``, ``emotion_associations`` and
        ``count``, plus ``sentiment_score``, ``tokens`` and ``intent`` unless
        ``stylo_emotion_only`` is set.
    fit : bool, default False
        If True the vectorizers are fitted on ``df`` (use for the training split
        only); otherwise the already-fitted vocabularies are applied.
    stylo_emotion_only : bool, default False
        If True only the stylometric/emotion features are used (pos_fw_emo n-grams,
        emotion associations and the emotion-word count). This is the ablation
        that previously existed only as commented-out code.

    Returns
    -------
    scipy.sparse CSR matrix
        Column-wise concatenation of the selected feature blocks.
    """
    def vectorize(vectorizer, column):
        # Only the training split may define the vocabulary.
        if fit:
            return vectorizer.fit_transform(column)
        return vectorizer.transform(column)

    blocks = [
        vectorize(vectorizer1, df.pos_fw_emo),
        vectorize(vectorizer2, df.emotion_associations),
        df[['count']].values,  # bracket access: `df.count` would be the DataFrame method
    ]
    if not stylo_emotion_only:
        blocks += [
            df[['sentiment_score']].values,
            vectorize(vectorizer3, df.tokens),
            vectorize(vectorizer4, df.intent),
        ]
    return hstack(blocks, format='csr')


# combine the features (pass stylo_emotion_only=True to all three calls for the ablation run)
X_train = build_features(df_train, fit=True)
X_dev = build_features(df_dev)
X_test = build_features(df_test)

Y_train = df_train.label.values
Y_dev = df_dev.label.values
Y_test = df_test.label.values

In [24]:
# Train a linear SVM on the training features and predict the dev split.
# random_state is fixed so repeated runs give the same model.
# NOTE(review): max_iter is 1,000,000 here, while the output column written at the end of the
# notebook is named '..._100K_...' - confirm which value is intended.
clf_svc = LinearSVC(C=10, max_iter=1000000, random_state=456)
Y_pred = clf_svc.fit(X_train, Y_train).predict(X_dev)

#### Performance with all features, including BoW

In [25]:
# Dev-set scores for the predictions in Y_pred (per the heading: all features, BoW included).
metrics(Y_dev, Y_pred)

Unnamed: 0,F1-score,Precision,Recall,Accuracy,AUROC
0,0.50764,0.542501,0.533197,0.536,0.533197


#### Performance with only the stylo- & emotion-based features

In [22]:
# Same call as the previous cell, but the output shown differs.
# NOTE(review): this cell's execution count (22) is lower than the previous cell's (25), so the
# output presumably comes from an earlier run with a reduced feature matrix. It will not be
# reproduced by Restart & Run All - confirm and make the feature set explicit.
metrics(Y_dev, Y_pred)

Unnamed: 0,F1-score,Precision,Recall,Accuracy,AUROC
0,0.467233,0.496208,0.497096,0.5,0.497096


#### Performance without BoW: stylo- & emotion-based features plus sentiment score and intent

In [19]:
# Same call again; the output shown belongs to yet another feature configuration.
# NOTE(review): execution count 19 predates the cells above, so this result depends on kernel
# state that a top-to-bottom run does not recreate - confirm how X_train was built for it.
metrics(Y_dev, Y_pred)

Unnamed: 0,F1-score,Precision,Recall,Accuracy,AUROC
0,0.488543,0.522326,0.517098,0.52,0.517098


In [7]:
# Macro-averaged precision, recall and F1 on the dev split, shown as a one-row table.
# The trailing support value returned by sklearn is dropped.
macro_scores = precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3]
results = pd.DataFrame([dict(zip(('precision', 'recall', 'F1'), macro_scores))])
results

Unnamed: 0,precision,recall,F1
0,0.522326,0.517098,0.488543


In [5]:
# One-row table of the macro precision / recall / F1 for the current Y_pred.
# NOTE(review): this repeats the preceding `results` cell verbatim; only the kernel state differed.
score_row = precision_recall_fscore_support(Y_dev, Y_pred, average='macro')[:3]
score_columns = ['precision', 'recall', 'F1']
results = pd.DataFrame([score_row], columns=score_columns)
results

Unnamed: 0,precision,recall,F1
0,0.542501,0.533197,0.50764


In [6]:
# Store the predictions of the model trained on all features (BoW included).
Y_pred2 = clf_svc.predict(X_test)

# Both splits receive the predictions under the same column name.
prediction_column = 'Advanced_svm_linear_100K_C10'
for frame, predictions in ((df_dev, Y_pred), (df_test, Y_pred2)):
    frame[prediction_column] = predictions

# NOTE: these are the same CSV files the dev/test frames were loaded from, so they are
# overwritten in place with the extra prediction column.
for frame, filename in ((df_dev, 'dev_with_features.csv'), (df_test, 'test_with_features.csv')):
    frame.to_csv(datafolder+filename, index=False)