In [1]:
import os
import pickle as pkl
import sys

import numpy as np
import random as rn
import argparse
from itertools import count
from collections import defaultdict

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn import preprocessing

from tqdm import tqdm
from imblearn.over_sampling import SMOTE

## Fix random seeds for reproducibility

In [2]:
# Seed every random number generator used in this notebook (Python, NumPy, PyTorch).
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here does
# not change string hashing in the already-running kernel; it is only inherited by
# processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(12456)  # NOTE(review): 12456 differs from the 12345 used elsewhere — confirm it is intentional
rn.seed(12345)
torch.manual_seed(1234)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Tools

In [3]:
# Load labels from CSV file
def load_labels(files):
	"""Read the ``label`` column of the train and devel label CSV files.

	Args:
		files: Indexable whose element 0 is the train label CSV path and
			element 1 is the devel label CSV path (comma-separated, with a
			header row containing a ``label`` column).

	Returns:
		Tuple ``(y_train, y_devel)`` holding the ``label`` column values of
		each file as arrays.
	"""
	train_csv, devel_csv = files[0], files[1]

	def read_label_column(csv_path):
		# Only the 'label' column is kept; every other column is ignored.
		return pd.read_csv(csv_path, sep=',')['label'].values

	return read_label_column(train_csv), read_label_column(devel_csv)



# Load data from CSV file
def load_data(files):
	"""Load the train, devel and test feature matrices from CSV files.

	Args:
		files: Indexable with the train, devel and test CSV paths at
			positions 0, 1 and 2. The files are read as header-less,
			semicolon-separated tables.

	Returns:
		Tuple ``(X_train, X_devel, X_test)`` with the values of each table.
	"""
	train_path, devel_path, test_path = files[0], files[1], files[2]

	def read_matrix(csv_path):
		# No header row and no index column: every cell is data.
		return pd.read_csv(csv_path, header=None, index_col=False, sep=';').values

	train_matrix = read_matrix(train_path)
	devel_matrix = read_matrix(devel_path)
	test_matrix = read_matrix(test_path)
	return train_matrix, devel_matrix, test_matrix


# Saves predictions
def save_predictions(file_list_path, predictions, output_path):
	"""Write predictions next to their file ids as a two-column CSV.

	Args:
		file_list_path: CSV file with a ``file_id`` column; its values become
			the ``file_id`` column of the output.
		predictions: One prediction per entry of the file list.
		output_path: Destination CSV path (written without the index).
	"""
	file_ids = pd.read_csv(file_list_path).file_id.values
	pd.DataFrame(
		{'file_id': file_ids, 'predictions': predictions}
	).to_csv(output_path, index=False)


# plot training history
def plot_training_history(epochs, plottable, ylabel='', name=''):
	"""Plot one or two per-epoch curves and save the figure as ``<name>.png``.

	Args:
		epochs: Number of epochs; the x axis is ``np.arange(epochs)``.
		plottable: Sequence of curves. One curve is labelled 'Loss'; two
			curves are labelled 'Acc' and 'UAR'.
		ylabel: Label for the y axis.
		name: Output file name without the ``.png`` extension.

	Raises:
		ValueError: If ``plottable`` holds neither one nor two curves.
	"""
	# Legend labels keyed by the number of curves supplied.
	curve_labels_by_count = {1: ('Loss',), 2: ('Acc', 'UAR')}

	# The current pyplot figure is cleared and relabelled before validation.
	plt.clf()
	plt.xlabel('Epoch')
	plt.ylabel(ylabel)

	curve_labels = curve_labels_by_count.get(len(plottable))
	if curve_labels is None:
		raise ValueError('plottable passed to plot function has incorrect dim.')

	for position, curve_label in enumerate(curve_labels):
		plt.plot(np.arange(epochs), plottable[position], label=curve_label)

	plt.legend()
	plt.savefig('%s.png' % name, bbox_inches='tight')


## SVM functions

In [4]:
# Train Model
def train_svm(X, y, parms, weights=None):
    """Grid-search an SVC over ``parms`` and return the fitted search object.

    Args:
        X: Training features, forwarded unchanged to ``GridSearchCV.fit``.
        y: Training labels, forwarded unchanged to ``GridSearchCV.fit``.
        parms: Parameter grid handed to ``GridSearchCV`` (SVC keyword names
            mapped to lists of candidate values).
        weights: Per-sample weights forwarded to ``fit`` as
            ``sample_weight``. ``None`` or any ``int`` (existing callers
            pass ``1``) means "fit without sample weights".

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    # NOTE(review): scoring="recall" selects candidates with the binary recall
    # scorer, while test_svm reports macro-averaged metrics — confirm this is
    # the intended selection metric (e.g. versus "recall_macro").
    search = GridSearchCV(
        SVC(verbose=0, random_state=12345),
        parms,
        n_jobs=-1,
        scoring="recall",
    )

    # Only forward sample_weight when real weights were supplied; the int
    # sentinel is kept for backward compatibility with existing calls.
    fit_kwargs = {}
    if weights is not None and not isinstance(weights, int):
        fit_kwargs["sample_weight"] = weights
    search.fit(X, y, **fit_kwargs)

    return search

# Compute Predictions and Metrics
def test_svm(X, y, clf):
    """Predict with ``clf`` and score the predictions against ``y``.

    Args:
        X: Feature matrix passed to ``clf.predict``.
        y: Reference labels.
        clf: Fitted estimator exposing ``predict``.

    Returns:
        Tuple ``(prf, accuracy, preds)`` where ``prf`` is the result of
        ``precision_recall_fscore_support`` macro-averaged over labels 0 and
        1, ``accuracy`` is the accuracy score and ``preds`` are the raw
        predictions.
    """
    predicted = clf.predict(X)

    # Macro averaging gives one figure per metric; drop ``average`` to get
    # per-class values instead.
    macro_prf = precision_recall_fscore_support(
        y, predicted, labels=[0, 1], average='macro'
    )

    return macro_prf, accuracy_score(y, predicted), predicted


## Directories and label files

In [51]:
directory = "." # Base folder containing 'features/'; "." is relative to the notebook's working directory
# Label files
labels_train = "../lab3_part1/corpus/labels/train_labels.csv"
labels_devel = "../lab3_part1/corpus/labels/dev_labels.csv"
# List of test files; save_predictions reads its 'file_id' column
test_list_path = "../lab3_part1/corpus/labels/test_file_list.txt"

# Order expected by load_labels: [train, devel]
label_files = [labels_train, labels_devel]

#### eGeMAPS feature set

In [55]:
feature_set = "egemaps" # prefix of the feature file names under <directory>/features/

# Feature CSVs for this feature set: <feature_set>_scaled_{train,dev,test}_trimmed.csv
data_train = directory + '/features/' + feature_set + '_scaled_train_trimmed.csv'
data_devel = directory + '/features/' + feature_set + '_scaled_dev_trimmed.csv'
data_test  = directory + '/features/' + feature_set + '_scaled_test_trimmed.csv'
# Order expected by load_data: [train, devel, test]
data_files = [data_train, data_devel, data_test]

# Feature matrices for all three splits; labels exist only for train and devel.
X_train, X_devel, X_test = load_data(data_files)
y_train, y_devel = load_labels(label_files)

In [56]:
sm = SMOTE(random_state=12345)

In [57]:
# Resample the training split only with SMOTE; devel and test data are left untouched.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [58]:
df = pd.read_csv(labels_train)

In [14]:
# Grid for the RBF-kernel SVC: 3 C values x 3 gamma values x 2 class-weight settings = 18 candidates.
param_grid = {'C': [100, 10, 1], 'gamma': [0.001, 0.01, 0.1] , 'kernel': ['rbf'], 'class_weight' : ["balanced", {0: 1, 1: 1}]}

In [24]:
# The last argument is train_svm's `weights`; an int such as 1 means "fit without sample weights".
model = train_svm(X_train_res, y_train_res, param_grid, 1)

KeyboardInterrupt: 

In [45]:
model.best_estimator_

SVC(C=100, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=12345, shrinking=True,
    tol=0.001, verbose=0)

In [25]:
# Persist the fitted grid search so later cells can reload it without retraining.
# The context manager guarantees the file is flushed and closed.
with open(f"{feature_set}_svm_model_rbf_100_0.01.pkl", 'wb') as model_file:
    pkl.dump(model, model_file)

In [59]:
# Reload the grid search saved by the dump cell for this feature set.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open(f"{feature_set}_svm_model_rbf_100_0.01.pkl", 'rb') as model_file:
    model = pkl.load(model_file)

In [60]:
train_prf, train_accuracy, train_preds = test_svm(X_train_res, y_train_res, model)

In [61]:
train_prf

(0.990492930009059, 0.9904615384615385, 0.9904613858437119, None)

In [29]:
train_accuracy

0.9904615384615385

In [62]:
dev_prf, dev_accuracy, dev_preds = test_svm(X_devel, y_devel, model)

In [63]:
dev_accuracy

0.520537714712472

In [64]:
dev_prf

(0.5024612693653173, 0.5026090327676216, 0.5000593202596134, None)

In [34]:
prediction_test = model.predict(X_test)

In [35]:
df = pd.read_csv(test_list_path)

In [36]:
save_predictions(labels_devel, dev_preds, "egemaps_dev_result_svm.csv")
save_predictions(test_list_path, prediction_test, "egemaps_test_result_svm.csv")

#### IS11 feature set

In [65]:
feature_set = "is11" # prefix of the feature file names under <directory>/features/

# Feature CSVs for this feature set: <feature_set>_scaled_{train,dev,test}_trimmed.csv
data_train = directory + '/features/' + feature_set + '_scaled_train_trimmed.csv'
data_devel = directory + '/features/' + feature_set + '_scaled_dev_trimmed.csv'
data_test  = directory + '/features/' + feature_set + '_scaled_test_trimmed.csv'
# Order expected by load_data: [train, devel, test]
data_files = [data_train, data_devel, data_test]

# Rebinds X_*/y_* to the IS11 data; the eGeMAPS arrays loaded earlier are replaced.
X_train, X_devel, X_test = load_data(data_files)
y_train, y_devel = load_labels(label_files)

In [66]:
sm = SMOTE(random_state=12345)

In [67]:
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [68]:
# Single-candidate grid: linear kernel, C=1, balanced class weights.
# gamma is a coefficient of the rbf/poly/sigmoid kernels, so it should have no effect with 'linear'.
param_grid = {'C': [1], 'gamma': [0.01] , 'kernel': ['linear'], 'class_weight' : ["balanced"]}

In [34]:
model = train_svm(X_train_res, y_train_res, param_grid, 1)

In [71]:
model.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
    max_iter=-1, probability=False, random_state=12345, shrinking=True,
    tol=0.001, verbose=0)

In [36]:
# Persist the fitted grid search so later cells can reload it without retraining.
# The context manager guarantees the file is flushed and closed.
with open(f"{feature_set}_svm_model.pkl", 'wb') as model_file:
    pkl.dump(model, model_file)

In [70]:
# Reload the grid search saved by the dump cell for this feature set.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open(f"{feature_set}_svm_model.pkl", 'rb') as model_file:
    model = pkl.load(model_file)

In [72]:
# Scores the original (un-resampled) training split.
# NOTE(review): the eGeMAPS section scores X_train_res / y_train_res instead — confirm which is intended.
train_prf, train_accuracy, train_preds = test_svm(X_train, y_train, model)

In [79]:
train_accuracy

1.0

In [80]:
train_prf

(1.0, 1.0, 1.0, None)

In [74]:
dev_prf, dev_accuracy, dev_preds = test_svm(X_devel, y_devel, model)

In [76]:
dev_prf

(0.5025163834053261, 0.5027017395157096, 0.4974962945849505, None)

In [77]:
dev_accuracy

0.5123226288274833

In [43]:
prediction_test = model.predict(X_test)

In [44]:
df = pd.read_csv(test_list_path)

In [46]:
save_predictions(labels_devel, dev_preds, "is11_dev_result_svm.csv")

In [47]:
save_predictions(test_list_path, prediction_test, "is11_test_result_svm.csv")

### Dummy Classifier

In [345]:
from sklearn.dummy import DummyClassifier

In [346]:
# Baseline for comparison: the "prior" strategy ignores the features and always
# predicts the most frequent class seen in training.
dummy = DummyClassifier(strategy="prior")

In [347]:
dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='prior')

In [348]:
train_prf, train_accuracy, train_preds = test_svm(X_train, y_train, dummy)

  _warn_prf(average, modifier, msg_start, len(result))


In [349]:
train_accuracy

0.6025213199851687

In [350]:
train_accuracy

0.6025213199851687

In [351]:
train_prf

(0.30126065999258433, 0.5, 0.3759833410458121, None)

In [352]:
dev_prf, dev_accuracy, dev_preds = test_svm(X_devel, y_devel, dummy)

In [353]:
dev_accuracy

0.6355489171023152

In [354]:
dev_prf

(0.3177744585511576, 0.5, 0.38858447488584474, None)