In [None]:
import numpy as np
import pandas as pd
import scipy as sp

# Load the training and development (validation) splits. Later cells read the
# 'text' and 'sentiment' columns from both frames.
df_train = pd.read_csv("IA3-train.csv")
df_val = pd.read_csv("IA3-dev.csv")

# PART 0: Preprocessing

## a)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Split the training tweets by sentiment label (1 -> "pos", 0 -> "neg").
df_pos_tweets = df_train[df_train['sentiment'] == 1]
df_neg_tweets = df_train[df_train['sentiment'] == 0]

# Fit a separate bag-of-words vocabulary on each class so the two vocabularies
# (and their per-word counts) can be inspected independently.
pos_vectorizer = CountVectorizer()
neg_vectorizer = CountVectorizer()
pos_tweets_token_counts = pos_vectorizer.fit_transform(df_pos_tweets['text'])
neg_tweets_token_counts = neg_vectorizer.fit_transform(df_neg_tweets['text'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. list(...) keeps the plain-list type the old call returned.
pos_tweets_words = list(pos_vectorizer.get_feature_names_out())
neg_tweets_words = list(neg_vectorizer.get_feature_names_out())

# Vocabulary size (number of distinct tokens) per class.
print(len(pos_tweets_words))
print(len(neg_tweets_words))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): this cell rebinds the names set by the training-split cell
# (df_pos_tweets, pos_tweets_token_counts, pos_tweets_words, ...), so every later
# cell that reads them sees validation data on a top-to-bottom run -- confirm
# that this is intended.
df_pos_tweets = df_val[df_val['sentiment'] == 1]
df_neg_tweets = df_val[df_val['sentiment'] == 0]

# Fit a separate bag-of-words vocabulary on each class of the validation split.
pos_vectorizer = CountVectorizer()
neg_vectorizer = CountVectorizer()
pos_tweets_token_counts = pos_vectorizer.fit_transform(df_pos_tweets['text'])
neg_tweets_token_counts = neg_vectorizer.fit_transform(df_neg_tweets['text'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. list(...) keeps the plain-list type the old call returned.
pos_tweets_words = list(pos_vectorizer.get_feature_names_out())
neg_tweets_words = list(neg_vectorizer.get_feature_names_out())

# Vocabulary size (number of distinct tokens) per class.
print(len(pos_tweets_words))
print(len(neg_tweets_words))

# Share of negative tweets in this split. The class balance is a ratio of tweet
# counts (rows); the previous expression divided vocabulary sizes instead, which
# measures how many distinct words each class uses, not how many comments it has.
n_pos_tweets = len(df_pos_tweets)
n_neg_tweets = len(df_neg_tweets)
print(n_neg_tweets / (n_neg_tweets + n_pos_tweets))

**Another important note is the imbalance of the training data. Negative comments make up approximately 0.71 of the training set and 0.72 of the validation set. Initially, we were not aware of the issue until we observed some odd trends in the model's number of support vectors as C increases. Further testing with other metrics confirmed that it was indeed an accuracy paradox. In particular, the model's validation recall is 0.47, while its balanced validation accuracy (the average of the recall obtained on each class) is only 0.73, which is only slightly higher than blindly classifying every comment as negative (which would yield 0.72 validation accuracy).

The accuracy paradox is a situation in which a model has excellent accuracy that only reflects the underlying class distribution. It is often caused by an imbalanced data set, which is very common and expected. For example, a data set for fraudulent-transaction classification is probably imbalanced because most transactions are not fraudulent.

One of the remedies for this problem is cost-sensitive training, or class-weighted SVMs, in which each class is assigned a different misclassification cost based on the class ratio. Unsurprisingly, scikit-learn supports this method for tackling this common problem. Training an SVC constructed with the option class_weight="balanced" resulted in much better performance. Specifically, the model's balanced validation accuracy went up to 0.82, and its recall increased to 0.65. Its validation accuracy also improved from 0.89 to 0.92.**

In [None]:
from IA3_skeleton_code import getMaxes

# Collapse each document-term matrix to one total per vocabulary entry.
# .sum(axis=0) gives a single-row result, so [0] extracts that row as a flat list.
sum_pos_tweets_token_counts = pos_tweets_token_counts.sum(axis=0).tolist()[0]
sum_neg_tweets_token_counts = neg_tweets_token_counts.sum(axis=0).tolist()[0]

# getMaxes comes from the course skeleton; presumably it returns the indices of
# the 10 largest totals -- verify against IA3_skeleton_code.
pos_tweets_most_freq_indices = getMaxes(sum_pos_tweets_token_counts, 10)
neg_tweets_most_freq_indices = getMaxes(sum_neg_tweets_token_counts, 10)

# One (label, vocabulary, totals, top indices) entry per class, so both classes
# are reported by the same loop. This also fixes the "negetive" typo in the output.
class_reports = (
    ("positive", pos_tweets_words, sum_pos_tweets_token_counts,
     pos_tweets_most_freq_indices),
    ("negative", neg_tweets_words, sum_neg_tweets_token_counts,
     neg_tweets_most_freq_indices),
)

for position, (label, words, totals, top_indices) in enumerate(class_reports):
    if position:
        # Separator between the two class listings.
        print("==============================")
    print("The 10 most frequent words in the %s comments: " % label)
    for i in top_indices:
        print("\"%s\" occurs %d times" % (words[i], totals[i]))

These words do not seem to correlate semantically with the label of the comments. 
For example, the most frequent words, such as "the", "to", "you", and "for", are 
neutral, suggesting neither negativity nor positivity.

## b)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a separate TF-IDF model on each class. Because each vectorizer is fitted on
# its own subset, the weights are only comparable within a class.
pos_tfidfvectorizer = TfidfVectorizer(use_idf=True, lowercase=True)
neg_tfidfvectorizer = TfidfVectorizer(use_idf=True, lowercase=True)
pos_tweets_tfidf = pos_tfidfvectorizer.fit_transform(df_pos_tweets['text'])
neg_tweets_tfidf = neg_tfidfvectorizer.fit_transform(df_neg_tweets['text'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is
# its replacement. list(...) keeps the plain-list type the old call returned.
pos_tweets_words = list(pos_tfidfvectorizer.get_feature_names_out())
neg_tweets_words = list(neg_tfidfvectorizer.get_feature_names_out())

# Total TF-IDF weight of every vocabulary entry, summed over all tweets of the class.
sum_pos_tweets_tfidf = pos_tweets_tfidf.sum(axis=0).tolist()[0]
sum_neg_tweets_tfidf = neg_tweets_tfidf.sum(axis=0).tolist()[0]

# getMaxes comes from the course skeleton; presumably it returns the indices of
# the 10 largest totals -- verify against IA3_skeleton_code.
pos_tweets_most_freq_indices = getMaxes(sum_pos_tweets_tfidf, 10)
neg_tweets_most_freq_indices = getMaxes(sum_neg_tweets_tfidf, 10)

# The totals are fractional TF-IDF weights, not occurrence counts. The earlier
# "%d ... times" format truncated them to integers and mislabelled them, so the
# report now prints them as floats under an accurate description.
print("The 10 words with the highest total TF-IDF weight in the positive comments: ")
for i in pos_tweets_most_freq_indices:
    print("\"%s\" has total TF-IDF weight %.3f" % (pos_tweets_words[i], sum_pos_tweets_tfidf[i]))

print("==============================")

print("The 10 words with the highest total TF-IDF weight in the negative comments: ")
for i in neg_tweets_most_freq_indices:
    print("\"%s\" has total TF-IDF weight %.3f" % (neg_tweets_words[i], sum_neg_tweets_tfidf[i]))

# PART 1: LINEAR SVM

## 1. C value that results in the best validation performance

In [None]:
# from IA3_skeleton_code import trainSVM

from sklearn.svm import SVC

def trainSVM(X_train, y_train, X_val, y_val, c, kernel, deg=3):
    """
    Description: fit a scikit-learn SVC on the training split and score it on 
                 both the training and the validation split
    Param:
        X_train  [in]: training features
        y_train  [in]: training labels
        X_val    [in]: validation features
        y_val    [in]: validation labels
        c        [in]: value passed to SVC as its regularization parameter C
        kernel   [in]: kernel name passed to SVC
        deg      [in]: value passed to SVC as `degree` (default 3)
    Return: training accuracy, validation accuracy, and the fitted model's 
            `n_support_` attribute (respectively)
    """
    # Sample counts are read up front, before any fitting happens.
    sizes = (X_train.shape[0], X_val.shape[0])

    classifier = SVC(C=c, kernel=kernel, degree=deg, max_iter=25000)
    classifier.fit(X_train, y_train)

    predictions = (classifier.predict(X_train), classifier.predict(X_val))
    truths = (y_train, y_val)

    # A nonzero entry of (prediction - label) marks a misclassified sample, so
    # accuracy is the fraction of entries where that difference is zero.
    acc_train, acc_val = (
        (size - np.count_nonzero(predicted - actual)) / size
        for size, predicted, actual in zip(sizes, predictions, truths)
    )

    return acc_train, acc_val, classifier.n_support_

# A single TF-IDF vocabulary is learned from the training tweets and then reused,
# unchanged, to encode the validation tweets.
tfidfvectorizer = TfidfVectorizer(use_idf=True, lowercase=True)

X_train = tfidfvectorizer.fit_transform(df_train['text'])
y_train = df_train['sentiment']

# The validation matrix is densified here; the training matrix is left as returned.
X_val = tfidfvectorizer.transform(df_val['text']).toarray()
y_val = df_val['sentiment']

# Sweep C over the powers of ten 10^-4 ... 10^4.
exp = range(-4, 5)
c = [10 ** power for power in exp]

n_val = len(y_val)
acc_train, acc_val, SVMs = {}, {}, {}
for c_value in c:
    # Results are keyed by the C value that produced them.
    acc_train[c_value], acc_val[c_value], SVMs[c_value] = trainSVM(
        X_train, y_train, X_val, y_val, c_value, "linear")

## 2. Trend of training and validation performance with regard to C

In [None]:
import matplotlib.pyplot as plt

# Training and validation accuracy against the exponent i of the C = 10^i sweep.
fig, ax = plt.subplots(figsize=(6, 6))
for curve_label, scores in (("Training", acc_train), ("Validation", acc_val)):
    ax.plot(exp, list(scores.values()), label=curve_label)

ax.set_xlabel("i")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy in regards to c = 10^i")
ax.legend()

## 3. Relationship between the number of support vectors and C

In [None]:
from IA3_skeleton_code import plotNSV

# Hand the sweep exponents and the recorded n_support_ values to the course-provided
# plotNSV helper (presumably it plots support-vector counts -- see IA3_skeleton_code).
# NOTE(review): "lsvmNSV.jpg" is the same path a later cell passes to plt.savefig;
# if plotNSV writes to that path, the later cell overwrites this figure -- confirm.
plotNSV(exp, SVMs, "Linear SVM", "lsvmNSV.jpg")

It looks like the best C values are in the range [0.1, 1].

In [None]:
# Start a fresh set of results for the fine-grained search over C in [l, r].
acc_val = {}
acc_train = {}
SVMs = {}
l = 0.1
r = 1

max_depth = 15
cur_depth = 0


def _score_c(c_value):
    """Return the validation accuracy for c_value, training a linear SVM only on a cache miss."""
    if c_value not in acc_val:
        acc_train[c_value], acc_val[c_value], SVMs[c_value] = trainSVM(
            X_train, y_train, X_val, y_val, c_value, "linear")
    return acc_val[c_value]


# Interval-narrowing search: score both ends and the midpoint, then shrink the
# interval towards whichever of the three has the best validation accuracy.
# The `<=` means the loop runs max_depth + 1 rounds.
while cur_depth <= max_depth:
    m = (l + r) / 2

    # All three points go through the cache. Previously the midpoint was retrained
    # unconditionally, and when the midpoint wins the next round's midpoint is
    # the same C again (up to rounding), so that fit was repeated.
    acc_l, acc_r, acc_m = _score_c(l), _score_c(r), _score_c(m)

    acc_val_max = max(acc_m, acc_r, acc_l)

    if acc_val_max == acc_m:
        # Midpoint is best (ties favour it): halve the interval around it.
        l = (m + l) / 2
        r = (m + r) / 2
    elif acc_val_max == acc_l:
        # Left end is best: keep the left half.
        r = m
    else:
        # Right end is best: keep the right half.
        l = m

    cur_depth += 1

In [None]:
# Two side-by-side panels: the full search on the left, a zoomed window on the right.
# The figure handle now has its own name instead of being stored in `ax1` and
# immediately overwritten by the first subplot.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

c_values = list(acc_val.keys())
val_scores = list(acc_val.values())

ax1.scatter(c_values, val_scores)
ax1.set_xlabel("C")
ax1.set_ylabel("Validation accuracy")
ax1.set_title("Validation accuracy in regards to c")

# Same points restricted to a hand-picked window (C in 0.7-1.1, accuracy in
# 0.92-0.93). The panel now carries its own axis labels so it can be read alone.
ax2.scatter(c_values, val_scores)
ax2.set(xlim=(0.7, 1.1), ylim=(0.92, 0.93))
ax2.set_xlabel("C")
ax2.set_ylabel("Validation accuracy")
ax2.set_title("Zoomed in")

In [None]:
# Model with highest validation accuracy
best_c = max(acc_val, key=acc_val.get)

print("The best model has accuracy of %f with C = %f" % (acc_val[best_c], best_c))

It looks like C values in the neighborhood of 0.88 yield models with the highest 
accuracy, 0.9312.

In [None]:
# Display the n_support_ value that trainSVM returned for every C tried in the search.
SVMs

In [None]:
fig = plt.figure(figsize=(14,7))

# SVMs maps each C to the n_support_ value returned by trainSVM; summing it gives
# the total number of support vectors for that model. Iterating .items() directly
# replaces the redundant zip(keys(), items()).
for c, svs in SVMs.items():
    sv = sum(svs)
    # Marker area grows with the support-vector count.
    plt.scatter(c, sv, s=sv*0.05)

plt.xlabel("c")
plt.ylabel("Support Vector Counts")
# The search interval starts at l = 0.1 and r = 1, so the title states [0.1,1].
plt.title("Number of SV's for c in range [0.1,1]| " + "Linear SVM")
# NOTE(review): this is the same path passed to plotNSV in an earlier cell; if
# that helper also saves to it, this call overwrites that figure -- confirm.
plt.savefig("lsvmNSV.jpg")
plt.show()