In [125]:
# UMich EECS 445 Project 1

In [126]:
import pandas as pd
import numpy as np
import itertools

import string as s
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn import metrics
from matplotlib import pyplot as plt;
from sklearn.metrics import confusion_matrix

In [127]:
# load_data

def load_data(fname):
    """
    Load a csv file into a pandas dataframe.

    Input:
        fname - csv source handed straight to pd.read_csv
    Returns:
        the parsed dataframe. Columns are accessed by name like dictionary
        keys, e.g. df['label'], df['content'] or df['sentiment'] when the
        file provides those columns.
    """
    frame = pd.read_csv(fname)
    return frame

# Load the dataset once for the whole notebook; the trailing expression
# displays the number of rows that were read.
data = load_data('dataset.csv')
#data['label'], data['content']
len(data)

500

In [128]:
def extract_dictionary(df):
    """
    Reads a pandas dataframe and returns a dictionary of the distinct words
        found in its 'content' column, mapping each distinct word to its index
        (ordered by when it was found).
    Punctuation is replaced by spaces, the text is split on whitespace and
        every word is lower-cased before it is looked up.
    Input:
        df - dataframe/output of load_data(); needs a 'content' column of strings
    Returns:
        a dictionary of distinct words
        mapping from the distinct word to its 0-based index (ordered by when
        it was found).
    """
    # Every punctuation character becomes a space, so "good,bad" is two words.
    transtable = str.maketrans(s.punctuation, ' ' * len(s.punctuation))
    word_dict = {}
    # Walk the column of the dataframe that was passed in (not a notebook-level
    # global) and iterate by value, so a non-default row index cannot break it.
    for post in df['content']:
        for word in post.translate(transtable).split():
            word = word.lower()
            if word not in word_dict:
                # The current size is exactly the next unused index.
                word_dict[word] = len(word_dict)

    return word_dict
    
# Build the vocabulary from the loaded posts and display its size.
word_dict = extract_dictionary(data)
len(word_dict)

2201

In [129]:
def generate_feature_matrix(df, word_dict):
    """
    Reads a dataframe and the dictionary of words in the posts
        to generate {1, 0} feature vectors for each post. The resulting feature
        matrix is of dimension (number of posts, number of words).
    Entry [i][j] is 1 when the word with index j occurs in the i-th row of
        df['content'] and 0 otherwise. Words missing from word_dict are
        ignored. Text is tokenised by replacing punctuation with spaces,
        splitting on whitespace and lower-casing.
    Input:
        df - dataframe that has the posts in a 'content' column of strings
        word_dict - dictionary of words mapping to column indices
    Returns:
        a float feature matrix of dimension (len(df), len(word_dict))
    """
    fmatrix = np.zeros((len(df), len(word_dict)))
    transtable = str.maketrans(s.punctuation, ' ' * len(s.punctuation))
    # Enumerate the rows of the dataframe that was passed in, so the row count
    # always matches the matrix allocated above and a non-default index is fine.
    for row, post in enumerate(df['content']):
        for word in post.translate(transtable).split():
            word = word.lower()
            if word in word_dict:
                fmatrix[row, word_dict[word]] = 1

    return fmatrix

# Encode every post as a binary bag-of-words row and display the matrix shape.
fmatrix = generate_feature_matrix(data, word_dict)
fmatrix.shape

(500, 2201)

In [130]:
def generate_label(df):
    """
    Turns the 'sentiment' column of a dataframe into a numeric label vector.
    Input:
        df - dataframe returned by load_data(); needs a 'sentiment' column
    Output:
        numpy.array of floats with length len(df): 1 where the sentiment is
        'love', 0 for every other value (e.g. hate)
    """
    # Compare the column of the dataframe that was passed in, positionally, so
    # the result does not depend on a notebook-level global or on the row index.
    return np.asarray(df['sentiment'] == 'love', dtype=float)
        
# Binary target vector for the loaded posts: 1 for 'love', 0 otherwise.
label = generate_label(data)

In [131]:
def _fold_score(y_true, y_pred, metric):
    """Scores one fold's hard predictions; the positive class is the label 1."""
    if metric == "accuracy":
        return np.mean(y_pred == y_true)

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum(~actual_pos & predicted_pos)
    fn = np.sum(actual_pos & ~predicted_pos)
    tn = np.sum(~actual_pos & ~predicted_pos)

    if metric == "precision":
        numer, denom = tp, tp + fp
    elif metric == "sensitivity":
        numer, denom = tp, tp + fn
    elif metric == "specificity":
        numer, denom = tn, tn + fp
    else:  # "f1-score"
        numer, denom = 2 * tp, 2 * tp + fp + fn
    # An undefined ratio (empty denominator) is scored as 0 rather than NaN so
    # a single degenerate fold cannot poison the average.
    return numer / denom if denom else 0.0


def cv_performance(clf, X, y, k=5, metric="accuracy"):
    """
    Splits the data, X and y, into k stratified folds and runs k-fold
        crossvalidation: training a classifier on k-1 folds and testing on the
        remaining fold. Calculates the k-fold crossvalidation performance
        metric for classifier clf by averaging the performance across folds.
    Input:
        clf - an instance of SVC() (anything with fit/predict; 'auroc'
            additionally needs decision_function). It is re-fitted on every fold.
        X - (n,d) array of feature vectors, where n is the number of examples
           and d is the number of features
        y - (n,) array of binary labels; the positive class must be labelled 1
            (the negative class may be 0 or -1)
        k - int specifying the number of folds (default=5)
        metric - string specifying the performance metric (default='accuracy',
            other options: 'f1-score', 'auroc', 'precision', 'sensitivity',
            and 'specificity')
    Returns: average 'test' performance across the k folds as np.float64
    Raises: ValueError if metric is not one of the options listed above
    """
    supported = ("accuracy", "f1-score", "auroc", "precision",
                 "sensitivity", "specificity")
    if metric not in supported:
        raise ValueError("unknown metric %r; expected one of %s"
                         % (metric, ", ".join(supported)))

    skf = StratifiedKFold(n_splits=k)
    fold_scores = []
    for train_index, test_index in skf.split(X, y):
        clf.fit(X[train_index], y[train_index])
        y_true = y[test_index]
        if metric == "auroc":
            # AUROC ranks examples, so it needs continuous scores, not labels.
            y_score = clf.decision_function(X[test_index])
            fold_scores.append(metrics.roc_auc_score(y_true, y_score))
        else:
            fold_scores.append(_fold_score(y_true, clf.predict(X[test_index]), metric))

    return np.mean(fold_scores)
    
# Baseline: cross-validated score of a linear-kernel SVC with otherwise default
# settings, using cv_performance's default k and metric.
cv_performance(SVC(kernel='linear'), fmatrix, label)

0.71201720172017191

In [132]:
def select_param_linear(X, y, k=5, metric="accuracy", C_range = [], penalty='l2'):
    """
    Sweeps different settings for the hyperparameter C of a linear-kernel SVM
        (with class_weight='balanced'), calculating the k-fold CV performance
        for each setting on X, y.
    Input:
        X - (n,d) array of feature vectors, where n is the number of examples
            and d is the number of features
        y - (n,) array of binary labels, as expected by cv_performance
        k - int specifying the number of folds (default=5)
        metric - string specifying the performance metric (default='accuracy',
            other options: 'f1-score', 'auroc', 'precision', 'sensitivity',
            and 'specificity'); passed through to cv_performance
        C_range - an array with all C values to be checked for
        penalty - accepted but currently not used by the linear-kernel SVC
    Returns the value in C_range that 'maximizes' the average k-fold CV
        performance; ties keep the earliest value. Returns 0 when C_range is
        empty.
    """
    # Start below every achievable score so the first candidate is always
    # accepted, even when all candidates score exactly 0.
    best_perf = float("-inf")
    final_c = 0
    for c in C_range:
        perf = cv_performance(SVC(kernel='linear', C=c, class_weight='balanced'), X, y, k, metric)
        if perf > best_perf:
            best_perf = perf
            final_c = c

    return final_c

# Candidate C values for the sweep: 0.001, 1 and 1000.
C_range = np.array([10**(-3), 1, 10**(3)])
select_param_linear(fmatrix, label, C_range=C_range)

1000.0