# Debate Exploration Using Natural Language Processing
## Bag of Words Model
### Warning: this notebook takes a long time to run

In [4]:
import numpy as np
import pandas as pd
import sklearn
import string
import random

from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt

In [1]:
def remove_punctuation(str):
    """
    Strip punctuation from a piece of text.
    Input:
        str: the text to clean (note: this parameter name shadows the
             built-in ``str`` inside this function)
    Returns:
        a new string containing every character of the input, in order,
        except those listed in ``string.punctuation``
    """
    # Keep a character only when it is not an ASCII punctuation mark.
    return ''.join(char for char in str if char not in string.punctuation)

def add_to_dict(str, dict):
    """
    Register every whitespace-separated word of a text in a word -> index map.
    Input:
        str: the text whose words should be registered
        dict: mapping from lower-cased word to index; updated in place
    Returns:
        the same mapping object that was passed in
    """
    for token in str.split():
        # setdefault leaves known words untouched; for a new word the index
        # is the mapping's size before insertion (arguments are evaluated
        # before the call), i.e. the next unused index.
        dict.setdefault(token.lower(), len(dict))
    return dict

def extract_dictionary(df):
    """
    Builds the vocabulary of a dataframe of speeches: a dictionary mapping
    each distinct lower-cased word to the order in which it was first seen.
    Side effect: the ``text`` column of ``df`` is overwritten in place with
    its punctuation-free version before the words are collected.
    Input:
        df: dataframe with a ``text`` column holding one speech per row
    Returns:
        a dictionary that maps each distinct word to a unique index
        corresponding to when it was first found while iterating over the
        words of each speech in row order
    """
    # Clean the text first so that "word," and "word" share one entry.
    df.text = df.text.apply(remove_punctuation)
    word_dict = {}
    # Walk the speeches in row order; add_to_dict updates word_dict in place.
    for speech in df.text:
        add_to_dict(speech, word_dict)
    vocabulary_size = len(word_dict)
    print("Number of unique words: " + str(vocabulary_size))
    return word_dict

In [2]:
def add_columns_to_feature_matrix(features, matrix):
    """
    Appends extra feature columns to the right-hand side of a feature matrix.
    Input:
        features: sequence of arrays; they are column-stacked and the result
                  is transposed before being appended
        matrix: array to extend along axis 1
    Returns:
        a new array made of ``matrix`` followed by the new columns
    """
    extra_columns = np.column_stack(features).T
    return np.concatenate((matrix, extra_columns), axis = 1)

def generate_feature_matrix(df, word_dict, scale = 'None'):
    """
    Reads a dataframe and the dictionary of unique words
    to generate a matrix of {1, 0} feature vectors for each speech.
    Entry (i, j) is 1 when the word with index j in word_dict occurs
    (case-insensitively, as a whitespace-separated token) in speech i.
    Input:
        df: dataframe with a ``text`` column holding one speech per row
        word_dict: dictionary of words mapping to column indices
        scale: 'None' returns the raw {1, 0} matrix; 'Normalize' divides
               each row by its L2 norm (all-zero rows stay zero); 'Scale'
               multiplies each row by its L2 norm
    Returns:
        a feature matrix of dimension (number of speeches, number of words)
    Raises:
        ValueError: if ``scale`` is not one of the three options above
    """
    number_of_speeches = df.shape[0]
    number_of_words = len(word_dict)
    feature_matrix = np.zeros((number_of_speeches, number_of_words))
    for i, text in enumerate(df.text):
        # Tokenise each speech once and look the tokens up in the dictionary,
        # instead of re-splitting the speech for every vocabulary word.
        for token in set(str(text).lower().split()):
            column = word_dict.get(token)
            if column is not None:
                feature_matrix[i, column] = 1

    print("Average number of non-zero features: " + str(np.mean(np.sum(feature_matrix, axis = 1))))
    if scale == "None":
        return feature_matrix

    row_norms = np.linalg.norm(feature_matrix, axis = 1)[:, None]
    if scale == "Normalize":
        # A speech with no known word has norm 0; divide by 1 instead so the
        # row stays all-zero rather than turning into NaN.
        safe_norms = np.where(row_norms == 0, 1.0, row_norms)
        return feature_matrix / safe_norms
    if scale == "Scale":
        return feature_matrix * row_norms
    raise ValueError("scale must be 'None', 'Normalize' or 'Scale', got " + repr(scale))

In [3]:
def get_data(candidate, dataframe, dictionary, random_state = None):
    """
    Builds a class-balanced random train/test split for one candidate.
    Speeches by ``candidate`` are the positive class and all other speeches
    the negative class. 75% of the positive speeches go to training and the
    rest to testing; the same number of negative speeches is drawn (without
    overlap between train and test) for each side.
    Input:
        candidate: value of the ``name`` column that marks positive rows
        dataframe: dataframe with ``name`` and ``text`` columns
        dictionary: word -> index mapping passed to generate_feature_matrix
        random_state: optional seed forwarded to ``DataFrame.sample``;
                      None (default) keeps the split random on every call
    Returns:
        (X_train, Y_train, X_test, Y_test) where the X's are feature
        matrices from generate_feature_matrix and the Y's are boolean
        Series that are True for speeches by ``candidate``
    """
    # reset_index gives every subset a unique positional index, so the
    # train/test complement can be taken by label with .drop() below.
    positiveDF = dataframe[dataframe.name == candidate].reset_index(drop=True)
    negativeDF = dataframe[dataframe.name != candidate].reset_index(drop=True)

    positive_train = positiveDF.sample(frac = 0.75, random_state = random_state)
    # Complement by index: unlike drop_duplicates(keep=False), this keeps
    # speeches whose content happens to be repeated in the data.
    positive_test = positiveDF.drop(positive_train.index)

    # Draw as many negatives as positives on each side (capped by what is
    # available) so both the training and the test set are balanced.
    n_negative_train = min(len(positive_train), len(negativeDF))
    negative_train = negativeDF.sample(n = n_negative_train, random_state = random_state)
    negative_rest = negativeDF.drop(negative_train.index)
    n_negative_test = min(len(positive_test), len(negative_rest))
    negative_test = negative_rest.sample(n = n_negative_test, random_state = random_state)

    X_train = pd.concat([positive_train, negative_train]).reset_index(drop=True)
    X_test = pd.concat([positive_test, negative_test]).reset_index(drop=True)
    Y_train = X_train.name == candidate
    Y_test = X_test.name == candidate
    print("Getting training feature matrix... ")
    X_train = generate_feature_matrix(X_train, dictionary)
    print("Getting testing feature matrix... ")
    X_test = generate_feature_matrix(X_test, dictionary)
    return (X_train, Y_train, X_test, Y_test)

In [5]:
def performance(y_true, y_pred, metric="accuracy"):
    """
    Calculates the performance metric as evaluated on the true labels
    y_true versus the predicted labels y_pred.
    A label equal to 1 (or True) is the positive class; every other value
    (0, False, -1, ...) is treated as the negative class.
    Input:
        y_true: (n,) array containing known labels
        y_pred: (n,) array containing predicted labels, or predicted
                scores when metric is 'auroc'
        metric: string specifying the performance metric (default='accuracy'
                 other options: 'f1-score', 'auroc', 'precision', 'sensitivity',
                 and 'specificity')
    Returns:
        the performance as an np.float64 (NaN when the metric's denominator
        is zero, e.g. for empty input)
    Raises:
        ValueError: if ``metric`` is not one of the supported names
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual_pos = (y_true == 1)

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN instead of raising/warning.
        if denominator == 0:
            return np.float64(np.nan)
        return np.float64(numerator / denominator)

    if metric == "auroc":
        # Probability that a random positive is scored above a random
        # negative, counting ties as one half.
        pos_scores = y_pred[actual_pos].astype(float)
        neg_scores = y_pred[~actual_pos].astype(float)
        diff = pos_scores[:, None] - neg_scores[None, :]
        wins = (diff > 0).sum() + 0.5 * (diff == 0).sum()
        return _ratio(wins, pos_scores.size * neg_scores.size)

    predicted_pos = (y_pred == 1)
    tp = np.logical_and(actual_pos, predicted_pos).sum()
    tn = np.logical_and(~actual_pos, ~predicted_pos).sum()
    fp = np.logical_and(~actual_pos, predicted_pos).sum()
    fn = np.logical_and(actual_pos, ~predicted_pos).sum()

    # (numerator, denominator) for every confusion-matrix based metric.
    ratios = {
        "accuracy": (tp + tn, tp + tn + fp + fn),
        "precision": (tp, tp + fp),
        "sensitivity": (tp, tp + fn),
        "specificity": (tn, tn + fp),
        "f1-score": (2 * tp, 2 * tp + fp + fn),
    }
    if metric not in ratios:
        raise ValueError("unknown metric: " + repr(metric))
    numerator, denominator = ratios[metric]
    return _ratio(numerator, denominator)

In [6]:
def cv(clf, X, y, k=5, metric="accuracy"):
    """
    Splits the data X and the labels y into k-folds and runs k-fold
    cross-validation: for each fold i in 1...k, trains a classifier on
    all the data except the ith fold, and tests on the ith fold.
    Calculates the k-fold cross-validation performance metric for classifier
    clf by averaging the performance across folds.
    Input:
        clf: a classifier exposing fit(X, y) and predict(X); it is refitted
             on every fold
        X: (n,d) array of feature vectors, where n is the number of examples
           and d is the number of features
        y: (n,) array or Series of binary labels
        k: an int specifying the number of folds (default=5)
        metric: string specifying the performance metric, forwarded to
                performance() (default='accuracy')
    Returns:
        average 'test' performance across the k folds as np.float64
    """
    # The fold indices are positional; converting y to an ndarray makes
    # y[index_array] positional too, even when y is a pandas Series whose
    # index is not 0..n-1.
    y = np.asarray(y)
    scores = []
    skf = StratifiedKFold(n_splits = k, shuffle = False)
    for train_index, test_index in skf.split(X, y):
        clf.fit(X[train_index], y[train_index])
        pred = clf.predict(X[test_index])
        scores.append(performance(y[test_index], pred, metric))
    return np.array(scores).mean()

In [7]:
def select_classifier(penalty='l2', c=1.0, degree=1, r=0.0, class_weight='balanced', multi_class = 'ovr'):
    """
    Return a linear svm classifier based on the given
    penalty function and regularization parameter c.
    Input:
        penalty: 'l2' returns an SVC with a linear kernel; any other value
                 is passed as the penalty of a LinearSVC
        c: regularization parameter C
        degree: forwarded to SVC (only used in the 'l2' branch)
        r: forwarded to SVC as coef0 (only used in the 'l2' branch)
        class_weight: class weighting applied by the returned classifier
        multi_class: multi-class strategy for LinearSVC (non-'l2' branch)
    Returns:
        an unfitted SVC or LinearSVC instance
    """
    if penalty == 'l2':
        return SVC(C = c, kernel = 'linear', coef0 = r, degree = degree, class_weight = class_weight)
    # Forward class_weight and multi_class instead of silently dropping them.
    return LinearSVC(penalty = penalty, C = c, dual = False, max_iter = 100000,
                     class_weight = class_weight, multi_class = multi_class)

In [10]:
def select_param_linear(X, y, k=5, metric="accuracy", C_range = [], penalty='l2', class_weight = 'balanced'):
    """
    Sweeps the regularization parameter C of a linear SVM and scores every
    setting with k-fold cross-validation on X, y.
    Input:
        X: (n,d) array of feature vectors, where n is the number of examples
        and d is the number of features
        y: (n,) array of binary labels
        k: int specifying the number of folds (default=5)
        metric: string specifying the performance metric, forwarded to cv()
                (default='accuracy')
        C_range: a non-empty sequence of C values to be searched over
        penalty: penalty forwarded to select_classifier
        class_weight: class weighting forwarded to select_classifier
    Returns:
        a tuple (best_c, best_score): the first C whose average CV score is
        strictly higher than every earlier one, and that score. If no score
        exceeds 0 the result is (C_range[0], 0).
    """
    # Running optimum as a (C, score) pair; indexing C_range up front means
    # an empty range fails immediately.
    best = (C_range[0], 0)
    print("Getting best C parameter... ")
    for candidate_c in C_range:
        model = select_classifier(penalty = penalty, c = candidate_c, class_weight = class_weight)
        mean_score = cv(model, X, y, k, metric)
        if mean_score > best[1]:
            best = (candidate_c, mean_score)

    return best

## Load Data Frame and Dictionary

In [8]:
# Load the cleaned debate transcripts and keep only speeches by candidates.
fname = "../data/transcripts_cleaned.csv"
debates = pd.read_csv(fname)
# .copy() makes the filtered frame independent of the one read from disk:
# extract_dictionary() overwrites its ``text`` column in place, which would
# otherwise be an assignment on a slice (SettingWithCopyWarning).
debates = debates[debates.name != 'Non-candidate'].copy()
print("Getting dictionary... ")
# Word -> column-index vocabulary built from every remaining speech.
dictionary = extract_dictionary(debates)

Getting dictionary... 
Number of unique words: 8937


## Selecting Parameters
### Train a model on each candidate's speeches and return the regularization parameter $C$ that gives the best cross-validated accuracy
You don't need to run this if you'd rather just use my chosen parameter of $C = 0.1$. 

In [9]:
candidates = ['Biden', 'Buttigieg', 'Klobuchar', 'Sanders', 'Warren']

# The sweep over C is slow, so it is disabled by default; set the flag to
# True to re-run it instead of relying on the chosen value of C.
RUN_PARAMETER_SEARCH = False
if RUN_PARAMETER_SEARCH:
    C_range = [10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]
    for candidate in candidates:
        print(candidate + ": ")
        # get_data needs the dataframe and the vocabulary and returns four values.
        X_train, Y_train, X_test, Y_test = get_data(candidate, debates, dictionary)
        print(select_param_linear(X_train, Y_train, k = 5, C_range = C_range, penalty = 'l2'))

'\nfor candidate in candidates:\n    print(candidate + ": ")\n    X_train, Y_train, X_test, Y_test, dictionary = get_data(candidate)\n    C_range = [10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]\n    print(select_param_linear(X_train, Y_train, k = 5, C_range = C_range, penalty = \'l2\'))\n'

## Training Models
### Using the majority-vote best C-parameter, train models on each candidate and get words with high coefficients

In [11]:
# One accumulator list per candidate, filled across the repeated random
# train/test splits below.
words = {candidate: [] for candidate in candidates}
accuracies = {candidate: [] for candidate in candidates}
coefs = {candidate: [] for candidate in candidates}

# Words ordered by insertion, which is also their column index in the
# feature matrix (each word was assigned the dictionary's size when added).
# Built once here rather than for every top word of every model.
vocabulary = list(dictionary)

k = 30
for i in range(k):
    print('Fold ' + str(i) + ': ')
    for candidate in candidates:
        print(candidate + ': ')
        # Fresh random balanced split on every repetition.
        X_train, Y_train, X_test, Y_test = get_data(candidate, debates, dictionary)
        clf = select_classifier(c = 0.1)
        clf.fit(X_train, Y_train)
        pred = clf.predict(X_test)
        accuracies[candidate].append(performance(Y_test, pred))
        coefs[candidate].append(clf.coef_)
        # The ten features with the largest coefficients; the row labels of
        # this frame are the feature (word) indices.
        largest = pd.DataFrame(clf.coef_.T).nlargest(10, columns = 0)
        for feature_index in largest.index:
            words[candidate].append(vocabulary[feature_index])

raining feature matrix... 
Average number of non-zero features: 51.06310679611651
Getting testing feature matrix... 
Average number of non-zero features: 45.5
Klobuchar: 
Getting training feature matrix... 
Average number of non-zero features: 50.200542005420054
Getting testing feature matrix... 
Average number of non-zero features: 49.739495798319325
Sanders: 
Getting training feature matrix... 
Average number of non-zero features: 46.3
Getting testing feature matrix... 
Average number of non-zero features: 39.76923076923077
Warren: 
Getting training feature matrix... 
Average number of non-zero features: 47.19665271966527
Getting testing feature matrix... 
Average number of non-zero features: 41.58552631578947
Fold 10: 
Biden: 
Getting training feature matrix... 
Average number of non-zero features: 45.88740458015267
Getting testing feature matrix... 
Average number of non-zero features: 46.61212121212121
Buttigieg: 
Getting training feature matrix... 
Average number of non-zero feat

## Comparing Candidates
### See which candidates are most distinct from every other candidate and which candidates are most and least similar to each other

In [12]:
# Report each candidate's accuracy averaged over all recorded runs.
for candidate in candidates:
    mean_accuracy = np.mean(accuracies[candidate])
    print(candidate + ' prediction accuracy: ' + str(mean_accuracy))

Biden prediction accuracy: 0.7567676767676769
Buttigieg prediction accuracy: 0.7159090909090907
Klobuchar prediction accuracy: 0.7011204481792718
Sanders prediction accuracy: 0.7496503496503497
Warren prediction accuracy: 0.7293859649122806


In [13]:
# Average each candidate's coefficient vector over all recorded runs.
avg_coefs = {}
for candidate in candidates:
    # coefs[candidate] holds one coef_ array per run; average over the runs
    # and take row 0 to obtain a flat vector.
    avg_coefs[candidate] = np.mean(np.array(coefs[candidate]), axis = 0)[0]

# Pairwise dot products between the averaged coefficient vectors, sized by
# the number of candidates instead of a hard-coded 5.
n_candidates = len(candidates)
alignment_values = np.zeros((n_candidates, n_candidates))
for i, candidate1 in enumerate(candidates):
    for j, candidate2 in enumerate(candidates):
        alignment_values[i, j] = np.dot(avg_coefs[candidate1], avg_coefs[candidate2])

alignments = pd.DataFrame(alignment_values, index = candidates, columns = candidates)
alignments

Unnamed: 0,Biden,Buttigieg,Klobuchar,Sanders,Warren
Biden,6.676468,-0.67397,-0.918452,-0.277564,-0.695145
Buttigieg,-0.67397,5.851765,-0.43959,-0.861142,-0.870195
Klobuchar,-0.918452,-0.43959,4.755316,-0.328117,-0.31409
Sanders,-0.277564,-0.861142,-0.328117,5.427345,-0.695771
Warren,-0.695145,-0.870195,-0.31409,-0.695771,6.448461


In [14]:
# For every candidate, report the other candidate with the highest and the
# lowest alignment value.
for candidate in candidates:
    # Drop the candidate's own row first so that it can never be selected,
    # even if its value ties with another candidate's.
    others = alignments[candidate].drop(candidate)
    most = str(others.idxmax())
    least = str(others.idxmin())
    print(candidate + ' is most similar to ' + most)
    print(candidate + ' is least similar to ' + least)

Biden is most similar to Sanders
Biden is least similar to Klobuchar
Buttigieg is most similar to Klobuchar
Buttigieg is least similar to Warren
Klobuchar is most similar to Warren
Klobuchar is least similar to Biden
Sanders is most similar to Biden
Sanders is least similar to Buttigieg
Warren is most similar to Klobuchar
Warren is least similar to Buttigieg


In [15]:
# Write, per candidate, every collected top word with its averaged
# coefficient. The with-block closes the file even if a write fails, and the
# explicit encoding keeps non-ASCII words writable on every platform.
with open('../candidate_word_weights.txt', 'w', encoding = 'utf-8') as f:
    for candidate in candidates:
        f.write(candidate + ': \n')
        for word in words[candidate]:
            # dictionary[word] is the word's column index in the coefficient vector.
            f.write(word + ': ' + str(avg_coefs[candidate][dictionary[word]]) + '\n')
        f.write('\n')