In [None]:
from sklearn import svm
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from more_itertools import powerset
# Global matplotlib defaults applied to every figure below: marker size 30 and font size 12.
plt.rcParams.update({'lines.markersize': 30, 'font.size': 12})

# Number of row groups k_fold_validation divides the data into (also its loop count).
NUM_VALIDATION_GROUPS = 5
# Name of the label column read by train_svm and score_svm.
GROUND_TRUTH = 'Survived'
# NOTE(review): ALL_FEATURES and SELECTED_FEATURES are not referenced in the visible cells —
# confirm whether they are still needed.
ALL_FEATURES = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']
SELECTED_FEATURES = ['Pclass', 'Sex', 'Embarked']
# Feature pool whose subsets score_all_feature_combinations evaluates.
SELECTED_FEATURES_2 = ['Pclass', 'Sex', 'Embarked', 'Age', 'SibSp', 'Parch', 'Fare', 'Ticket'] # Ticket makes it slow
# When True, debug() prints; when False, debug() is silent.
DEBUG = False

In [None]:
# Switch checked by the `if NOTEBOOK_MODE:` guards that invoke each task after its definition.
NOTEBOOK_MODE = True

In [None]:
def task_2():
    """Draw a two-panel figure of four labelled points before and after a feature map.

    The left panel plots the points in (x1, x2). The right panel plots the same
    points in (x1, x1*x2) and overlays a dashed horizontal line at x1x2 = 0 as
    the separator. The figure is shown with ``plt.show()``; nothing is returned.
    """
    x1_values = (-1, 1)
    pos_x2 = (1, -1)
    neg_x2 = (-1, 1)

    axis_limits = (-1.5, 1.5)
    tick_marks = (-1, 1)

    def draw_panel(position, pos_y, neg_y, title, y_label):
        # Activate the requested panel and style it; both panels share the same
        # x-coordinates, limits and ticks and differ only in y-values and labels.
        plt.subplot(1, 2, position)
        plt.subplots_adjust(wspace=0.4, hspace=0.4)

        plt.scatter(x1_values, pos_y, marker='+')
        plt.scatter(x1_values, neg_y, marker='_')

        plt.title(title)
        plt.xlabel('x1')
        plt.ylabel(y_label)

        plt.xlim(axis_limits)
        plt.ylim(axis_limits)

        plt.xticks(tick_marks)
        plt.yticks(tick_marks)

    plt.figure(figsize=(10, 5))

    # Left panel: original coordinates (x1, x2).
    draw_panel(1, pos_x2, neg_x2, 'Before Kernel', 'x2')

    # Right panel: second coordinate replaced by the product x1 * x2.
    mapped_pos = tuple(a * b for a, b in zip(x1_values, pos_x2))
    mapped_neg = tuple(a * b for a, b in zip(x1_values, neg_x2))
    draw_panel(2, mapped_pos, mapped_neg, 'After Kernel (with separator)', 'x1x2')

    # Separator drawn last so it lands on the right-hand panel.
    plt.plot((-2, 2), (0, 0), linestyle='dashed', color='black')

    plt.show()
    
# Render the Task 2 figure only when the notebook switch is on.
if NOTEBOOK_MODE:
    task_2()

In [None]:
def task_5():
    """Scatter six labelled 2-D points and overlay three dashed parallel lines.

    Three points are labelled '+' and three '-'. The black dashed line runs
    between two gray dashed lines. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    points = pd.DataFrame({
        'x1': [1, 2, 2, 0, 1, 0],
        'x2': [1, 2, 0, 0, 0, 1],
        'class': list('+++---'),
    })

    # One scatter series per class, '+' first, so the default colour cycle is unchanged.
    for label, marker in (('+', '+'), ('-', '_')):
        members = points[points['class'] == label]
        plt.scatter(members['x1'], members['x2'], marker=marker)

    # Three parallel lines of slope -1, each shifted down by `drop` from the top one.
    for drop, colour in ((0, 'gray'), (0.5, 'black'), (1, 'gray')):
        plt.plot((-1, 3), (3 - drop, -1 - drop), linestyle='dashed', color=colour)

    plt.axis('square')

    view = (-0.5, 2.5)
    plt.xlim(view)
    plt.ylim(view)

    plt.xlabel('x1')
    plt.ylabel('x2')

    plt.show()
    
# Render the Task 5 figure only when the notebook switch is on.
if NOTEBOOK_MODE:
    task_5()

In [None]:
def task_6():
    """Plot one '+' point and two '_' points in the (sqrt(2)*x, x^2) plane.

    A dashed black horizontal line at height 0.5 is drawn between the '+' point
    at the origin and the two '_' points at height 1. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    negative_x = (1.41, -1.41)
    negative_y = (1, 1)

    plt.scatter(0, 0, marker='+')
    plt.scatter(negative_x, negative_y, marker='_')

    # Horizontal line extends beyond the visible x-range so it spans the whole plot.
    plt.plot((-3, 3), (0.5, 0.5), linestyle='dashed', color='black')

    plt.axis('square')

    plt.xlim((-2, 2))
    plt.ylim((-0.5, 1.5))

    plt.xlabel('sqrt(2) * x')
    plt.ylabel('x^2')

    plt.show()
    
# Render the Task 6 figure only when the notebook switch is on.
if NOTEBOOK_MODE:
    task_6()

In [None]:
# Copied from HW2
def titanic_preprocess(file):
    """Load a Titanic CSV and encode its text columns as numbers.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``. The CSV must
            contain the columns Name, Cabin, Ticket, Sex, Embarked and Age.

    Returns:
        A DataFrame in which:
          * Name and Cabin are removed;
          * Ticket is the integer formed by the trailing digits of the ticket
            (0 when the ticket does not end in digits);
          * Sex is 0 for 'male' and 1 for anything else;
          * Embarked is 2 for 'C', 1 for 'Q' and 0 for anything else;
          * missing Age values are replaced by the mean of the known ages,
            rounded to two decimals.

    Raises:
        KeyError: if any of the required columns is absent.
    """
    df = pd.read_csv(file)
    df = df.drop(columns=['Name', 'Cabin'])

    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    trailing_digits = re.compile(r'\d+$')
    # str() keeps this working when every ticket is numeric and pandas has
    # therefore parsed the column as integers rather than strings.
    matches = [trailing_digits.search(str(ticket)) for ticket in df['Ticket']]
    df['Ticket'] = [int(m.group(0)) if m is not None else 0 for m in matches]

    df['Sex'] = [0 if sex == 'male' else 1 for sex in df['Sex']]

    # Any port other than C or Q (including a missing value) maps to 0.
    port_codes = {'C': 2, 'Q': 1}
    df['Embarked'] = [port_codes.get(port, 0) for port in df['Embarked']]

    missing_age = df['Age'].isnull()
    average_age = round(np.average(df.loc[~missing_age, 'Age']), 2)
    df.loc[missing_age, 'Age'] = average_age
    return df


# Adapted from HW2
def k_fold_validation(df, features, kernel):
    """Estimate SVM accuracy with k-fold cross-validation.

    The rows of ``df`` are divided, in their existing order, into
    ``NUM_VALIDATION_GROUPS`` contiguous folds. For each fold a model is trained
    on all *other* rows via ``train_svm`` and scored on the held-out fold via
    ``score_svm``.

    Args:
        df: DataFrame holding the feature columns and the label column.
        features: List of column names to use as model inputs.
        kernel: Kernel name forwarded to ``train_svm``.

    Returns:
        The mean of the per-fold scores.

    Raises:
        ValueError: if ``df`` has fewer rows than ``NUM_VALIDATION_GROUPS``, so
            at least one fold would be empty.
    """
    total_rows = df.shape[0]
    if total_rows < NUM_VALIDATION_GROUPS:
        raise ValueError(
            f'Need at least {NUM_VALIDATION_GROUPS} rows for '
            f'{NUM_VALIDATION_GROUPS}-fold validation, got {total_rows}'
        )

    # array_split spreads any remainder over the first folds, so every row is
    # held out exactly once.
    folds = np.array_split(np.arange(total_rows), NUM_VALIDATION_GROUPS)

    scores = []

    for held_out in tqdm(folds, desc=f'Validating {kernel}', disable=True):
        in_fold = np.zeros(total_rows, dtype=bool)
        in_fold[held_out] = True

        # Positional (iloc) selection works regardless of the frame's index
        # labels and keeps the rows in their original order.
        train = df.iloc[~in_fold]
        val = df.iloc[in_fold]

        my_svm = train_svm(train, features, kernel)
        scores.append(score_svm(val, my_svm, features))

    return np.mean(scores)

# Adapted from HW2
def train_svm(df, features, kernel):
    """Fit a support-vector classifier on the given feature columns.

    Args:
        df: DataFrame containing ``features`` and the ``GROUND_TRUTH`` label column.
        features: List of column names used as model inputs.
        kernel: One of 'linear', 'poly' (degree 2) or 'rbf' (gamma 0.7).
            Every kernel is trained with C=1.0.

    Returns:
        The fitted ``svm.SVC`` instance.

    Raises:
        ValueError: if ``kernel`` is not one of the supported names.
    """
    # Extra SVC keyword arguments per supported kernel.
    kernel_params = {
        'linear': {},
        'poly': {'degree': 2},
        'rbf': {'gamma': 0.7},
    }
    # Fail early with a clear message instead of reaching the return with no
    # model assigned.
    if kernel not in kernel_params:
        raise ValueError(
            f'Unsupported kernel {kernel!r}; expected one of {sorted(kernel_params)}'
        )

    x = df[features]
    y = df[GROUND_TRUTH]

    debug(f'Training {kernel}')

    my_svm = svm.SVC(kernel=kernel, C=1.0, **kernel_params[kernel]).fit(x, y)

    debug('Training complete')

    return my_svm


# Adapted from HW2
def score_svm(df, my_svm, features):
    """Score a fitted model against the labelled rows of ``df``.

    Args:
        df: DataFrame containing ``features`` and the ``GROUND_TRUTH`` label column.
        my_svm: Fitted model exposing ``score(x, y)``.
        features: List of column names used as model inputs.

    Returns:
        Whatever ``my_svm.score`` returns for the selected inputs and labels.
    """
    inputs, labels = df[features], df[GROUND_TRUTH]

    debug('Scoring')
    result = my_svm.score(inputs, labels)
    debug('Scoring complete')

    return result


def score_all_feature_combinations(df, method, features=None):
    """Cross-validate every non-empty subset of a feature pool and rank the results.

    Args:
        df: DataFrame passed through to ``k_fold_validation``.
        method: Kernel name passed through to ``k_fold_validation``.
        features: Optional iterable of column names forming the pool of
            candidate features. Defaults to ``SELECTED_FEATURES_2``.

    Returns:
        A list of ``(combination, score)`` tuples sorted by score, best first,
        where ``combination`` is a list of column names. Subsets with equal
        scores keep the order in which ``powerset`` generated them, so the
        ranking is reproducible from run to run.
    """
    if features is None:
        features = SELECTED_FEATURES_2

    # dict.fromkeys removes duplicate subsets while preserving generation order;
    # a plain set would iterate in hash-randomized order and reshuffle ties on
    # every run. The empty subset is skipped because there is nothing to train on.
    candidates = [list(subset) for subset in dict.fromkeys(powerset(features)) if subset]

    combination_with_score = [
        (combination, k_fold_validation(df, combination, method))
        for combination in tqdm(candidates, desc="Finding Best Feature Combination")
    ]

    # list.sort is stable, also with reverse=True, so ties stay in generation order.
    combination_with_score.sort(key=lambda pair: pair[1], reverse=True)

    return combination_with_score


def debug(*args, **kwargs):
    """Print like ``print`` when the module-level ``DEBUG`` flag is truthy.

    Args:
        *args: Positional arguments forwarded to ``print``.
        **kwargs: Keyword arguments forwarded to ``print`` (e.g. ``sep``, ``end``).
    """
    if DEBUG:
        # Unpack so the message is printed as text rather than as the repr of
        # an argument tuple followed by a dict.
        print(*args, **kwargs)


def task_7(train_path='titanic/train.csv', kernels=('rbf', 'poly', 'linear'), top_n=5):
    """Print the best-scoring feature combinations for each SVM kernel.

    Args:
        train_path: CSV file handed to ``titanic_preprocess``.
        kernels: Kernel names to evaluate, in the order they are reported.
        top_n: How many of the highest-scoring combinations to print per kernel.

    Only the training file is loaded: every score comes from cross-validation
    on that data, so no other file is read.
    """
    train = titanic_preprocess(train_path)

    for kernel in kernels:
        combos = score_all_feature_combinations(train, kernel)
        print(f'Best {top_n} combos for {kernel}:')
        for combo, score in combos[:top_n]:
            print(f'    {score:.3}: {", ".join(combo)}')
    
    

# Run the Task 7 feature/kernel search only when the notebook switch is on,
# then print a completion marker.
if NOTEBOOK_MODE:
    task_7()
    print('done')

In [None]:
# NOTE(review): prints a fixed string and nothing else — looks like a leftover scratch cell; consider removing.
print('hello')