In [1]:
import math
from copy import deepcopy
from collections import Counter
from io import StringIO
from operator import itemgetter, attrgetter

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

import pydot_ng as pydot
from IPython.display import Image
from tabulate import tabulate

# Data Processing

## Load Data

In [2]:
# Load the raw speed-dating CSV; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('./speed-dating-experiment/Speed Dating Data.csv', encoding="ISO-8859-1")

## Analyze Data

In [3]:
# Overview of the frame: number of rows/columns, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [4]:
# Columns that pandas loaded with dtype ``object`` (i.e. not parsed as numbers);
# these need cleaning or dropping before they can be used as numeric features.

unparseable_cols = df.select_dtypes(include=['object']).columns
unparseable_cols

Index(['field', 'undergra', 'mn_sat', 'tuition', 'from', 'zipcode', 'income',
       'career'],
      dtype='object')

In [5]:
# Number of null values in each column, sparsest columns first (top ten shown).

cols_to_null_nums = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
)
cols_to_null_nums[:10]

num_in_3    7710
numdat_3    6882
expnum      6578
sinc7_2     6423
amb7_2      6423
shar7_2     6404
intel7_2    6394
fun7_2      6394
attr7_2     6394
amb5_3      6362
dtype: int64

In [6]:
# Number of null values in each row, sparsest rows first (top ten shown).

rows_to_null_nums = (
    df.isnull()
    .sum(axis=1)
    .sort_values(ascending=False)
)
rows_to_null_nums[:10]

842    172
847    172
845    172
839    172
843    172
838    171
324    158
841    155
846    154
840    154
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()



Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
count,8378.0,8377.0,8378.0,8378.0,8378.0,8378.0,8378.0,8378.0,6532.0,8378.0,...,3974.0,3974.0,3974.0,3974.0,3974.0,2016.0,2016.0,2016.0,2016.0,2016.0
mean,283.675937,8.960248,0.500597,17.327166,1.828837,11.350919,16.872046,9.042731,9.295775,8.927668,...,7.240312,8.093357,8.388777,7.658782,7.391545,6.81002,7.615079,7.93254,7.155258,7.048611
std,158.583367,5.491329,0.500029,10.940735,0.376673,5.995903,4.358458,5.514939,5.650199,5.477009,...,1.576596,1.610309,1.459094,1.74467,1.961417,1.507341,1.504551,1.340868,1.672787,1.717988
min,1.0,1.0,0.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,...,2.0,2.0,3.0,2.0,1.0,2.0,2.0,4.0,1.0,1.0
25%,154.0,,0.0,8.0,2.0,7.0,14.0,4.0,,4.0,...,,,,,,,,,,
50%,281.0,,1.0,16.0,2.0,11.0,18.0,8.0,,8.0,...,,,,,,,,,,
75%,407.0,,1.0,26.0,2.0,15.0,20.0,13.0,,13.0,...,,,,,,,,,,
max,552.0,22.0,1.0,44.0,2.0,21.0,22.0,22.0,22.0,22.0,...,12.0,12.0,12.0,12.0,12.0,10.0,10.0,10.0,10.0,10.0


## Data Cleaning

In [8]:
# Remove the commas embedded in these string columns (presumably thousands
# separators) so that the values can be cast to float.
COLUMNS_WITH_COMMA = ['income', 'zipcode', 'tuition', 'mn_sat']

for col in COLUMNS_WITH_COMMA:
    df[col] = (
        df[col]
        .replace(',', '', regex=True)
        .astype(float)
    )

### Handle String Data

In [9]:
# NOTE(review): the constant's name suggests these string columns already have
# numeric encodings elsewhere in the data set, which would make them redundant
# -- confirm against the data dictionary.
# The drop is in place, so re-running this cell fails once the columns are gone.
ENCODED_STRING_COLUMNS = ['career', 'field']
df.drop(ENCODED_STRING_COLUMNS, axis=1, inplace=True)

In [10]:
# Remaining string columns; they are compared between the two sides of the
# join below and then dropped from ``df``.
OTHER_STRING_COLUMNS = ['undergra', 'from']

# Self-join of df: each row keyed by (iid, id, pid, partner) is matched with
# the row whose (pid, partner, iid, id) carry the same values -- presumably the
# same date recorded from the partner's point of view.
# how='left' keeps every row of df even when no mirrored row is found.
# Overlapping column names receive pandas' default suffixes:
# ``_x`` for the left (own) row and ``_y`` for the right (partner) row.
joined_df = df.merge(
    df,
    how='left',
    left_on=['iid', 'id', 'pid', 'partner'],
    right_on=['pid', 'partner', 'iid', 'id']
)

In [11]:
# True when both sides of the join report the same 'from' value. NaN never
# equals NaN, so rows with missing values get False.
# NOTE(review): the assignment aligns on index, so it assumes joined_df has
# exactly one row per row of df, in the same order -- confirm.
df['same_from'] = joined_df['from_x'] == joined_df['from_y']

In [12]:
# True when both sides of the join report the same 'undergra' value. NaN never
# equals NaN, so rows with missing values get False.
# NOTE(review): the assignment aligns on index, so it assumes joined_df has
# exactly one row per row of df, in the same order -- confirm.
df['same_undergra'] = joined_df['undergra_x'] == joined_df['undergra_y']

In [13]:
# The raw string columns are dropped now that the boolean ``same_*`` features
# exist. The drop is in place, so this cell cannot be re-run without reloading df.
df.drop(OTHER_STRING_COLUMNS, axis=1, inplace=True)

---

# Method Implementation

## KNN

### Custom Implementation

In [14]:
def eucldn_similarity(v1, v2):
    """
    Similarity derived from the Euclidean distance between two vectors.

    The closer two vectors are, the smaller their distance, so the reciprocal
    of the distance is returned: a larger result means "more alike".

    :param v1: first vector, an indexable sequence of numbers
    :param v2: second vector, indexed at every position of ``v1``
    :return: 1 / distance(v1, v2); ``float('inf')`` when the distance is zero
    """
    # Summing squared differences directly is never negative, unlike the
    # expanded form xx + yy - 2*xy, which can dip below zero through rounding
    # and make math.sqrt raise.
    squared_distance = 0.0
    for i in range(len(v1)):
        diff = v1[i] - v2[i]
        squared_distance += diff * diff

    # Identical vectors are maximally similar; avoid dividing by zero.
    if squared_distance == 0:
        return float('inf')
    return 1 / math.sqrt(squared_distance)

In [15]:
def cos_similarity(v1, v2):
    """
    Cosine of the angle between two vectors.

    The more alike two vectors are, the smaller the angle between them and
    the larger the returned value.

    :param v1: first vector, an indexable sequence of numbers
    :param v2: second vector, indexed at every position of ``v1``
    :return: cos(v1, v2), between -1 and 1 (larger means more alike);
             0.0 when either vector has zero length, since the angle is
             undefined in that case
    """
    sum_xx, sum_xy, sum_yy = 0.0, 0.0, 0.0
    for i in range(len(v1)):
        x, y = v1[i], v2[i]
        sum_xx += x * x
        sum_xy += x * y
        sum_yy += y * y

    norm_product = math.sqrt(sum_xx * sum_yy)
    # An all-zero vector would otherwise trigger a division by zero.
    if norm_product == 0:
        return 0.0
    return sum_xy / norm_product

In [16]:
def prediction(nb_list):
    """
    Majority vote over the labels of the nearest neighbours.

    :param nb_list: non-empty sequence of tuples whose last element is the
                    neighbour's (hashable) label, e.g. ``(similarity, label)``
    :return: the most frequent label; ties go to the label seen first
    :raises IndexError: if ``nb_list`` is empty
    """
    # Count labels only. Seeding the counter with the neighbour tuples
    # themselves would let a whole (similarity, label) tuple win the vote
    # whenever no label occurs more than once (e.g. k == 1).
    votes = Counter(neighbour[-1] for neighbour in nb_list)
    return votes.most_common(1)[0][0]

In [17]:
def knn_classify(test_X, train_X, train_y, k=5, similarity=None):
    """
    Classify one sample by majority vote among its k most similar neighbours.

    :param test_X: feature vector of the sample to classify
    :param train_X: iterable of training feature vectors
    :param train_y: iterable of training labels, parallel to ``train_X``;
                    each label must be iterable because it is turned into a tuple
    :param k: number of neighbours that take part in the vote
    :param similarity: callable ``(v1, v2) -> number`` where larger means more
                       alike; defaults to ``cos_similarity``
    :return: the label (as a tuple) chosen by ``prediction``
    """
    if similarity is None:
        similarity = cos_similarity

    neighbours = [
        (similarity(test_X, feature), tuple(label))
        for feature, label in zip(train_X, train_y)
    ]
    # Most similar first; equal similarities fall back to comparing label tuples.
    neighbours.sort(reverse=True)
    return prediction(neighbours[:k])

In [18]:
def knn(train_X, test_X, train_y, test_y, n_neighbors=5):
    """
    Accuracy of the hand-written k-nearest-neighbours classifier.

    Every sample of ``test_X`` is classified with ``knn_classify`` against the
    training data, and the prediction is compared with the matching entry of
    ``test_y`` (converted to a tuple).

    :return: number of correct predictions divided by ``len(test_X)``
    """
    hits = sum(
        1
        for sample, truth in zip(test_X, test_y)
        if knn_classify(sample, train_X, train_y, n_neighbors) == tuple(truth)
    )
    return hits / len(test_X)

## Sklearn

In [19]:
def knn_with_sklearn(train_X, test_X, train_y, test_y,
                     n_neighbors=5):
    """
    Fit scikit-learn's ``KNeighborsClassifier`` on the training split and
    return its accuracy on the test split.

    :param train_X: training features
    :param test_X: test features
    :param train_y: training labels
    :param test_y: test labels
    :param n_neighbors: number of neighbours handed to the classifier
    :return: the value of ``accuracy_score`` for the test predictions
    """
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(train_X, train_y)
    pred = clf.predict(test_X)
    # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
    return accuracy_score(test_y, pred)

# Decision Tree

## Custom Implementation

In [20]:
def try_split(train_X, index, value):
    """
    Partition rows on a single feature.

    :param train_X: iterable of indexable rows
    :param index: position of the feature inside each row
    :param value: split point
    :return: ``(left, right)`` where ``left`` holds the rows with
             ``row[index] < value`` and ``right`` holds all other rows,
             both in their original order
    """
    below, rest = [], []
    for row in train_X:
        bucket = below if row[index] < value else rest
        bucket.append(row)
    return below, rest

In [21]:
def gini():
    """Placeholder (presumably for a Gini impurity); not implemented, always 0.0."""
    return 0.0

In [22]:
def gini_index():
    """Placeholder (presumably for the Gini index of a split); not implemented, always 0.0."""
    return 0.0

In [23]:
def determine_split(tr):
    """Placeholder for choosing a split; ignores ``tr`` and always returns 0."""
    return 0

In [24]:
def train_dt(train_X, train_y, min_leaf):
    """Placeholder for decision-tree training; ignores its arguments and always returns 0."""
    return 0

In [25]:
def dt(train_X, test_X, train_y, test_y, min_samples_leaf=5):
    """
    Unfinished hand-written decision tree.

    No tree is trained and no accuracy is computed yet: ``train_X`` is split
    on feature 0 at the value 60 with ``try_split`` and only the sizes of the
    two partitions are reported. The remaining arguments are unused.

    :return: ``(number of rows below the split, number of remaining rows)``
    """
    below, rest = try_split(train_X, 0, 60)
    return len(below), len(rest)

In [26]:
def train_test(df):
    """Split ``df`` with ``split_df`` and print what the hand-written ``dt`` returns."""
    train_X, test_X, train_y, test_y = split_df(df)
    print('DT Accurary: ', dt(train_X, test_X, train_y, test_y))

In [27]:
#baseline_df = deepcopy(df)
#train_test(baseline_df)

## Sklearn

In [28]:
def dt_with_sklearn(train_X, test_X, train_y, test_y,
                    min_samples_leaf=5):
    """
    Fit scikit-learn's ``DecisionTreeClassifier`` on the training split and
    return its accuracy on the test split.

    :param train_X: training features
    :param test_X: test features
    :param train_y: training labels
    :param test_y: test labels
    :param min_samples_leaf: minimum number of samples per leaf, handed to the classifier
    :return: the value of ``accuracy_score`` for the test predictions
    """
    clf = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    clf.fit(train_X, train_y)
    pred = clf.predict(test_X)
    # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
    return accuracy_score(test_y, pred)

In [29]:
# Visualize Decision Tree

# dot_data = StringIO()
# tree.export_graphviz(clf, out_file=dot_data,
#                      feature_names=df_X.columns, class_names=True)
# graph = pydot.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

## Split Data Method

In [30]:
# Target columns predicted by every model.
LABELS = ['dec', 'dec_o']
# Columns removed from the data before it is split into features and targets.
REDUDANT_COLUMNS = ['match']


def split_df(df,
             default_na=0, test_size=0.2, random_state=1):
    """
    Turn a data frame into train/test feature and label arrays.

    Missing values are replaced by ``default_na``, the columns listed in
    ``REDUDANT_COLUMNS`` are removed, the ``LABELS`` columns become the
    targets and every other column becomes a feature.

    :param df: input frame; it is not modified
    :param default_na: replacement for missing values
    :param test_size: forwarded to ``train_test_split``
    :param random_state: forwarded to ``train_test_split``
    :return: the result of ``train_test_split``, i.e.
             ``train_X, test_X, train_y, test_y``
    """
    filled = df.fillna(default_na)
    usable = filled.drop(REDUDANT_COLUMNS, axis=1)

    features = np.array(usable.drop(LABELS, axis=1))
    targets = np.array(usable[LABELS])
    return train_test_split(features, targets,
                            test_size=test_size, random_state=random_state)

## Training Method

In [31]:
def train_with_all_methods(df):
    """
    Split ``df`` once and print the accuracy of each classifier on that split:
    scikit-learn KNN, scikit-learn decision tree, then the hand-written KNN.
    """
    train_X, test_X, train_y, test_y = split_df(df)
    print('Sklearn KNN Accurary: ',
          knn_with_sklearn(train_X, test_X, train_y, test_y))
    print('Sklearn DT Accurary: ',
          dt_with_sklearn(train_X, test_X, train_y, test_y))
    print('KNN Accurary: ',
          knn(train_X, test_X, train_y, test_y))
    # The hand-written decision tree (``dt``) is deliberately not reported here.

---

# Baseline

In [None]:
# Work on an independent copy so that this experiment cannot modify ``df``.
baseline_df = deepcopy(df)

## Data Processing

## Training

In [None]:
# Baseline: evaluate all classifiers on the cleaned data without extra feature work.
train_with_all_methods(baseline_df)

Sklearn KNN Accurary:  0.433770883055
Sklearn DT Accurary:  0.507159904535


---

# Drop columns or rows with too many nulls

In [None]:
# Work on an independent copy so that the in-place drops below cannot modify ``df``.
df_drop_nan = deepcopy(df)

## Data Processing

In [None]:
# Drop rows: keep only the rows that have at least ROW_THREASHOLD non-null values.

ROW_THREASHOLD = 100

df_drop_nan.dropna(axis=0, thresh=ROW_THREASHOLD, inplace=True);

In [None]:
# Drop columns: keep only the columns that have at least COL_THREASHOLD non-null
# values (counted over the rows that survived the row filter above).

COL_THREASHOLD = 4000

df_drop_nan.dropna(axis=1, thresh=COL_THREASHOLD, inplace=True);

## Training

In [None]:
# Evaluate all classifiers after removing sparse rows and columns.
train_with_all_methods(df_drop_nan)

---

# Hobby Similarity

In [None]:
# Work on an independent copy so that the hobby feature engineering cannot modify ``df``.
df_hs = deepcopy(df)

## Data Processing

In [None]:
# Columns treated as hobby ratings; each is compared between the ``_x`` and
# ``_y`` sides of ``joined_df`` and the originals are later dropped from df_hs.
HOBBY_COLUMNS = [
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art',
    'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
    'movies', 'concerts', 'music', 'shopping', 'yoga'
]

# NOTE(review): module-level threshold; the visible call to
# count_hobby_similarity passes the literal 1 instead of this name.
similiar_thres = 1

In [None]:
def count_hobby_similarity(joined_df,
                           similiar_thres=1,
                           na_default=-1,
                           hobby_columns=None):
    """
    Count, per row, how many hobbies both sides of the join rate similarly.

    A hobby counts as similar when both ratings are present and their
    absolute difference is at most ``similiar_thres``. A rating missing on
    either side never counts: filling gaps with a sentinel would make two
    missing ratings look identical.

    :param joined_df: frame holding ``<hobby>_x`` and ``<hobby>_y`` columns
    :param similiar_thres: largest absolute rating difference still counted as similar
    :param na_default: unused; kept so existing calls stay valid
    :param hobby_columns: hobby base names to compare; defaults to ``HOBBY_COLUMNS``
    :return: Series with the number of similar hobbies per row
             (the integer 0 when no hobby columns are given)
    """
    if hobby_columns is None:
        hobby_columns = HOBBY_COLUMNS

    similar_flags = []
    for hobby in hobby_columns:
        gap = (joined_df[hobby + '_x'] - joined_df[hobby + '_y']).abs()
        # A NaN gap compares False, so a missing rating contributes 0.
        similar_flags.append((gap <= similiar_thres).astype(int))
    return sum(similar_flags)

In [None]:
# New feature: number of hobbies rated similarly by both sides of the join.
# NOTE(review): the assignment aligns on index, so it assumes joined_df has
# exactly one row per row of df_hs, in the same order -- confirm.
df_hs['sim_hob_num'] = count_hobby_similarity(joined_df, similiar_thres=1)

In [None]:
# Remove the individual hobby ratings so that only the aggregated count remains.
# The drop is in place, so this cell cannot be re-run without rebuilding df_hs.
df_hs.drop(HOBBY_COLUMNS, axis=1, inplace=True)

## Training

In [None]:
# Evaluate all classifiers with the hobby ratings replaced by the similarity count.
train_with_all_methods(df_hs)