In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "SGDspeeddating"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [24]:
import pandas as pd

In [25]:
# for reading in the data from a csv into a pandas dataframe
def load_data():
    return pd.read_csv('../Dataset/speeddating.csv')

In [181]:
# read the data into speeddating
speeddating = load_data()

In [187]:
# check out the first 5 values
#speeddating.head()

In [183]:
# shows where null values exist and other cool info
speeddating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 42 columns):
gender                           8378 non-null int64
d_age                            8378 non-null int64
samerace                         8378 non-null int64
importance_same_race             8299 non-null float64
field                            8315 non-null object
pref_o_attractive                8289 non-null float64
pref_o_sincere                   8289 non-null float64
pref_o_intelligence              8289 non-null float64
pref_o_funny                     8280 non-null float64
pref_o_ambitious                 8271 non-null float64
pref_o_shared_interests          8249 non-null float64
attractive_o                     8166 non-null float64
sinsere_o                        8091 non-null float64
intelligence_o                   8072 non-null float64
funny_o                          8018 non-null float64
ambitous_o                       7656 non-null float64
shared_interests_o  

In [184]:
# plug in an feature name to see how often each possible feature value occurs
#speeddating[""].value_counts()

In [188]:
# some useful stats on the dataset
#speeddating.describe()

In [139]:
# make histograms for all the features
#speeddating.hist(bins=50, figsize=(20,15))
#plt.show()

In [138]:
# make a single histogram for a particular feature
#speeddating[""].hist(bins=50, figsize=(20,15))
#plt.show()

In [189]:
# loop by column 
#for col in speeddating:
#    print (col)

In [186]:
# Replace NULL values with the median
for col in speeddating:
    # field feature doesn't have a median - it's an object
    if(col != 'field'):
        median = speeddating[col].median()
        speeddating[col].fillna(median, inplace=True)
speeddating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 42 columns):
gender                           8378 non-null int64
d_age                            8378 non-null int64
samerace                         8378 non-null int64
importance_same_race             8378 non-null float64
field                            8315 non-null object
pref_o_attractive                8378 non-null float64
pref_o_sincere                   8378 non-null float64
pref_o_intelligence              8378 non-null float64
pref_o_funny                     8378 non-null float64
pref_o_ambitious                 8378 non-null float64
pref_o_shared_interests          8378 non-null float64
attractive_o                     8378 non-null float64
sinsere_o                        8378 non-null float64
intelligence_o                   8378 non-null float64
funny_o                          8378 non-null float64
ambitous_o                       8378 non-null float64
shared_interests_o  

In [None]:
# STUFF THAT WONT WORK (see trying to compute mean)

In [82]:
speeddating = np.genfromtxt('../Dataset/sd.csv', delimiter=',')

In [83]:
speeddating.shape

(8379, 40)

In [84]:
speeddating

array([[nan, nan, nan, ..., nan, nan, nan],
       [ 6.,  0.,  2., ...,  6.,  0.,  0.],
       [ 1.,  0.,  2., ...,  5.,  1.,  0.],
       ...,
       [ 4.,  0.,  1., ...,  5.,  0.,  0.],
       [ 3.,  0.,  1., ...,  5.,  0.,  0.],
       [ 3.,  0.,  1., ...,  5.,  0.,  0.]])

In [85]:
sd = speeddating[1:]

In [86]:
sd

array([[ 6.,  0.,  2., ...,  6.,  0.,  0.],
       [ 1.,  0.,  2., ...,  5.,  1.,  0.],
       [ 1.,  1.,  2., ..., nan,  1.,  1.],
       ...,
       [ 4.,  0.,  1., ...,  5.,  0.,  0.],
       [ 3.,  0.,  1., ...,  5.,  0.,  0.],
       [ 3.,  0.,  1., ...,  5.,  0.,  0.]])

In [87]:
arr = sd

In [88]:
for i in range(0,40):
    median = arr[...,i].mean()
    print(median)

4.1856051563619
0.3957985199331583
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
0.16471711625686322


In [20]:
mask = np.isnan(arr).any(axis=1)

In [21]:
arr = arr[~mask]

In [22]:
arr

array([[6., 0., 2., ..., 6., 0., 0.],
       [1., 0., 2., ..., 5., 1., 0.],
       [2., 0., 2., ..., 6., 0., 1.],
       ...,
       [1., 0., 4., ..., 1., 0., 0.],
       [2., 0., 4., ..., 1., 0., 0.],
       [1., 0., 4., ..., 1., 0., 0.]])

In [23]:
arr.shape

(1048, 40)