In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs:
# seeds NumPy's global random number generator with a fixed value
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: save_fig() writes to
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "SGDspeeddating"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``. The target
    directory is created first if it does not exist yet, so the call also
    works in a fresh checkout.

    Parameters
    ----------
    fig_id : str
        Base name of the output file, without the ``.png`` extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    """
    out_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # plt.savefig does not create missing folders; without this the call
    # fails when images/<CHAPTER_ID> has not been created by hand.
    # (isdir + makedirs keeps this working on Python 2 as well.)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    path = os.path.join(out_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [2]:
import pandas as pd

In [3]:
# for reading in the data from a csv into a pandas dataframe
def load_data(csv_path='../Dataset/speeddating.csv'):
    """Read the speed-dating CSV into a pandas DataFrame.

    Parameters
    ----------
    csv_path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so a bare ``load_data()`` call behaves as before.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from ``csv_path``.
    """
    return pd.read_csv(csv_path)

In [49]:
# read the CSV (via load_data) into the `speeddating` DataFrame
speeddating = load_data()

In [187]:
# check out the first 5 values
#speeddating.head()

In [50]:
# summarize the frame: number of rows, per-column non-null counts
# (i.e. where null values exist) and each column's dtype
speeddating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 42 columns):
gender                           8378 non-null int64
d_age                            8378 non-null int64
samerace                         8378 non-null int64
importance_same_race             8299 non-null float64
field                            8315 non-null object
pref_o_attractive                8289 non-null float64
pref_o_sincere                   8289 non-null float64
pref_o_intelligence              8289 non-null float64
pref_o_funny                     8280 non-null float64
pref_o_ambitious                 8271 non-null float64
pref_o_shared_interests          8249 non-null float64
attractive_o                     8166 non-null float64
sinsere_o                        8091 non-null float64
intelligence_o                   8072 non-null float64
funny_o                          8018 non-null float64
ambitous_o                       7656 non-null float64
shared_interests_o  

In [12]:
# plug in a feature name to see how often each possible feature value occurs
#speeddating["field"].value_counts()

In [63]:
# some useful stats on the dataset
#speeddating.describe()

In [17]:
# make histograms for all the features
#speeddating.hist(bins=50, figsize=(20,15))
#plt.show()

In [138]:
# make a single histogram for a particular feature
#speeddating[""].hist(bins=50, figsize=(20,15))
#plt.show()

In [51]:
# 'field' is a text (object) column with null values, so the median-based
# imputation used for the numeric columns cannot fill it; drop it instead.
# errors='ignore' keeps this cell safe to re-run: `speeddating` is reassigned
# here, so on a second run the column is already gone and a plain drop
# would raise KeyError.
speeddating = speeddating.drop('field', axis=1, errors='ignore')

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
sd_train, sd_test = train_test_split(speeddating, test_size=0.2, random_state=42)

In [189]:
# loop by column 
#for col in speeddating:
#    print (col)

In [9]:
# Replace NULL values with the median
#for col in speeddating:
#    # field feature doesn't have a median - it's an object
#    if(col != 'field'):
#        median = speeddating[col].median()
#        speeddating[col].fillna(median, inplace=True)
#speeddating.info()

In [53]:
# Pairwise correlations between the columns of the dataset.
# NOTE(review): this uses the full `speeddating` frame, which still includes
# the rows held out in sd_test - consider sd_train.corr() if the test set
# should stay unseen during exploration.
corr_matrix = speeddating.corr()

In [54]:
# Correlation of every column with the 'match' label, largest value first
corr_matrix["match"].sort_values(ascending=False)

match                            1.000000
like                             0.305723
funny_o                          0.277700
funny_partner                    0.277545
shared_interests_o               0.270840
shared_interests_partner         0.270679
attractive_o                     0.260837
attractive_partner               0.260676
guess_prob_liked                 0.255531
intelligence_partner             0.169532
intelligence_o                   0.169448
sinsere_o                        0.164523
sincere_partner                  0.164449
expected_num_interested_in_me    0.143755
ambition_partner                 0.140042
ambitous_o                       0.139927
expected_num_matches             0.130592
met                              0.100427
intelligence                     0.051066
funny_important                  0.041652
pref_o_funny                     0.041483
attractive                       0.036439
interests_correlate              0.031121
expected_happy_with_sd_people    0

In [55]:
# Separate the predictors (every column except 'match') from the 'match'
# label for both splits. Nothing is cleaned yet: missing values remain.
sd_train_x, sd_train_y = sd_train.drop('match', axis=1), sd_train['match'].copy()
sd_test_x, sd_test_y = sd_test.drop('match', axis=1), sd_test['match'].copy()

In [10]:
# use sklearn's imputer to replace missing values
# this will store the median of each feature in its
# statistics_ variable
from sklearn.impute import SimpleImputer

In [27]:
# using just the imputer
#imputer = SimpleImputer(strategy='median')

In [56]:
#imputer.fit(sd_train_x)

SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)

In [57]:
#imputer.statistics_

array([ 0.   ,  3.   ,  0.   ,  3.   , 20.   , 18.37 , 20.   , 18.   ,
       10.   , 10.585,  6.   ,  7.   ,  7.   ,  7.   ,  7.   ,  6.   ,
       20.   , 18.18 , 20.   , 18.   , 10.   , 11.   ,  7.   ,  8.   ,
        8.   ,  8.   ,  8.   ,  6.   ,  7.   ,  7.   ,  7.   ,  7.   ,
        6.   ,  0.21 ,  6.   ,  4.   ,  3.   ,  6.   ,  5.   ,  0.   ])

In [58]:
#sd_train_x.median().values

array([ 0.   ,  3.   ,  0.   ,  3.   , 20.   , 18.37 , 20.   , 18.   ,
       10.   , 10.585,  6.   ,  7.   ,  7.   ,  7.   ,  7.   ,  6.   ,
       20.   , 18.18 , 20.   , 18.   , 10.   , 11.   ,  7.   ,  8.   ,
        8.   ,  8.   ,  8.   ,  6.   ,  7.   ,  7.   ,  7.   ,  7.   ,
        6.   ,  0.21 ,  6.   ,  4.   ,  3.   ,  6.   ,  5.   ,  0.   ])

In [59]:
# makes a plain numpy array
#X = imputer.transform(sd_train_x)

In [75]:
# put it back into a pandas dataframe (if you want)
#X_train = pd.DataFrame(X, columns=sd_train_x.columns)

In [74]:
#X_train.info()

In [64]:
# for creating a pipeline to pass data through in a specific order
# data -> imputer -> std_scaler -> cleaned and standardized data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing chain, applied in this order:
#   'imputer'    - SimpleImputer with the median strategy
#   'std_scaler' - StandardScaler
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [71]:
# Fit the pipeline (imputer medians, scaler statistics) on the training
# features only, then apply those same fitted values to the test features.
# Calling fit_transform on the test set would re-fit every step on test data,
# leaking test-set statistics and putting the two sets on different scales.
X_train_sd = pipeline.fit_transform(sd_train_x)
X_test_sd = pipeline.transform(sd_test_x)

In [72]:
X_train_sd.shape

(6702, 40)

In [73]:
X_test_sd.shape

(1676, 40)