In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "SGDspeeddating"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [2]:
import pandas as pd

In [3]:
# for reading in the data from a csv into a pandas dataframe
def load_data():
    return pd.read_csv('../Dataset/speeddating.csv')

In [4]:
# read the data into speeddating
speeddating = load_data()

In [5]:
# check out the first 5 values
#speeddating.head()

In [6]:
# shows where null values exist and other cool info
speeddating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 42 columns):
gender                           8378 non-null int64
d_age                            8378 non-null int64
samerace                         8378 non-null int64
importance_same_race             8299 non-null float64
field                            8315 non-null object
pref_o_attractive                8289 non-null float64
pref_o_sincere                   8289 non-null float64
pref_o_intelligence              8289 non-null float64
pref_o_funny                     8280 non-null float64
pref_o_ambitious                 8271 non-null float64
pref_o_shared_interests          8249 non-null float64
attractive_o                     8166 non-null float64
sinsere_o                        8091 non-null float64
intelligence_o                   8072 non-null float64
funny_o                          8018 non-null float64
ambitous_o                       7656 non-null float64
shared_interests_o  

In [7]:
# plug in an feature name to see how often each possible feature value occurs
#speeddating["field"].value_counts()

In [8]:
# some useful stats on the dataset
#speeddating.describe()

In [9]:
# make histograms for all the features
#speeddating.hist(bins=50, figsize=(20,15))
#plt.show()

In [10]:
# make a single histogram for a particular feature
#speeddating[""].hist(bins=50, figsize=(20,15))
#plt.show()

In [11]:
# contains null values and since it is text, they cannot be replaced
speeddating = speeddating.drop('field', axis=1)

In [12]:
from sklearn.model_selection import train_test_split

sd_train, sd_test = train_test_split(speeddating, test_size=0.2, random_state=42)

In [13]:
# loop by column 
#for col in speeddating:
#    print (col)

In [14]:
# Replace NULL values with the median
#for col in speeddating:
#    # field feature doesn't have a median - it's an object
#    if(col != 'field'):
#        median = speeddating[col].median()
#        speeddating[col].fillna(median, inplace=True)
#speeddating.info()

In [15]:
corr_matrix = speeddating.corr()

In [16]:
corr_matrix["match"].sort_values(ascending=False)

match                            1.000000
like                             0.305723
funny_o                          0.277700
funny_partner                    0.277545
shared_interests_o               0.270840
shared_interests_partner         0.270679
attractive_o                     0.260837
attractive_partner               0.260676
guess_prob_liked                 0.255531
intelligence_partner             0.169532
intelligence_o                   0.169448
sinsere_o                        0.164523
sincere_partner                  0.164449
expected_num_interested_in_me    0.143755
ambition_partner                 0.140042
ambitous_o                       0.139927
expected_num_matches             0.130592
met                              0.100427
intelligence                     0.051066
funny_important                  0.041652
pref_o_funny                     0.041483
attractive                       0.036439
interests_correlate              0.031121
expected_happy_with_sd_people    0

In [63]:
# create train and test data and targets/labels (not cleaned)
sd_train_x = sd_train.drop('match', axis=1)
sd_train_y = sd_train['match'].copy()
sd_test_x = sd_test.drop('match', axis=1)
sd_test_y = sd_test['match'].copy()

In [64]:
# convert to numpy arrays to work with models
y_train_sd = pd.DataFrame.to_numpy(sd_train_y)
print(sd_train_y.shape)
y_test_sd = pd.DataFrame.to_numpy(sd_test_y)
print(sd_test_y.shape)

(6702,)
(1676,)


In [18]:
# use sklearn's imputer to replace missing values
# this will store the median of each feature in its
# statistics_ variable
from sklearn.impute import SimpleImputer

In [19]:
# using just the imputer
#imputer = SimpleImputer(strategy='median')

In [20]:
#imputer.fit(sd_train_x)

In [21]:
#imputer.statistics_

In [22]:
#sd_train_x.median().values

In [23]:
# makes a plain numpy array
#X = imputer.transform(sd_train_x)

In [24]:
# put it back into a pandas dataframe (if you want)
#X_train = pd.DataFrame(X, columns=sd_train_x.columns)

In [25]:
#X_train.info()

In [26]:
# for creating a pipeline to pass data through in a a specific order
# data -> imputer -> std_scaler -> cleaned and standardized data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [27]:
X_train_sd = pipeline.fit_transform(sd_train_x)
X_test_sd = pipeline.fit_transform(sd_test_x)

In [28]:
X_train_sd.shape

(6702, 40)

In [29]:
X_test_sd.shape

(1676, 40)

In [None]:
from sklearn.linear_model import SGDClassifier

In [360]:
sgd_clf = SGDClassifier(max_iter=50, tol=-np.infty, random_state=42, loss='log', penalty='elasticnet')
sgd_clf.fit(X_train_sd, y_train_sd)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=42, shuffle=True, tol=-inf,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [361]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train_sd, y_train_sd, cv=10, scoring='accuracy')

array([0.84947839, 0.84053651, 0.86736215, 0.8641791 , 0.85671642,
       0.84029851, 0.86268657, 0.85522388, 0.86119403, 0.87892377])

In [362]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=10, random_state=42)

for train_index, test_index in skfolds.split(X_train_sd, y_train_sd):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train_sd[train_index]
    y_train_folds = (y_train_sd[train_index])
    X_test_fold = X_train_sd[test_index]
    y_test_fold = (y_train_sd[test_index])
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct/len(y_pred))

0.849478390461997
0.8405365126676602
0.8673621460506706
0.8641791044776119
0.8567164179104477
0.8402985074626865
0.8626865671641791
0.8552238805970149
0.8611940298507462
0.8789237668161435


In [72]:
from sklearn.model_selection import cross_val_predict

In [363]:
xval_predict_train = cross_val_predict(sgd_clf, X_train_sd, y_train_sd, cv=10)

In [364]:
xval_predict_test = cross_val_predict(sgd_clf, X_test_sd, y_test_sd, cv=10)

In [74]:
from sklearn.metrics import classification_report

In [365]:
print(classification_report(y_train_sd, xval_predict_train))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      5609
           1       0.62      0.33      0.43      1093

   micro avg       0.86      0.86      0.86      6702
   macro avg       0.75      0.65      0.68      6702
weighted avg       0.84      0.86      0.84      6702



In [366]:
print(classification_report(y_test_sd, xval_predict_test))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1389
           1       0.47      0.45      0.46       287

   micro avg       0.82      0.82      0.82      1676
   macro avg       0.68      0.67      0.68      1676
weighted avg       0.82      0.82      0.82      1676



In [77]:
from sklearn.metrics import confusion_matrix

In [367]:
confusion_matrix(y_train_sd, xval_predict_train)

array([[5384,  225],
       [ 729,  364]])

In [368]:
confusion_matrix(y_test_sd, xval_predict_test)

array([[1241,  148],
       [ 157,  130]])