In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
filename = "data/speeddating.csv"

data = pd.read_csv(filename, sep=',', low_memory=False)

data.head()


Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,0,1,female,21,27,6,[4-6],'Asian/Pacific Islander/Asian-American',European/Caucasian-American,0,...,[0-3],[3-5],7,6,[6-8],[5-6],0,1,0,0
1,0,1,female,21,22,1,[0-1],'Asian/Pacific Islander/Asian-American',European/Caucasian-American,0,...,[0-3],[3-5],7,5,[6-8],[5-6],1,1,0,0
2,1,1,female,21,22,1,[0-1],'Asian/Pacific Islander/Asian-American','Asian/Pacific Islander/Asian-American',1,...,[0-3],[3-5],7,?,[6-8],[0-4],1,1,1,1
3,0,1,female,21,23,2,[2-3],'Asian/Pacific Islander/Asian-American',European/Caucasian-American,0,...,[0-3],[3-5],7,6,[6-8],[5-6],0,1,1,1
4,0,1,female,21,24,3,[2-3],'Asian/Pacific Islander/Asian-American','Latino/Hispanic American',0,...,[0-3],[3-5],6,6,[6-8],[5-6],0,1,1,1


In [3]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 123 entries, has_null to match
dtypes: int64(7), object(116)
memory usage: 7.9+ MB


Looking at our data, we see that we have a large number of columns (123). This is a warning for us because there can be some columns which are highly correlated or explain similar information and these are not beneficial for out study. Moreover, there can be some columns which we already know it won’t affect much the target variable. Hence, we are going to analyze the columns and decide which should stay and which no.

In [4]:
print(data.columns.tolist())

['has_null', 'wave', 'gender', 'age', 'age_o', 'd_age', 'd_d_age', 'race', 'race_o', 'samerace', 'importance_same_race', 'importance_same_religion', 'd_importance_same_race', 'd_importance_same_religion', 'field', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence', 'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important', 'd_attractive_important', 'd_sincere_important', 'd_intellicence_important', 'd_funny_important', 'd_ambtition_important', 'd_shared_interests_important', 'attractive', 'sincere', '

On the one hand, there are some informative columns which don’t affect the target variable but explain some information of the date. We are going to remove these columns because are noise to our study. Those columns are: wave, position, position_o, pid, iid, id, partner, pid_o, round, has_null.

On the other hand, we have found out that there are some columns which talk about the same information and are redundant. We should remove this columns as well to avoid multicolinearity and simplify the study. Tese columns are all the collumns which have d_ at the beginning. These columns talk about the difference betwen columns.: d_age is the difference in ages, d_attractive is the difference betwen attractivness ratings, … We consider a better option to stay with the individual columns (for example age and age_o) instead of using the difference (d_age) because those columns explain more information and the difference column is inherent in the others (our network can interpret this relation). This happens with all the difference columns, so we have decided to remove them all.

In [5]:
# List of columns to delete
cols_to_drop = [
    'wave', 'position', 'position_o', 'pid', 'iid', 'id', 'partner', 'pid_o',
    'round', 'has_null', 'samerace', 'd_age', 'd_field', 'd_race',
    'd_importance_same_race', 'd_importance_same_religion',
    'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence',
    'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests',
    'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o',
    'd_ambitous_o', 'd_shared_interests_o', 'd_attractive_important',
    'd_sincere_important', 'd_intellicence_important', 'd_funny_important',
    'd_ambtition_important', 'd_shared_interests_important', 'd_attractive',
    'd_sincere', 'd_intelligence', 'd_funny', 'd_ambition',
    'd_attractive_partner', 'd_sincere_partner', 'd_intelligence_partner',
    'd_funny_partner', 'd_ambition_partner', 'd_shared_interests_partner',
    'd_sports', 'd_tvsports', 'd_exercise', 'd_dining', 'd_museums', 'd_art',
    'd_hiking', 'd_gaming', 'd_clubbing', 'd_reading', 'd_tv', 'd_theater',
    'd_movies', 'd_concerts', 'd_music', 'd_shopping', 'd_yoga',
    'd_interests_correlate', 'd_expected_happy_with_sd_people',
    'd_expected_num_interested_in_me', 'd_expected_num_matches',
    'd_like', 'd_guess_prob_liked', 'expected_num_interested_in_me',
    'expected_num_matches', 'shared_interests_partner', 'shared_interests_o'
]

data = data.drop(columns=cols_to_drop, errors='ignore')

print(f"Number of remaining columns: {data.shape[1]}")

Number of remaining columns: 61


In [6]:
# List of columns to delete
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   gender                         8378 non-null   object
 1   age                            8378 non-null   object
 2   age_o                          8378 non-null   object
 3   d_d_age                        8378 non-null   object
 4   race                           8378 non-null   object
 5   race_o                         8378 non-null   object
 6   importance_same_race           8378 non-null   object
 7   importance_same_religion       8378 non-null   object
 8   field                          8378 non-null   object
 9   pref_o_attractive              8378 non-null   object
 10  pref_o_sincere                 8378 non-null   object
 11  pref_o_intelligence            8378 non-null   object
 12  pref_o_funny                   8378 non-null   object
 13  pre

In [7]:
## Nominal atributes
obj_cols = data.select_dtypes(include='object').columns

for col in obj_cols:
    print(col, data[col].unique()[:15])

gender ['female' 'male']
age ['21' '24' '25' '23' '22' '26' '27' '30' '28' '?' '29' '34' '35' '32' '39']
age_o ['27' '22' '23' '24' '25' '30' '28' '21' '26' '29' '?' '39' '32' '34' '35']
d_d_age ['[4-6]' '[0-1]' '[2-3]' '[7-37]']
race ["'Asian/Pacific Islander/Asian-American'" 'European/Caucasian-American'
 'Other' "'Latino/Hispanic American'" "'Black/African American'" '?']
race_o ['European/Caucasian-American' "'Asian/Pacific Islander/Asian-American'"
 "'Latino/Hispanic American'" 'Other' "'Black/African American'" '?']
importance_same_race ['2' '8' '1' '4' '7' '3' '9' '10' '?' '5' '6' '0']
importance_same_religion ['4' '5' '1' '3' '2' '8' '10' '6' '?' '7' '9']
field ['Law' 'law' 'Economics' "'Masters in Public Administration'"
 "'Masters of Social Work&Education'" 'Finance' 'Business'
 "'political science'" 'money' "'Operations Research'" "'TC [Health Ed]'"
 'Psychology' "'social work'" "'Social Work'"
 "'Speech Language Pathology'"]
pref_o_attractive ['35' '60' '19' '30' '50' '33.3

In [8]:
data.replace('?', pd.NA, inplace=True)

In [9]:
## Nulls
data.shape[0] - data.dropna().shape[0]

1912

In [10]:
null_counts = data.isnull().sum()

null_counts_with_values = null_counts[null_counts != 0]

print(null_counts_with_values)

age                               95
age_o                            104
race                              63
race_o                            73
importance_same_race              79
importance_same_religion          79
field                             63
pref_o_attractive                 89
pref_o_sincere                    89
pref_o_intelligence               89
pref_o_funny                      98
pref_o_ambitious                 107
pref_o_shared_interests          129
attractive_o                     212
sinsere_o                        287
intelligence_o                   306
funny_o                          360
ambitous_o                       722
attractive_important              79
sincere_important                 79
intellicence_important            79
funny_important                   89
ambtition_important               99
shared_interests_important       121
attractive                       105
sincere                          105
intelligence                     105
f

In [11]:
data.describe()

Unnamed: 0,decision,decision_o,match
count,8378.0,8378.0,8378.0
mean,0.419909,0.419551,0.164717
std,0.493573,0.493515,0.370947
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,1.0,1.0,0.0
max,1.0,1.0,1.0


In [12]:
for col in obj_cols:
    print(col, data[col].unique()[:10])

gender ['female' 'male']
age ['21' '24' '25' '23' '22' '26' '27' '30' '28' <NA>]
age_o ['27' '22' '23' '24' '25' '30' '28' '21' '26' '29']
d_d_age ['[4-6]' '[0-1]' '[2-3]' '[7-37]']
race ["'Asian/Pacific Islander/Asian-American'" 'European/Caucasian-American'
 'Other' "'Latino/Hispanic American'" "'Black/African American'" <NA>]
race_o ['European/Caucasian-American' "'Asian/Pacific Islander/Asian-American'"
 "'Latino/Hispanic American'" 'Other' "'Black/African American'" <NA>]
importance_same_race ['2' '8' '1' '4' '7' '3' '9' '10' <NA> '5']
importance_same_religion ['4' '5' '1' '3' '2' '8' '10' '6' <NA> '7']
field ['Law' 'law' 'Economics' "'Masters in Public Administration'"
 "'Masters of Social Work&Education'" 'Finance' 'Business'
 "'political science'" 'money' "'Operations Research'"]
pref_o_attractive ['35' '60' '19' '30' '50' '33.33' '100' '15' '45' '20']
pref_o_sincere ['20' '0' '18' '5' '10' '15' '11.11' '25' '18.18' '35']
pref_o_intelligence ['20' '0' '19' '15' '30' '25' '11.11

In [13]:
data.gender.value_counts()

gender
male      4194
female    4184
Name: count, dtype: int64

In [14]:
## Duplicates
duplicates = data.duplicated()
print(df[duplicates])

NameError: name 'df' is not defined

In [None]:
## Nulls
data.shape[0] - data.dropna().shape[0]