# Preparing the Speed Dating Dataset

<b> Download and load the dataset </b>

In [1]:
# import package
import pandas as pd

In [2]:
# URL of the raw Speed_Dating_Data.csv file in the workshop's GitHub repository
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter11/dataset/Speed_Dating_Data.csv'

In [3]:
# read the CSV at url_path into a DataFrame and preview its first rows
df = pd.read_csv(url_path)
df.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


<b> Print out the dimensions of the DataFrame </b>

In [4]:
# (number of rows, number of columns)
df.shape

(8378, 195)

<b> Check for duplicate rows </b>

In [5]:
# number of rows that repeat an earlier row across all columns
df.duplicated().sum()

0

<b> Check for duplicate rows for the identifier columns (iid, id, partner, and pid) </b>

In [6]:
# number of rows whose (iid, id, partner, pid) combination repeats an earlier row
df[['iid', 'id', 'partner', 'pid']].duplicated().sum()

0

<b> Check for unexpected values for the following numerical variables: 'imprace', 'imprelig', 'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy', and 'satis_2' </b>

If you looked at the dataset description document, then you'll know that the values of the following variables should range between 1 and 10: 'imprace', 'imprelig', 'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy', and 'satis_2'. 

In [7]:
# columns expected to hold ratings between 1 and 10
# (no line-continuation backslashes are needed inside the brackets)
scale_1_10 = [
    'imprace', 'imprelig',
    'sports', 'tvsports', 'exercise', 'dining',
    'museums', 'art', 'hiking', 'gaming', 'clubbing',
    'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
    'exphappy', 'satis_2',
]

In [8]:
# helper for the range checks
def check_range(col, min_value, max_value):
    """Flag the values of `col` that fall outside [min_value, max_value].

    Parameters
    ----------
    col : values supporting element-wise `<`, `>` and `|`
        (the notebook passes a DataFrame column).
    min_value, max_value : inclusive bounds of the accepted range.

    Returns
    -------
    The element-wise result of `(col < min_value) | (col > max_value)`:
    True where a value is strictly below the minimum or strictly above the
    maximum. Values for which both comparisons are False (e.g. NaN in a
    float column) are not flagged.
    """
    below_min = col < min_value
    above_max = col > max_value
    return below_min | above_max

In [9]:
# Try check_range on the 'imprace' column (expected range 1-10)
# and count how many rows are flagged as out of range
unexpected_mask = check_range(df['imprace'], 1, 10)
unexpected_mask.sum()

8

In [10]:
# helper that reports the out-of-range values found in one column
def print_unexpected(df, col_name, unexp_mask):
    """Print a short report for a column when it contains flagged values.

    Parameters
    ----------
    df : DataFrame holding the column.
    col_name : label of the column being reported.
    unexp_mask : boolean mask over the rows of `df`; True marks an
        unexpected value.

    When at least one row is flagged, three lines are printed: the column
    name, the number of flagged rows, and the distinct flagged values.
    Nothing is printed otherwise. Returns None.
    """
    n_unexpected = unexp_mask.sum()
    if not n_unexpected > 0:
        return
    print(col_name)
    print(n_unexpected)
    print(df.loc[unexp_mask, col_name].unique())

In [11]:
# Try print_unexpected on 'imprace', reusing the mask computed with check_range
print_unexpected(df, 'imprace', unexpected_mask)

imprace
8
[0.]


In [12]:
def check_ranges(df, col_list, min_value, max_value):
    """Report out-of-range values for several columns of a DataFrame.

    For each column label in `col_list`, build the out-of-range mask with
    check_range and hand it to print_unexpected, which prints a report only
    for columns that have at least one flagged value.

    Parameters
    ----------
    df : DataFrame to inspect.
    col_list : iterable of column labels to check.
    min_value, max_value : inclusive bounds of the accepted range.

    Returns None.
    """
    for col_name in col_list:
        print_unexpected(
            df,
            col_name,
            check_range(df[col_name], min_value, max_value),
        )

In [13]:
# Run the check on every column in scale_1_10, using 1 and 10 as the accepted bounds
check_ranges(df, scale_1_10, 1, 10)

imprace
8
[0.]
museums
18
[0.]
art
18
[0.]
hiking
18
[0.]
gaming
137
[14.  0.]
clubbing
18
[0.]
reading
51
[13.]
theater
18
[0.]
movies
18
[0.]
concerts
18
[0.]
yoga
36
[0.]


<b> Replace the identified incorrect values </b>

In [14]:
# helper that overwrites one incorrect value in a column
def replace_value(df, col_name, incorrect_value, new_value):
    """Replace every occurrence of `incorrect_value` in one column, in place.

    Parameters
    ----------
    df : DataFrame to modify (it is mutated, not copied).
    col_name : label of the column to correct.
    incorrect_value : value to look for (matched with `==`).
    new_value : value written to the matching rows.

    After the replacement, the column name and the column's distinct values
    are printed so the result can be checked. Returns None.
    """
    rows_to_fix = df[col_name] == incorrect_value
    df.loc[rows_to_fix, col_name] = new_value
    print(col_name)
    print(df[col_name].unique())

In [15]:
# replace the out-of-scale value 14 found in 'gaming' with 10
replace_value(df, 'gaming', 14, 10)

gaming
[ 1.  5.  4.  6.  2.  3.  7.  8. 10. nan  9.  0.]


In [16]:
# replace the out-of-scale value 13 found in 'reading' with 10
replace_value(df, 'reading', 13, 10)

reading
[ 6. 10.  7.  9.  8.  4.  5. nan  2.  3.  1.]


In [17]:
# For each suffix below, collect the columns whose name ends with that suffix
# into suffix_cols, then report any value outside the 0-100 range.
for suffix in (
    '1_1', '1_2', '1_3', '1_s',
    '2_1', '2_2', '2_3',
    '4_1', '4_2', '4_3',
    '7_2', '7_3',
):
    suffix_cols = [name for name in df.columns if name.endswith(suffix)]
    check_ranges(df, suffix_cols, min_value=0, max_value=100)

No output is displayed, which means that all these columns have values within the expected range, that is, between 0 and 100.

In [18]:
# Same check for the columns ending with the suffixes below,
# this time with 1 and 10 as the accepted bounds.
for suffix in ('3_1', '3_2', '3_3', '5_1', '5_2', '5_3', '3_s'):
    suffix_cols = [name for name in df.columns if name.endswith(suffix)]
    check_ranges(df, suffix_cols, min_value=1, max_value=10)

attr3_3
112
[12.]
sinc3_3
173
[12.]
intel3_3
233
[12.]
fun3_3
153
[12.]
amb3_3
147
[12.]


In [19]:
# Call replace_value on each flagged column ending with 3_3,
# replacing the out-of-scale value 12 with 10.
for col in ('attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3'):
    replace_value(df, col, incorrect_value=12, new_value=10)

attr3_3
[ 5.  7. nan  6.  4.  9.  8.  3. 10.  2.]
sinc3_3
[ 7.  6. nan  5.  8.  9. 10.  4.  3.  2.]
intel3_3
[ 7.  9. nan  6. 10.  8.  5.  4.  3.]
fun3_3
[ 7.  9. nan  8.  6.  3.  5. 10.  2.  4.]
amb3_3
[ 7.  4. nan  5. 10.  9.  8.  6.  2.  3.  1.]


<b> Check the data type of the different columns </b>

In [20]:
# data type of every column as loaded from the CSV
df.dtypes

iid           int64
id          float64
gender        int64
idg           int64
condtn        int64
             ...   
attr5_3     float64
sinc5_3     float64
intel5_3    float64
fun5_3      float64
amb5_3      float64
Length: 195, dtype: object

<b> Change the data types to categorical for the columns that don't contain numerical values </b>

In [21]:
# columns that are kept as numerical variables
num_cols = ['round', 'order', 'int_corr', 'age', 'mn_sat', 'income', 'expnum']

In [22]:
# every column of df that is not listed in num_cols
cat_cols = df.columns.difference(num_cols)

In [23]:
# cast every column listed in cat_cols to the 'category' dtype in one astype call
df = df.astype({col_name: 'category' for col_name in cat_cols})

In [24]:
# confirm the data types after the conversion to category
df.dtypes

iid         category
id          category
gender      category
idg         category
condtn      category
              ...   
attr5_3     category
sinc5_3     category
intel5_3    category
fun5_3      category
amb5_3      category
Length: 195, dtype: object

<b> Check for any missing values for each numerical variable </b>

In [25]:
# number of missing values in each numerical column
df[num_cols].isna().sum()

round          0
order          0
int_corr     158
age           95
mn_sat      5245
income      4099
expnum      6578
dtype: int64

<b> Replace the missing values for each numerical variable with their corresponding mean or median values </b>

In [26]:
# distinct values of int_corr, to inspect their range before choosing how to impute
df['int_corr'].unique()

array([ 0.14,  0.54,  0.16,  0.61,  0.21,  0.25,  0.34,  0.5 ,  0.28,
       -0.36,  0.29,  0.18,  0.1 , -0.21,  0.32,  0.73,  0.6 ,  0.07,
        0.11,  0.39, -0.24, -0.14,  0.09, -0.04, -0.3 , -0.26, -0.15,
       -0.47, -0.18,  0.05,  0.37,  0.35,  0.15, -0.19, -0.43,  0.  ,
       -0.17,  0.08, -0.16,  0.06, -0.05, -0.13, -0.06,  0.33, -0.51,
        0.12,  0.19,  0.47,  0.03,  0.46,  0.43,  0.52, -0.46, -0.27,
        0.59,  0.31, -0.34, -0.03, -0.11,  0.42, -0.4 , -0.23,  0.17,
        0.68, -0.01, -0.35,  0.3 ,  0.65,  0.24,  0.41,  0.49,  0.01,
        0.22, -0.08,  0.27,  0.44,  0.62, -0.2 , -0.02, -0.33, -0.52,
       -0.1 ,  0.58, -0.57, -0.31, -0.07, -0.32,  0.04, -0.12,  0.48,
       -0.22, -0.29,  0.38,  0.53, -0.38,  0.02, -0.28,  0.13,  0.2 ,
         nan, -0.41, -0.44,  0.51, -0.48,  0.4 ,  0.26,  0.77, -0.49,
       -0.25, -0.09,  0.45, -0.39,  0.83,  0.57, -0.61,  0.72, -0.37,
        0.23, -0.58,  0.8 , -0.56,  0.63, -0.63,  0.71,  0.36,  0.56,
        0.55,  0.76,

The values of the int_corr column range between -1 and 1. It seems like they have been normalized. Since there are no extreme values or outliers, you can impute the missing values with the mean of this variable.

In [27]:
# boolean mask of the rows where int_corr is missing, and how many there are
int_corr_mask = df['int_corr'].isna()
int_corr_mask.sum()

158

In [28]:
# mean of int_corr (missing values are skipped by default); used as the imputation value
int_corr_mean = df['int_corr'].mean()
int_corr_mean

0.19600973236009664

In [29]:
# Impute the missing int_corr values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['int_corr']: that form operates on an
# intermediate Series, which triggers a chained-assignment warning on recent
# pandas versions and leaves df unchanged when Copy-on-Write is enabled.
df['int_corr'] = df['int_corr'].fillna(int_corr_mean)

In [30]:
# confirm that int_corr has no missing values left
df['int_corr'].isna().sum()

0

In [31]:
# remaining numerical columns that still contain missing values
missing_num_cols = ['age', 'mn_sat', 'income', 'expnum']

In [32]:
# show each column's name followed by its distinct values, one per line
for col_name in missing_num_cols:
    print(col_name, df[col_name].unique(), sep='\n')

age
[21. 24. 25. 23. 22. 26. 27. 30. 28. nan 29. 34. 35. 32. 39. 20. 19. 18.
 37. 33. 36. 31. 42. 38. 55.]
mn_sat
[  nan 1070. 1258. 1400. 1290. 1460. 1430. 1215. 1330. 1450. 1155. 1140.
 1360. 1402. 1250. 1210. 1220. 1410. 1260. 1380. 1030. 1309. 1308. 1050.
 1100. 1310. 1490. 1188. 1097. 1212. 1340. 1034. 1185. 1242. 1160. 1099.
 1214. 1270. 1110. 1178. 1060. 1157. 1180. 1014. 1341.  990. 1320. 1159.
 1370. 1105. 1365. 1011. 1130. 1206. 1331. 1191.  914. 1200. 1080. 1090.
 1092. 1470. 1149. 1134. 1230. 1267. 1280. 1227. 1239.]
income
[ 69487.  65929.     nan  37754.  86340.  60304.  54620.  48652.  29237.
  56580.  36782.  38548.  52010.  28418.  43185.  23152.  43664.  48441.
  61152.  36485.  41507.  17134.  30038.  33772.  24997.  42096.  28891.
  62635.  12063.  29809.  26482.  30147.  39919.  41466.  23988.  28989.
  50948.  38022.  47559.  53539.  32159.  53940.  40753.  38207.  46166.
  30973.  28317.  26645.  25589.  55223. 109031.  40409.  21597.  76624.
  35968.  51725.  55

In [33]:
# Impute the missing values of each remaining numerical column with its median,
# printing the column name and the median that was used.
for col_name in missing_num_cols:
    col_median = df[col_name].median()
    # Assign the filled column back instead of calling fillna(..., inplace=True)
    # on df[col_name]: the in-place call acts on an intermediate Series, which
    # triggers a chained-assignment warning on recent pandas versions and
    # leaves df unchanged when Copy-on-Write is enabled.
    df[col_name] = df[col_name].fillna(col_median)
    print(col_name)
    print(col_median)

age
26.0
mn_sat
1310.0
income
43185.0
expnum
4.0


In [34]:
# confirm that none of the imputed columns has missing values left
df[missing_num_cols].isna().sum()

age       0
mn_sat    0
income    0
expnum    0
dtype: int64