In [1]:
# data cleaning
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="myApp")


def create_labels(n):
    labels = []
    for i in range(0, len(n)-1):
        if i < len(n) -1:
            labels.append(str(n[i]) + " and " + str(n[i + 1]))
    return labels

In [2]:
speed_data = pd.read_csv("../data/raw/Speed_Dating.csv",encoding='latin')

In [3]:
speed_data.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [4]:
# transform to get one row per couple
speed_data["date_id"] = [f"{a}-{b}-{c}-{d}" for a,b,c,d in zip(speed_data.wave, speed_data.order, speed_data["round"], speed_data.position)]

In [5]:
# features

identifiers = ["date_id", "iid", "match", "wave"]

preference = ["pf_o_att", "pf_o_sin", "pf_o_int", "pf_o_fun", "pf_o_amb", "pf_o_sha"]

personal_info = ["field_cd", "age", "race", "income", "zipcode", "samerace", "goal", "from"]

behaviour = ["go_out",
           "sports", "tvsports", "exercise", "dining", "museums", "art", "hiking", "gaming",
           "clubbing", "reading", "tv", "theater", "movies", "concerts", "music", "shopping",
           "yoga", "exphappy"]

looking_for = ["attr1_1", "sinc1_1", "intel1_1", "fun1_1", "amb1_1", "imprace", "imprelig"]

self_rating = [ "attr3_1", "sinc3_1", "intel3_1", "fun3_1", "amb3_1"]

to_impute = preference + behaviour + looking_for + self_rating

to_normalise = behaviour + looking_for + self_rating + preference


In [6]:
# difference between what person b i slooking for a what person a rates themselves 1_1_b and 3_1_a
# difference between what person a i slooking for b what person a rates themselves 1_1_a and 3_1_b

In [7]:
speed_data = speed_data[identifiers + preference + personal_info + behaviour + looking_for + self_rating]

In [8]:
speed_data["from"] = speed_data["from"].replace("new york city", "New York")

df_ny_zip = speed_data[speed_data["from"] == "New York"]["zipcode"]
ny_zip = df_ny_zip.loc[df_ny_zip.index[0]]
print("A New York zipcode is: ", ny_zip)

df_nj_zip = speed_data[speed_data["from"] == "New Jersey"]["zipcode"]
nj_zip = df_nj_zip.loc[df_nj_zip.index[0]]
print("A New Jersey zipcode is: ", nj_zip)

m1 = speed_data['zipcode'].isnull()
m2 = speed_data["from"] == "New Jersey"
speed_data.loc[m1 & m2, 'zipcode'] = nj_zip 

m1 = speed_data['zipcode'].isnull()
m2 = speed_data["from"] == "New York"
speed_data.loc[m1 & m2, 'zipcode'] = ny_zip 

speed_data["zipcode"] = speed_data["zipcode"].fillna("Not applicable")

A New York zipcode is:  10,028
A New Jersey zipcode is:  7,661


In [9]:
# feature engineering
speed_data["income"] = speed_data["income"].str.replace(",", "")

speed_data["income"] = speed_data["income"].astype("float")
speed_data["income"].fillna(-1, inplace=True)
bins = np.arange(speed_data["income"].min()-1, speed_data["income"].max(), 10000)
labels = create_labels(bins)
labels[0] = "Not provided"
speed_data["income"] = pd.cut(x=speed_data["income"],bins = bins, labels = labels)
speed_data["income"]

speed_data["zipcode"] = speed_data["zipcode"].astype("str").str.replace(",", "")
speed_data["zipcode_area"] = [i[0:3] for i in speed_data["zipcode"]]
#speed_data.drop("zipcode", axis = 1, inplace = True)

In [10]:
#speed_data_no_null = speed_data[speed_data['zipcode'].str.isnumeric()]


In [11]:
from geopy.geocoders import Nominatim
import geopy
geopy.geocoders.options.default_user_agent = "my-application"
geolocator = Nominatim(user_agent="my_user_agent")

In [12]:
lat_list = []
long_list = []

for x in speed_data["zipcode"]:
    try:
        location = geolocator.geocode(x)
        lat = location.latitude
        long = location.longitude
    except:
        lat = "Invalid postcode"
        long = "Invalid postcode"
        
    lat_list.append(lat)
    long_list.append(long)

In [13]:
speed_data["latitude"] = lat_list
speed_data["longitude"] = long_list

In [14]:
for i in speed_data[to_impute]:
 speed_data.loc[speed_data.loc[:,i].isnull(),i]=speed_data.loc[:,i].mean()

In [15]:
# normalise continuous variables to be between 0 and 1
speed_data[to_normalise] = speed_data[to_normalise].apply(lambda x: (x-x.min())/ (x.max()-x.min()), axis=0)

In [16]:
count = 0

for d in speed_data.date_id.unique():
  
    if count == 0:
        speed_data_date1 = speed_data[speed_data["date_id"] == d]
        if len(speed_data_date1) == 2:
            speed_data_date1.iid = ["a", "b"]
        else:
            #print(speed_data1["date_id"])
            next
    
    if count != 0:
        speed_data_date2 = speed_data[speed_data["date_id"] == d]
        if len(speed_data_date2) == 2:
            speed_data_date2.iid = ["a", "b"]
        else:
            #print(speed_data1["date_id"])
            next    
        speed_data_date1 = pd.concat([speed_data_date1, speed_data_date2])

    count = count + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  speed_data_date1.iid = ["a", "b"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  speed_data_date2.iid = ["a", "b"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  speed_data_date2.iid = ["a", "b"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [17]:
#pd.unique(speed_data_date1.iid)
# TODO why dont all date_ids have 2 entries?

In [18]:
df = speed_data_date1[speed_data_date1["iid"] == "a"]
speed_data_date_a = df.add_suffix("_a")

df = speed_data_date1[speed_data_date1["iid"] == "b"]
speed_data_date_b = df.add_suffix("_b")

In [19]:
speed_data_date_all = pd.merge(speed_data_date_a, speed_data_date_b, left_on='date_id_a', right_on='date_id_b')

In [20]:
# match a and b are the same.

speed_data_date_all[["match_b", "match_a"]]

# drop outcome variable
speed_data_date_all.drop(["match_b"], axis = 1, inplace = True)
speed_data_date_all.rename(columns={"match_a": "match"})

# drop iid columns
speed_data_date_all.drop(["iid_a", "iid_b", "samerace_b"], axis = 1, inplace = True)

In [21]:
# age difference
speed_data_date_all["age_difference"] = abs(speed_data_date_all["age_a"] - speed_data_date_all["age_b"])

In [22]:
# difference between what person b i slooking for a what person a rates themselves 1_1_b and 3_1_a
#"attr1_1", "sinc1_1", "intel1_1", "fun1_1", "amb1_1"
#"attr3_1", "sinc3_1", "intel3_1", "fun3_1", "amb3_1"
speed_data_date_all["attr_b_a"] = abs(speed_data_date_all["attr1_1_b"] - speed_data_date_all["attr3_1_a"])
speed_data_date_all["sinc_b_a"] = abs(speed_data_date_all["sinc1_1_b"] - speed_data_date_all["sinc3_1_a"])
speed_data_date_all["intel_b_a"] = abs(speed_data_date_all["intel1_1_b"] - speed_data_date_all["intel3_1_a"])
speed_data_date_all["fun_b_a"] = abs(speed_data_date_all["fun1_1_b"] - speed_data_date_all["fun3_1_a"])
speed_data_date_all["amb_b_a"] = abs(speed_data_date_all["amb1_1_b"] - speed_data_date_all["amb3_1_a"])


# difference between what person a i slooking for b what person a rates themselves 1_1_a and 3_1_b
speed_data_date_all["attr_a_b"] = abs(speed_data_date_all["attr1_1_a"] - speed_data_date_all["attr3_1_b"])
speed_data_date_all["sinc_a_b"] = abs(speed_data_date_all["sinc1_1_a"] - speed_data_date_all["sinc3_1_b"])
speed_data_date_all["intel_a_b"] = abs(speed_data_date_all["intel1_1_a"] - speed_data_date_all["intel3_1_b"])
speed_data_date_all["fun_a_b"] = abs(speed_data_date_all["fun1_1_a"] - speed_data_date_all["fun3_1_b"])
speed_data_date_all["amb_a_b"] = abs(speed_data_date_all["amb1_1_a"] - speed_data_date_all["amb3_1_b"])


In [23]:
import math
def get_distance(x_1, x_2, y_1, y_2): 

  try:
    x = (x_1 - x_2)**2
    y = (y_1 - y_2)**2
    distance = math.sqrt(x + y)

  except:
    distance = -1

  return distance

In [24]:
speed_data_date_all['distance'] = speed_data_date_all.apply(lambda row : get_distance(row["latitude_a"], row["longitude_a"], row["latitude_b"], row["longitude_b"]), axis = 1)

In [26]:
# put distance in bins
speed_data_date_all['distance'] = speed_data_date_all['distance'].astype("float")
bins = np.arange(speed_data_date_all['distance'].min()-1, speed_data_date_all['distance'].max(), 25)
labels = create_labels(bins)
labels[0] = "No zipcode provided"
speed_data_date_all['distance_bin'] = pd.cut(x=speed_data_date_all['distance'],bins = bins, labels = labels)
speed_data_date_all['distance_bin']

0       No zipcode provided
1           123.0 and 148.0
2           198.0 and 223.0
3           123.0 and 148.0
4           148.0 and 173.0
               ...         
2909          73.0 and 98.0
2910          73.0 and 98.0
2911    No zipcode provided
2912          73.0 and 98.0
2913          73.0 and 98.0
Name: distance_bin, Length: 2914, dtype: category
Categories (9, object): ['No zipcode provided' < '23.0 and 48.0' < '48.0 and 73.0' < '73.0 and 98.0' ... '123.0 and 148.0' < '148.0 and 173.0' < '173.0 and 198.0' < '198.0 and 223.0']

In [None]:
speed_data_date_all["zipcode_area_b"] = speed_data_date_all["zipcode_area_b"].replace("Not", 0)
speed_data_date_all["zipcode_area_a"] = speed_data_date_all["zipcode_area_a"].replace("Not", 0)

In [28]:
speed_data_date_all.to_csv("../data/interim/single_row_date.csv")

In [None]:
# for columns where missing is < 30% then impute missing data

# for columns where missing is > 30%- drop