## Feature Engineering Test Notebook

This notebook is used to pre-run, one at a time, the feature engineering steps that I'm concerned might take too long to run. Not all feature engineering steps are in this notebook, and cleaned_credit_card_fraud.parquet will not be directly edited here. Instead, this notebook runs a few specific features to see how long each takes to process.

In [47]:
import pandas as pd
import logging
import wandb
import os
import wandb
from geopy.geocoders import Nominatim
from time import time, sleep 
from haversine import haversine, Unit
import swifter

In [8]:
# Fetch the cleaned dataset artifact from Weights & Biases (WandB).
CLEANED_DATA_ARTIFACT = 'lhan122-student/credit_card_fraud/cleaned_credit_card_data:latest'

run = wandb.init()
artifact = run.use_artifact(CLEANED_DATA_ARTIFACT, type='dataset')
# NOTE(review): no visible cell reads the downloaded file into `df`, yet later
# cells use `df` — confirm where it is loaded before a Restart & Run All.
artifact_dir = artifact.download()
data/customer_locations.csv

wandb: Downloading large artifact cleaned_credit_card_data:latest, 85.55MB. 1 files... 
wandb:   1 of 1 files downloaded.  
Done. 0:0:9.4


In [31]:
# Build the lookup of distinct (city, state) pairs that need geocoding.
location_keys = ['city', 'state']
cust_loc = df.loc[:, location_keys].drop_duplicates()

In [34]:
#getting lat and long values for city and state
# Nominatim geocoder client from geopy; get_lat_long() below reads this global.
geolocator = Nominatim(user_agent="geoapi")
# Wall-clock start for the elapsed time printed at the end of this cell.
start_time = time()

def get_lat_long(city, state):
    """Geocode a "city, state" pair with the module-level ``geolocator``.

    Args:
        city: City name, interpolated into the geocoding query.
        state: State name or abbreviation, interpolated after the city.

    Returns:
        A ``(latitude, longitude)`` tuple when the geocoder returns a match,
        otherwise ``(None, None)`` (no match, or the lookup raised).
    """
    try:
        location = geolocator.geocode(f"{city}, {state}")
    except Exception as e:
        # Best-effort lookup: log and fall through to the "not found" result.
        # The logger is obtained here because no module-level `logger` is
        # defined in this notebook; referencing one would raise NameError
        # from inside the handler and abort the whole geocoding run.
        logging.getLogger(__name__).error(
            "Error fetching location for %s, %s: %s", city, state, e
        )
        return None, None

    if location:
        return location.latitude, location.longitude
    return None, None

# apply geocoding, one (city, state) pair per request
# The public Nominatim service allows at most 1 request per second, so each
# row is padded out to this minimum duration before the next lookup starts.
MIN_SECONDS_PER_REQUEST = 1.0


def _geocode_row_throttled(row):
    """Geocode one row's city/state, taking at least MIN_SECONDS_PER_REQUEST."""
    request_started = time()
    coords = get_lat_long(row['city'], row['state'])
    remaining = MIN_SECONDS_PER_REQUEST - (time() - request_started)
    if remaining > 0:
        sleep(remaining)
    return coords


cust_loc['lat_long'] = cust_loc.apply(_geocode_row_throttled, axis=1)
# Split the (lat, long) tuples into two scalar columns aligned on cust_loc's index.
cust_loc[['cust_lat', 'cust_long']] = pd.DataFrame(cust_loc['lat_long'].tolist(), index=cust_loc.index)

# Elapsed wall-clock time since `start_time` was set at the top of this cell
# (now includes the throttling delay).
end_time = time()
total_time = end_time - start_time
print(total_time)

In [36]:
# Cache the geocoded lookup on disk; the geocoding cell is slow to re-run.
cust_loc_csv_path = "customer_locations.csv"
cust_loc.to_csv(cust_loc_csv_path, index=False)

In [37]:
#uploading it to WandB as well for record keeping
cust_loc_file = 'cust_loc.parquet'
# Write the city/state lookup itself, not the full transaction frame `df`, so
# the file matches the artifact's name and description. The helper 'lat_long'
# tuple column is left out: it duplicates cust_lat/cust_long and is an
# object-dtype column, which is fragile to serialize to parquet.
cust_loc.drop(columns=['lat_long'], errors='ignore').to_parquet(cust_loc_file, index=False)

artifact = wandb.Artifact(
    name="cust_loc_data",
    type="dataset",
    description="Lat and Long values for customer's city and state.",
)
artifact.add_file(cust_loc_file)
run.log_artifact(artifact)

<Artifact cust_loc_data>

In [40]:
# Attach the geocoded customer coordinates to every transaction row (timed).
# NOTE(review): this reassigns `df`; re-running the cell merges a second time
# and pandas will suffix the duplicated coordinate columns — run once per kernel.
start_time = time()
cust_coords = cust_loc[['city', 'state', 'cust_lat', 'cust_long']]
df = pd.merge(df, cust_coords, how='left', on=['city', 'state'])
end_time = time()
total_time = end_time - start_time
print(total_time)

0.15567898750305176


In [49]:
# Create the 'trans_distance_km' column (timed).
# NOTE(review): the distance is taken between the geocoded city coordinates
# (cust_lat, cust_long) and the row's own lat/long columns, not the
# merch_lat/merch_long columns — confirm this is the intended pair of points.
def _row_distance_km(row):
    """Haversine distance in kilometres between the row's two coordinate pairs."""
    geocoded_point = (row['cust_lat'], row['cust_long'])
    recorded_point = (row['lat'], row['long'])
    return haversine(geocoded_point, recorded_point, unit=Unit.KILOMETERS)


start_time = time()
df['trans_distance_km'] = df.swifter.apply(_row_distance_km, axis=1)
end_time = time()
total_time = end_time - start_time
print(total_time)


Pandas Apply:   0%|          | 0/1296675 [00:00<?, ?it/s]

9.180264472961426


In [50]:
# Preview the first rows to check the newly added cust_lat, cust_long and trans_distance_km columns.
df.head()

Unnamed: 0,trans_dt,cc_num,merchant,category,amt,city,state,lat,long,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_type,cust_lat,cust_long,trans_distance_km
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Moravian Falls,NC,36.0788,-81.1781,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,36.011293,-82.048315,0,1,36.102687,-81.181582,2.67443
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0,2,48.866014,-118.202781,2.487368
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.262,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0,0,42.191587,-112.250798,1.513474
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Boulder,MT,46.2306,-112.1138,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,47.034331,-112.561071,0,2,46.236595,-112.120834,0.858536
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Doe Hill,VA,38.4207,-79.4629,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,38.674999,-78.632459,0,2,38.432066,-79.444488,2.041971


In [None]:
#calculating location match

In [51]:
# Close the WandB run that was opened by wandb.init() at the top of the notebook.
run.finish()