## Feature Engineering Test Notebook

This notebook is being used to prerun some feature engineering steps individually that I'm concerned might take too long to run. Not all feature engineering steps are in this notebook, and the cleaned_credit_card_fraud.parquet will not be directly editted in this notebook. Instead this notebook will run some specific features to see how long it takes to process. 

In [1]:
import pandas as pd
import logging
import wandb
import os
import wandb
from geopy.geocoders import Nominatim
from time import time, sleep
from haversine import haversine, Unit
import swifter

In [2]:
# grabbing cleaned dataset from WandB
run = wandb.init()
artifact = run.use_artifact(
    "lhan122-student/credit_card_fraud/cleaned_credit_card_data:latest", type="dataset"
)
artifact_dir = artifact.download()
file_path = os.path.join(artifact_dir, "cleaned_credit_card_fraud.parquet")
df = pd.read_parquet(file_path)

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: lhan122 (lhan122-student). Use `wandb login --relogin` to force relogin


wandb: Downloading large artifact cleaned_credit_card_data:latest, 85.55MB. 1 files... 
wandb:   1 of 1 files downloaded.  
Done. 0:0:0.3


In [3]:
# creating map for customer location
cust_loc = df[["city", "state"]].drop_duplicates()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   trans_dt    1296675 non-null  datetime64[ns]
 1   cc_num      1296675 non-null  int64         
 2   merchant    1296675 non-null  category      
 3   category    1296675 non-null  category      
 4   amt         1296675 non-null  float64       
 5   city        1296675 non-null  category      
 6   state       1296675 non-null  category      
 7   lat         1296675 non-null  float64       
 8   long        1296675 non-null  float64       
 9   job         1296675 non-null  category      
 10  dob         1296675 non-null  datetime64[ns]
 11  trans_num   1296675 non-null  object        
 12  merch_lat   1296675 non-null  float64       
 13  merch_long  1296675 non-null  float64       
 14  is_fraud    1296675 non-null  int64         
dtypes: category(5), datetime64[ns](2

In [4]:
# getting lat and long values for city and state
geolocator = Nominatim(user_agent="geoapi")
start_time = time()


def get_lat_long(city, state):
    try:
        location = geolocator.geocode(f"{city}, {state}")
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except Exception as e:
        logger.error(f"Error fetching location for {city}, {state}: {e}")
        return None, None


# apply geocoding
cust_loc["lat_long"] = cust_loc.apply(
    lambda x: get_lat_long(x["city"], x["state"]), axis=1
)
cust_loc[["cust_lat", "cust_long"]] = pd.DataFrame(
    cust_loc["lat_long"].tolist(), index=cust_loc.index
)

end_time = time()
total_time = end_time - start_time
print(total_time)

KeyboardInterrupt: 

In [None]:
# uploading it to WandB as well for record keeping
cust_loc_file = "cust_loc.parquet"
df.to_parquet(cust_loc_file, index=False)

artifact = wandb.Artifact(
    name="cust_loc_data",
    type="dataset",
    description="Lat and Long values for customer's city and state.",
)
artifact.add_file(cust_loc_file)
run.log_artifact(artifact)

In [None]:
# creating 'trans_distance' column to see how long it takes to process
start_time = time()

df["trans_distance_km"] = df.swifter.apply(
    lambda row: haversine(
        (row["cust_lat"], row["cust_long"]),
        (row["lat"], row["long"]),
        unit=Unit.KILOMETERS,
    ),
    axis=1,
)
end_time = time()
total_time = end_time - start_time
print(total_time)

In [34]:
df["trans_dt"] = pd.to_datetime(df["trans_dt"])

# Sort DataFrame
df = df.sort_values(["cc_num", "trans_dt"])

# Perform rolling count
df["trans_by_last_hr"] = (
    df.groupby("cc_num")
    .rolling("1H", on="trans_dt")["trans_dt"]
    .count()
    .reset_index(drop=True)  # Remove unnecessary index levels
)

In [35]:
df.head()

Unnamed: 0,trans_dt,cc_num,merchant,category,amt,city,state,lat,long,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_by_last_hr
0,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",misc_net,7.27,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,98e3dcf98101146a577f85a34e58feec,43.974711,-109.741904,0,1.0
1,2019-01-02 08:44:57,60416207185,fraud_Berge LLC,gas_transport,52.94,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,498120fc45d277f7c88e3dba79c33865,42.018766,-109.044172,0,1.0
2,2019-01-02 08:47:36,60416207185,fraud_Luettgen PLC,gas_transport,82.08,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,95f514bb993151347c7acdf8505c3d62,42.961335,-109.157564,0,2.0
3,2019-01-02 12:38:14,60416207185,fraud_Daugherty LLC,kids_pets,34.79,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,4f0c1a14e0aa7eb56a490780ef9268c5,42.228227,-108.747683,0,1.0
4,2019-01-02 13:10:46,60416207185,fraud_Beier and Sons,home,27.18,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,3b2ebd3af508afba959640893e1e82bc,43.321745,-108.091143,0,2.0


In [36]:
df["trans_dt"] = pd.to_datetime(df["trans_dt"])

# Sort DataFrame
df = df.sort_values(["cc_num", "trans_dt"])

# Perform rolling count
df["trans_by_last_hr"] = (
    df.groupby("cc_num")
    .rolling("1H", on="trans_dt")["amt"]
    .sum()
    .reset_index(drop=True)  # Remove unnecessary index levels
)

In [37]:
df.head()

Unnamed: 0,trans_dt,cc_num,merchant,category,amt,city,state,lat,long,job,dob,trans_num,merch_lat,merch_long,is_fraud,trans_by_last_hr
0,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",misc_net,7.27,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,98e3dcf98101146a577f85a34e58feec,43.974711,-109.741904,0,7.27
1,2019-01-02 08:44:57,60416207185,fraud_Berge LLC,gas_transport,52.94,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,498120fc45d277f7c88e3dba79c33865,42.018766,-109.044172,0,52.94
2,2019-01-02 08:47:36,60416207185,fraud_Luettgen PLC,gas_transport,82.08,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,95f514bb993151347c7acdf8505c3d62,42.961335,-109.157564,0,135.02
3,2019-01-02 12:38:14,60416207185,fraud_Daugherty LLC,kids_pets,34.79,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,4f0c1a14e0aa7eb56a490780ef9268c5,42.228227,-108.747683,0,34.79
4,2019-01-02 13:10:46,60416207185,fraud_Beier and Sons,home,27.18,Fort Washakie,WY,43.0048,-108.8964,Information systems manager,1986-02-17,3b2ebd3af508afba959640893e1e82bc,43.321745,-108.091143,0,61.97
