In [1]:
import pandas as pd
import logging
import wandb
import os
import wandb

In [8]:
# Start a W&B run and download the cleaned dataset artifact to local disk.
DATASET_ARTIFACT = 'lhan122-student/credit_card_fraud/cleaned_credit_card_data:latest'
run = wandb.init()
artifact = run.use_artifact(DATASET_ARTIFACT, type='dataset')
artifact_dir = artifact.download()  # result of the download call, kept for later cells

wandb: Downloading large artifact cleaned_credit_card_data:latest, 85.55MB. 1 files... 
wandb:   1 of 1 files downloaded.  
Done. 0:0:9.4


In [9]:
# Load the cleaned dataset into a DataFrame.
# The previous cell downloads the artifact into `artifact_dir`, but the original
# read only looked in the working directory, so it silently depended on a local
# copy left over from an earlier step. Prefer the freshly downloaded file and
# fall back to the working-directory path so existing setups keep working.
# NOTE(review): assumes the artifact's single file keeps this filename — confirm.
parquet_name = 'cleaned_credit_card_fraud.parquet'
artifact_parquet = os.path.join(artifact_dir, parquet_name)
df = pd.read_parquet(artifact_parquet if os.path.exists(artifact_parquet) else parquet_name)

In [10]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,trans_dt,cc_num,merchant,category,amt,city,state,lat,long,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Moravian Falls,NC,36.0788,-81.1781,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.262,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Boulder,MT,46.2306,-112.1138,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Doe Hill,VA,38.4207,-79.4629,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,38.674999,-78.632459,0


In [11]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   trans_dt    1296675 non-null  datetime64[ns]
 1   cc_num      1296675 non-null  int64         
 2   merchant    1296675 non-null  category      
 3   category    1296675 non-null  category      
 4   amt         1296675 non-null  float64       
 5   city        1296675 non-null  category      
 6   state       1296675 non-null  category      
 7   lat         1296675 non-null  float64       
 8   long        1296675 non-null  float64       
 9   job         1296675 non-null  category      
 10  dob         1296675 non-null  datetime64[ns]
 11  trans_num   1296675 non-null  object        
 12  merch_lat   1296675 non-null  float64       
 13  merch_long  1296675 non-null  float64       
 14  is_fraud    1296675 non-null  int64         
dtypes: category(5), datetime64[ns](2

In [13]:
# Lookup from merchant category to an integer transaction-type code, used to
# create the new transaction type column:
#   1 -> the *_net categories
#   2 -> the *_pos categories plus gas_transport
#   0 -> every remaining category
# (presumably online / in-person / other — TODO confirm the intended meaning)
transaction_map = {
    **dict.fromkeys(('shopping_net', 'misc_net', 'grocery_net'), 1),
    **dict.fromkeys(('grocery_pos', 'shopping_pos', 'misc_pos', 'gas_transport'), 2),
    **dict.fromkeys(
        (
            'home',
            'kids_pets',
            'entertainment',
            'food_dining',
            'personal_care',
            'health_fitness',
            'travel',
        ),
        0,
    ),
}

In [14]:
# Translate each row's category into its transaction-type code, then store the
# codes on the frame as a categorical column.
trans_type_codes = df['category'].map(transaction_map)
df['trans_type'] = trans_type_codes.astype('category')

In [15]:
# Re-check the schema now that the trans_type column has been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 16 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   trans_dt    1296675 non-null  datetime64[ns]
 1   cc_num      1296675 non-null  int64         
 2   merchant    1296675 non-null  category      
 3   category    1296675 non-null  category      
 4   amt         1296675 non-null  float64       
 5   city        1296675 non-null  category      
 6   state       1296675 non-null  category      
 7   lat         1296675 non-null  float64       
 8   long        1296675 non-null  float64       
 9   job         1296675 non-null  category      
 10  dob         1296675 non-null  datetime64[ns]
 11  trans_num   1296675 non-null  object        
 12  merch_lat   1296675 non-null  float64       
 13  merch_long  1296675 non-null  float64       
 14  is_fraud    1296675 non-null  int64         
 15  trans_type  1296675 non-null  ca

In [19]:
# List the distinct trans_type codes produced by the category mapping.
df['trans_type'].unique()

[1, 2, 0]
Categories (3, int64): [0, 1, 2]

In [31]:
# Build the customer-location lookup: one row per distinct (city, state) pair.
# drop_duplicates keeps the first occurrence, so each row retains the index of
# the first transaction in which that pair appears.
location_columns = ['city', 'state']
cust_loc = df[location_columns].drop_duplicates()

In [32]:
# Display the distinct (city, state) pairs.
cust_loc

Unnamed: 0,city,state
0,Moravian Falls,NC
1,Orient,WA
2,Malad City,ID
3,Boulder,MT
4,Doe Hill,VA
...,...,...
1186804,Nicholson,PA
1190005,Lockhart,TX
1230734,Moss Point,MS
1258483,Queen Anne,MD


In [34]:
# Nominatim geocoder from geopy, used to look up coordinates for each location.
from geopy.geocoders import Nominatim

# Shared geocoder instance that get_lat_long calls for every lookup.
# NOTE(review): nothing visible here throttles those calls, and the user_agent is
# a generic string. If this targets the public Nominatim service, confirm its
# usage policy is respected (geopy.extra.rate_limiter.RateLimiter may help).
geolocator = Nominatim(user_agent="geoapi")
def get_lat_long(city, state):
    """Geocode a "city, state" pair with the module-level ``geolocator``.

    Args:
        city: City name; interpolated into the query string.
        state: State value; interpolated after the city, comma-separated.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder result, or
        ``(None, None)`` when the geocoder finds no match or the lookup raises.
    """
    try:
        location = geolocator.geocode(f"{city}, {state}")
        if location:
            return location.latitude, location.longitude
        # Falsy result: the geocoder found nothing for this query.
        return None, None
    except Exception as e:
        # Best-effort lookup: one failure must not abort geocoding of the
        # remaining rows. The original referenced an undefined `logger`, so any
        # failure raised NameError from inside this handler instead of being
        # logged; obtain the logger from the already-imported logging module.
        logging.getLogger(__name__).error(f"Error fetching location for {city}, {state}: {e}")
        return None, None

# Geocode every (city, state) row, keep the raw (lat, long) tuples in one
# column, and also split them into separate cust_lat / cust_long columns.
def _geocode_row(row):
    """Look up the coordinates for one cust_loc row."""
    return get_lat_long(row['city'], row['state'])

lat_long_pairs = cust_loc.apply(_geocode_row, axis=1)
cust_loc['lat_long'] = lat_long_pairs
cust_loc[['cust_lat', 'cust_long']] = pd.DataFrame(lat_long_pairs.tolist(), index=cust_loc.index)

In [35]:
# Display the locations together with their geocoded coordinates.
cust_loc

Unnamed: 0,city,state,lat_long,cust_lat,cust_long
0,Moravian Falls,NC,"(36.102686500000004, -81.18158238008564)",36.102687,-81.181582
1,Orient,WA,"(48.8660142, -118.2027814)",48.866014,-118.202781
2,Malad City,ID,"(42.1915872, -112.250798)",42.191587,-112.250798
3,Boulder,MT,"(46.236595, -112.120834)",46.236595,-112.120834
4,Doe Hill,VA,"(38.4320658, -79.4444882)",38.432066,-79.444488
...,...,...,...,...,...
1186804,Nicholson,PA,"(41.6261882, -75.78047)",41.626188,-75.780470
1190005,Lockhart,TX,"(29.8832105, -97.6736292)",29.883211,-97.673629
1230734,Moss Point,MS,"(30.4115881, -88.5344601)",30.411588,-88.534460
1258483,Queen Anne,MD,"(38.919559, -75.9528827)",38.919559,-75.952883


In [36]:
# Save the geocoded locations for future use, since the lookup takes a while to run.
locations_csv = "customer_locations.csv"
cust_loc.to_csv(locations_csv, index=False)
