# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

# Load Datasets

In [2]:
# Folder holding the raw CSV inputs (relative to this notebook)
data_dir = '../data/raw/'

# Build the full path of each input file in one pass
fraud_data_path, ip_country_path, creditcard_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('Fraud_Data.csv', 'IpAddress_to_Country.csv', 'creditcard.csv')
)

# Read every CSV into its own DataFrame, in the same order as the paths above
fraud_df, ip_country_df, creditcard_df = map(
    pd.read_csv,
    (fraud_data_path, ip_country_path, creditcard_path),
)

# Preview Datasets

In [3]:
# Show the first five rows of each dataset, preceded by its file name
for heading, frame in (
    ("Fraud_Data.csv:", fraud_df),
    ("IpAddress_to_Country.csv:", ip_country_df),
    ("creditcard.csv:", creditcard_df),
):
    print(heading)
    display(frame.head())

Fraud_Data.csv:


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


IpAddress_to_Country.csv:


Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


creditcard.csv:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Basic Info and Missing Values

In [4]:
# For every dataset: print its heading, the DataFrame.info() summary
# (dtypes, non-null counts, memory usage) and the per-column count of nulls.
# The headings carry their own leading newline so consecutive reports are
# separated by a blank line, while the first one starts flush.
for heading, frame in (
    ("Fraud_Data info:", fraud_df),
    ("\nIpAddress_to_Country info:", ip_country_df),
    ("\nCreditcard info:", creditcard_df),
):
    print(heading)
    frame.info()
    # sep="" stops print() from inserting a space between the label and the
    # Series repr; with the default separator the first row of the table is
    # shifted one column to the right of the others.
    print("\nMissing values:\n", frame.isnull().sum(), sep="")

Fraud_Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB

Missing values:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address   

# Remove Duplicates

In [5]:
# Rebind each dataset to a copy with fully duplicated rows removed
fraud_df, ip_country_df, creditcard_df = (
    frame.drop_duplicates()
    for frame in (fraud_df, ip_country_df, creditcard_df)
)

# Convert Data Types

In [6]:
# Parse the two timestamp columns of fraud_df from strings into datetime64 values
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

# Save Cleaned Data

In [None]:
# Save cleaned data for next steps (optional)
cleaned_dir = '../data/cleaned/'

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(cleaned_dir, exist_ok=True)

# index=False leaves the row index out of the files.
# Note: CSV stores the parsed signup_time / purchase_time values as text, so
# they need to be converted with pd.to_datetime again after reloading.
fraud_df.to_csv(os.path.join(cleaned_dir, 'fraud_data_cleaned.csv'), index=False)
ip_country_df.to_csv(os.path.join(cleaned_dir, 'ip_country_cleaned.csv'), index=False)
creditcard_df.to_csv(os.path.join(cleaned_dir, 'creditcard_cleaned.csv'), index=False)