In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw transactions CSV into a DataFrame. The path is relative, so the
# file must sit in the notebook's working directory.
df = pd.read_csv('credit_card_fraud_dataset.csv')

In [3]:
# Report the initial size of the dataset and the names of its columns.
print("Informasi Dataset Awal:")
print(f"Jumlah baris: {df.shape[0]}")
print(f"Jumlah kolom: {df.shape[1]}")
print("\nKolom-kolom dataset:")
print(list(df.columns))

Informasi Dataset Awal:
Jumlah baris: 100000
Jumlah kolom: 7

Kolom-kolom dataset:
['TransactionID', 'TransactionDate', 'Amount', 'MerchantID', 'TransactionType', 'Location', 'IsFraud']


In [4]:
# Preview the first five rows of the raw data (plain-text rendering).
print(df.head())

   TransactionID             TransactionDate   Amount  MerchantID  \
0              1  2024-04-03 14:15:35.462794  4189.27         688   
1              2  2024-03-19 13:20:35.462824  2659.71         109   
2              3  2024-01-08 10:08:35.462834   784.00         394   
3              4  2024-04-13 23:50:35.462850  3514.40         944   
4              5  2024-07-12 18:51:35.462858   369.07         475   

  TransactionType      Location  IsFraud  
0          refund   San Antonio        0  
1          refund        Dallas        0  
2        purchase      New York        0  
3        purchase  Philadelphia        0  
4        purchase       Phoenix        0  


In [5]:
# Count missing values in every column.
print(df.isna().sum())

TransactionID      0
TransactionDate    0
Amount             0
MerchantID         0
TransactionType    0
Location           0
IsFraud            0
dtype: int64


In [6]:
# Show the dtype pandas inferred for each column when reading the CSV.
print(df.dtypes)

TransactionID        int64
TransactionDate     object
Amount             float64
MerchantID           int64
TransactionType     object
Location            object
IsFraud              int64
dtype: object


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
print(df.describe())

       TransactionID         Amount     MerchantID        IsFraud
count  100000.000000  100000.000000  100000.000000  100000.000000
mean    50000.500000    2497.092666     501.676070       0.010000
std     28867.657797    1442.415999     288.715868       0.099499
min         1.000000       1.050000       1.000000       0.000000
25%     25000.750000    1247.955000     252.000000       0.000000
50%     50000.500000    2496.500000     503.000000       0.000000
75%     75000.250000    3743.592500     753.000000       0.000000
max    100000.000000    4999.770000    1000.000000       1.000000


In [8]:
# Convert the TransactionDate strings to pandas datetimes so the `.dt`
# accessor can be used for the calendar features derived below.
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])

In [9]:
# Split the timestamp into a calendar-date column and a clock-time column.
# Both hold plain Python date/time objects (object dtype).
df['Date'], df['Time'] = (
    df['TransactionDate'].dt.date,
    df['TransactionDate'].dt.time,
)

In [10]:
# Derive calendar features from the transaction timestamp. Columns are added
# in the order Month, Year, DayOfWeek, Hour.
df = df.assign(
    Month=df['TransactionDate'].dt.month,
    Year=df['TransactionDate'].dt.year,
    DayOfWeek=df['TransactionDate'].dt.day_name(),
    Hour=df['TransactionDate'].dt.hour,
)

In [11]:
# Preview the first five rows again, now including the derived date/time columns.
print(df.head())

   TransactionID            TransactionDate   Amount  MerchantID  \
0              1 2024-04-03 14:15:35.462794  4189.27         688   
1              2 2024-03-19 13:20:35.462824  2659.71         109   
2              3 2024-01-08 10:08:35.462834   784.00         394   
3              4 2024-04-13 23:50:35.462850  3514.40         944   
4              5 2024-07-12 18:51:35.462858   369.07         475   

  TransactionType      Location  IsFraud        Date             Time  Month  \
0          refund   San Antonio        0  2024-04-03  14:15:35.462794      4   
1          refund        Dallas        0  2024-03-19  13:20:35.462824      3   
2        purchase      New York        0  2024-01-08  10:08:35.462834      1   
3        purchase  Philadelphia        0  2024-04-13  23:50:35.462850      4   
4        purchase       Phoenix        0  2024-07-12  18:51:35.462858      7   

   Year  DayOfWeek  Hour  
0  2024  Wednesday    14  
1  2024    Tuesday    13  
2  2024     Monday    10  
3 

In [12]:
# Print column dtypes, non-null counts and memory usage for the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   TransactionID    100000 non-null  int64         
 1   TransactionDate  100000 non-null  datetime64[ns]
 2   Amount           100000 non-null  float64       
 3   MerchantID       100000 non-null  int64         
 4   TransactionType  100000 non-null  object        
 5   Location         100000 non-null  object        
 6   IsFraud          100000 non-null  int64         
 7   Date             100000 non-null  object        
 8   Time             100000 non-null  object        
 9   Month            100000 non-null  int32         
 10  Year             100000 non-null  int32         
 11  DayOfWeek        100000 non-null  object        
 12  Hour             100000 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(5)
memory usag

In [13]:
def get_time_period(hour):
    """Map an hour of the day to a named period of the day.

    Args:
        hour: Hour value, compared numerically against the bounds below.

    Returns:
        'Pagi' for 6 <= hour < 12, 'Siang' for 12 <= hour < 18,
        'Malam' for 18 <= hour < 24, and 'Dini Hari' for every other
        value (including hours 0-5 and anything outside 0-23).
    """
    # Half-open [start, stop) windows, checked in order.
    windows = (
        (6, 12, 'Pagi'),
        (12, 18, 'Siang'),
        (18, 24, 'Malam'),
    )
    for start, stop, label in windows:
        if start <= hour < stop:
            return label
    return 'Dini Hari'

# Label every transaction with its period of the day, based on the Hour column.
df['TimePeriod'] = df['Hour'].map(get_time_period)

In [14]:
# Flag weekend transactions: 1 when DayOfWeek is Saturday or Sunday, else 0.
# A vectorised membership test replaces the per-row Python lambda; the cast to
# int64 keeps the 0/1 integer column the lambda produced.
df['IsWeekend'] = df['DayOfWeek'].isin(['Saturday', 'Sunday']).astype('int64')

In [15]:
# Mean transaction amount of each row's group, broadcast back to row level.
# NOTE: despite the variable name, the grouping key is MerchantID, so this is
# a per-merchant average.
customer_avg_amount = df['Amount'].groupby(df['MerchantID']).transform('mean')
df['AvgTransactionAmount'] = customer_avg_amount

# How large each transaction is relative to that group average.
df['AmountRatio'] = df['Amount'].div(df['AvgTransactionAmount'])

In [16]:
# Seconds elapsed since the previous transaction with the same MerchantID.
#
# The frame is not in chronological order, so the rows are sorted by timestamp
# before differencing; without the sort, diff() subtracts whichever row of the
# group happens to precede the current one and yields negative or meaningless
# gaps. The result is assigned back by index label, so df keeps its existing
# row order.
#
# The first transaction in each group has no predecessor and is filled with 0.
df['TimeDiff'] = (
    df.sort_values('TransactionDate', kind='stable')
      .groupby('MerchantID')['TransactionDate']
      .diff()
      .dt.total_seconds()
      .fillna(0)
)

In [17]:
# Number of transactions per MerchantID per calendar day, and per calendar month.
daily_transactions = (
    df.groupby(['MerchantID', 'Date'])
      .size()
      .reset_index(name='DailyTransactions')
)
monthly_transactions = (
    df.groupby(['MerchantID', 'Year', 'Month'])
      .size()
      .reset_index(name='MonthlyTransactions')
)

# Make the cell safe to re-run: if the count columns already exist from an
# earlier execution, merging again would produce *_x / *_y duplicates.
df = df.drop(columns=['DailyTransactions', 'MonthlyTransactions'], errors='ignore')

# Attach the counts to every transaction row. The lookup tables hold one row
# per key, so `validate` makes pandas raise instead of silently duplicating
# transactions if that ever stops being true.
df = pd.merge(
    df, daily_transactions,
    on=['MerchantID', 'Date'], how='left', validate='many_to_one',
)
df = pd.merge(
    df, monthly_transactions,
    on=['MerchantID', 'Year', 'Month'], how='left', validate='many_to_one',
)

In [18]:
# Preview the first five rows with all engineered feature columns attached.
print(df.head())

   TransactionID            TransactionDate   Amount  MerchantID  \
0              1 2024-04-03 14:15:35.462794  4189.27         688   
1              2 2024-03-19 13:20:35.462824  2659.71         109   
2              3 2024-01-08 10:08:35.462834   784.00         394   
3              4 2024-04-13 23:50:35.462850  3514.40         944   
4              5 2024-07-12 18:51:35.462858   369.07         475   

  TransactionType      Location  IsFraud        Date             Time  Month  \
0          refund   San Antonio        0  2024-04-03  14:15:35.462794      4   
1          refund        Dallas        0  2024-03-19  13:20:35.462824      3   
2        purchase      New York        0  2024-01-08  10:08:35.462834      1   
3        purchase  Philadelphia        0  2024-04-13  23:50:35.462850      4   
4        purchase       Phoenix        0  2024-07-12  18:51:35.462858      7   

   Year  DayOfWeek  Hour TimePeriod  IsWeekend  AvgTransactionAmount  \
0  2024  Wednesday    14      Siang   

In [19]:
# Write the engineered frame to CSV in the working directory, omitting the
# DataFrame index column.
df.to_csv('credit_card_fraud.cleaning.csv', index=False)