# *Data Source*

### The dataset for this project is taken from Kaggle. It contains historical information about fraudulent transactions, which can be used to detect fraud in online payments: https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset?select=PS_20174392719_1491204439457_log.csv

# *Define file path*

In [4]:
# Location of the zipped dataset. The path can be overridden without editing
# the notebook by setting the ONLINE_FRAUD_ZIP environment variable; when the
# variable is unset, the original local download path is used.
import os

DATA_FILE = os.environ.get(
    "ONLINE_FRAUD_ZIP",
    r"C:\Users\GUNA\Downloads\onlinefraud.csv.zip",  # raw string keeps the backslashes literal
)


# Load the Dataset from ZIP

In [5]:
import pandas as pd
import zipfile
import os

# Step 1: Function to load the dataset from a ZIP file
def import_data_from_zip(zip_path, csv_filename):
    """
    Extract a CSV member from a ZIP archive and load it into a DataFrame.

    The member is extracted into the current working directory (the
    extracted file is left on disk) and then parsed with ``pd.read_csv``.

    Parameters
    ----------
    zip_path : str or path-like
        Path to the ZIP archive.
    csv_filename : str
        Name of the CSV member inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV member.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist, or if it contains no member named
        ``csv_filename``.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as archive:
            try:
                # extract() returns the path it actually wrote to, which can
                # differ from csv_filename because member names are sanitised.
                extracted_path = archive.extract(csv_filename)
            except KeyError as missing_member:
                # zipfile reports an unknown member as KeyError; translate it
                # so callers get the documented FileNotFoundError instead.
                raise FileNotFoundError(
                    f"'{csv_filename}' was not found inside '{zip_path}'"
                ) from missing_member
        dataframe = pd.read_csv(extracted_path)  # Load the extracted CSV file
    except FileNotFoundError as error:
        print(f"Error: {error}. Ensure the dataset is located at the provided path.")
        raise
    print("Dataset loaded successfully.")
    return dataframe

# Read the full transaction table out of the ZIP archive.
# Replace csv_filename with the actual CSV name inside the ZIP if different.
dataframe = import_data_from_zip(
    zip_path=DATA_FILE,
    csv_filename="onlinefraud.csv",
)


Dataset loaded successfully.


# *Create a Balanced Random Sample*

In [8]:
# Step 2: Function to generate a balanced random sample from the dataset
def generate_balanced_sample(df, num_samples_per_class, output_filename, random_state=42):
    """
    Write a class-balanced random sample of the transactions to a CSV file.

    An equal number of rows is drawn (without replacement) from the rows
    where ``isFraud == 1`` and the rows where ``isFraud == 0``. The two draws
    are concatenated, shuffled, re-indexed from 0 and saved without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain an ``isFraud`` column.
    num_samples_per_class : int
        Number of rows to draw from each class.
    output_filename : str or path-like
        Destination CSV file.
    random_state : int, optional
        Seed used for both draws and for the shuffle. Defaults to 42, the
        value that was previously hard-coded.

    Raises
    ------
    ValueError
        If either class has fewer than ``num_samples_per_class`` rows. The
        check runs before sampling, so no file is written in that case.
    """
    fraud_rows = df[df['isFraud'] == 1]
    non_fraud_rows = df[df['isFraud'] == 0]

    # Fail early with a message that names the class that is too small,
    # rather than relying on the generic error raised by DataFrame.sample.
    for label, rows in ((1, fraud_rows), (0, non_fraud_rows)):
        if len(rows) < num_samples_per_class:
            raise ValueError(
                f"Cannot draw {num_samples_per_class} rows per class: only "
                f"{len(rows)} rows with isFraud == {label} are available."
            )

    # Sample fraud and non-fraud transactions
    fraud_samples = fraud_rows.sample(n=num_samples_per_class, random_state=random_state)
    non_fraud_samples = non_fraud_rows.sample(n=num_samples_per_class, random_state=random_state)

    # Combine and shuffle the samples
    balanced_sample = (
        pd.concat([fraud_samples, non_fraud_samples])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Save the balanced sample to a CSV file
    balanced_sample.to_csv(output_filename, index=False)


# *Function Call to Create Balanced Sample*

In [9]:
# Write the balanced random sample to disk.
# NOTE(review): 8213 rows per class is presumably the number of fraud rows in
# the full dataset — confirm against dataframe['isFraud'].sum().
generate_balanced_sample(
    df=dataframe,
    num_samples_per_class=8213,
    output_filename="Balanced_Online_Payment_Sample.csv",
)


# Basic Data Overview 

In [10]:
# Step 3: Perform basic data overview
def data_overview(df):
    """
    Print a quick structural and statistical summary of a DataFrame.

    Three sections are written to standard output: the shape, the
    ``df.info()`` report, and the ``df.describe()`` table.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
    """
    print("DataFrame Shape:", df.shape)
    print("\nDataFrame Info:")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line after the report.
    df.info()
    print("\nBasic Statistics:")
    print(df.describe())
    
# Summarise the full dataset (shape, dtypes, descriptive statistics)
data_overview(df=dataframe)


DataFrame Shape: (6362620, 11)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None

Basic Statistics:
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04 