# Importing the libraries

In [None]:
import tensorflow as tf
import pandas as pd

In [None]:
# Show which TensorFlow version is installed in this environment.
print(tf.__version__)

# Importing the dataset

In [None]:
# Load the credit-card dataset from CSV, keeping every column.
dataset = pd.read_csv("Dataset/credit_card.csv")
# Display the loaded DataFrame as the cell output.
dataset

# Data Exploration and Understanding

In [None]:
# Inspect the dtype pandas assigned to each column of the raw dataset.
dataset.dtypes

In [None]:
# Summary statistics for the raw dataset (describe() with its default arguments).
dataset.describe()

In [None]:
# Summary statistics for the 'merchant' column.
dataset['merchant'].describe()

In [None]:
# Summary statistics for the 'category' column.
dataset['category'].describe()

In [None]:
# Summary statistics for the 'city' column.
dataset['city'].describe()

In [None]:
# Summary statistics for the 'state' column.
dataset['state'].describe()

In [None]:
# Summary statistics for the 'job' column.
dataset['job'].describe()

In [None]:
# List the distinct values present in the 'category' column, returned as an array.
dataset['category'].unique()

# Data Cleaning

### (1) Removing irrelevant columns

In [None]:
# Columns to carry forward; every column not listed here is discarded.
desired_columns = [
    'trans_date_trans_time',
    'merchant',
    'category',
    'amt',
    'state',
    'city',
    'city_pop',
    'job',
    'dob',
    'is_fraud',
]

# Select just those columns from the raw dataset.
filtered = dataset[desired_columns]

# Display the reduced DataFrame as the cell output.
filtered

In [None]:
# Alternative (not used): drop specific columns instead of selecting the ones to keep.
# filtered = dataset.drop(['lat', 'long', 'trans_num', 'merch_long', 'merch_lat'], axis=1)
# filtered

### (2) Handling missing data

In [None]:
# Use info() to get an overview of the columns, including which ones contain nulls.
filtered.info()

In [None]:
# Count the null values in each column of the filtered dataset.
filtered.isnull().sum()

### (3) Handling duplicate data

In [None]:
# Collect every row that duplicates another row in the frame.
duplicate_rows = filtered.loc[filtered.duplicated()]

# Report the duplicates: a header line followed by the rows themselves.
print("Duplicate Rows:", duplicate_rows, sep="\n")

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
# drop_duplicates() returns a new DataFrame instead of modifying the caller,
# so the result must be assigned back; otherwise the duplicates stay in
# 'filtered' and flow into every later step.
filtered = filtered.drop_duplicates()

# Display the de-duplicated DataFrame as the cell output.
filtered

# Feature Engineering

### (1) Extracting date and time from 'trans_date_trans_time'

In [None]:
# Convert the 'trans_date_trans_time' column to a datetime dtype.
# assign() builds a fresh DataFrame, so 'filtered' is no longer tied to the
# frame it was selected from.
filtered = filtered.assign(
    trans_date_trans_time=pd.to_datetime(filtered['trans_date_trans_time'])
)

In [None]:
# Derive separate date and time columns from the parsed timestamp;
# the new columns are appended after the existing ones.
filtered = filtered.assign(
    trans_date=filtered['trans_date_trans_time'].dt.date,
    trans_time=filtered['trans_date_trans_time'].dt.time,
)

In [None]:
# Print the DataFrame to check the new 'trans_date' and 'trans_time' columns.
print(filtered)

In [None]:
# Final column set: the combined 'trans_date_trans_time' column is left out
# in favour of the separate 'trans_date' and 'trans_time' columns.
desired_columns = [
    'trans_date',
    'trans_time',
    'amt',
    'merchant',
    'category',
    'job',
    'dob',
    'state',
    'city',
    'city_pop',
    'is_fraud',
]

# Keep only those columns, in the order listed above.
filtered = filtered[desired_columns]

# Display 'filtered', which now holds just the selected columns.
filtered

### (2) Encoding categorical variables

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value.
# NOTE(review): columns with many distinct values will make the result very
# wide — confirm the cardinality of city/merchant/job is acceptable.
data = pd.get_dummies(
    filtered,
    columns=['city', 'state', 'job', 'merchant', 'category'],
)

# Display the encoded DataFrame as the cell output.
data

# Formatting data

In [None]:
# Old column name -> more descriptive new name.
column_mapping = {
    'amt': 'amount',
    'city_pop': 'city_population',
}

# Apply the renames; columns not named in the mapping are left unchanged.
data = data.rename(columns=column_mapping)

# Display 'data', which now carries the new column names.
data

In [None]:
# Inspect the dtype of each column after encoding and renaming.
data.dtypes

In [None]:
# Save the cleaned dataset to CSV; index=False leaves the row index out of the file.
data.to_csv("cleaned_dataset.csv",index=False)

# Handling Class Imbalance (using oversampling)

In [None]:
# Balance the classes by randomly re-sampling rows of the minority class.
from imblearn.over_sampling import RandomOverSampler

# Separate the feature columns from the 'is_fraud' label.
X = data.drop(columns=['is_fraud'])
y = data['is_fraud']

# A fixed random_state makes the resampled rows identical from run to run;
# without it every execution of the notebook yields a different dataset.
# NOTE(review): if a train/test split follows, oversample only the training
# portion — duplicating rows before the split puts copies of training rows
# into the test set.
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)