In [330]:
import pandas as pd

In [331]:
df = pd.read_csv('credit_card_fraud_dataset.csv')

In [332]:
df

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.00,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.40,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0
...,...,...,...,...,...,...,...
99995,99996,2024-06-07 00:57:36.027591,1057.29,289,refund,San Antonio,0
99996,99997,2023-10-22 23:12:36.027594,297.25,745,refund,San Antonio,0
99997,99998,2024-05-31 19:27:36.027597,3448.56,690,purchase,San Antonio,0
99998,99999,2024-10-18 09:43:36.027601,3750.79,644,purchase,Philadelphia,0


In [333]:
df = df.drop(columns=['TransactionDate'])


In [334]:
df

Unnamed: 0,TransactionID,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,4189.27,688,refund,San Antonio,0
1,2,2659.71,109,refund,Dallas,0
2,3,784.00,394,purchase,New York,0
3,4,3514.40,944,purchase,Philadelphia,0
4,5,369.07,475,purchase,Phoenix,0
...,...,...,...,...,...,...
99995,99996,1057.29,289,refund,San Antonio,0
99996,99997,297.25,745,refund,San Antonio,0
99997,99998,3448.56,690,purchase,San Antonio,0
99998,99999,3750.79,644,purchase,Philadelphia,0


In [335]:
fraud_counts = df['IsFraud'].value_counts().sort_index()

# Calculate total
total = fraud_counts.sum()

for val in [0, 1]:
    count = fraud_counts.get(val, 0)
    percentage = (count / total) * 100
    print(f"Value {val}: Count = {count}, Percentage = {percentage:.2f}%")

Value 0: Count = 99000, Percentage = 99.00%
Value 1: Count = 1000, Percentage = 1.00%


In [336]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [337]:

# Step 2: Drop 'TransactionID' (not useful for modeling)
df.drop(columns=['TransactionID'], inplace=True)

In [338]:
# Step 3: Handle missing values
df.dropna(inplace=True)

In [339]:
# Step 4: Encode Categorical Features
categorical_cols = ['TransactionType', 'Location']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [340]:
from sklearn.feature_selection import f_classif
import pandas as pd

# Step 1: Subset the dataframe to include only the numeric columns for features and the target 'IsFraud'
X = df.select_dtypes(include=['number']).drop(columns=['IsFraud'])  # Select only numeric columns and drop 'IsFraud'
y = df['IsFraud']  # Target variable

# Step 2: Apply ANOVA F-test for classification
f_values, p_values = f_classif(X, y)

# Step 3: Create a DataFrame to display the results
anova_results = pd.DataFrame({
    'Feature': X.columns,  # Use the columns of X (features)
    'F-Value': f_values,
    'P-Value': p_values
})

# Step 4: Print results
print(anova_results)

# Step 5: Optional: Filter features based on p-value (e.g., p < 0.05 means significant)
significant_features = anova_results[anova_results['P-Value'] < 0.05]
print("\nSignificant features (p < 0.05):")
print(significant_features)



           Feature   F-Value   P-Value
0           Amount  0.195592  0.658303
1       MerchantID  0.133963  0.714359
2  TransactionType  0.130811  0.717593
3         Location  0.019693  0.888399

Significant features (p < 0.05):
Empty DataFrame
Columns: [Feature, F-Value, P-Value]
Index: []


In [341]:
df

Unnamed: 0,Amount,MerchantID,TransactionType,Location,IsFraud
0,4189.27,688,1,7,0
1,2659.71,109,1,1,0
2,784.00,394,0,4,0
3,3514.40,944,0,5,0
4,369.07,475,0,6,0
...,...,...,...,...,...
99995,1057.29,289,1,7,0
99996,297.25,745,1,7,0
99997,3448.56,690,0,7,0
99998,3750.79,644,0,5,0


In [342]:
# Step 5: seperation for evaluation
X = df.drop(columns=['IsFraud'])  
y = df['IsFraud'] 

In [343]:
# Step 2: Check the shape of your data (X, y)
print(f"Initial data shape: X: {X.shape}, y: {y.shape}")

Initial data shape: X: (100000, 4), y: (100000,)


In [344]:
from sklearn.model_selection import train_test_split
# Step 3: Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [345]:
# Step 4: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [346]:
X_train.shape

(70000, 4)

In [347]:
X_test.shape

(30000, 4)

In [348]:
# Step 5: Train an Isolation Forest 
model = IsolationForest(n_estimators=200, max_samples=0.8, contamination=0.01, random_state=42)
model.fit(X_train_scaled)

In [349]:
# Step 6: Make predictions
y_pred = model.predict(X_test_scaled)

In [350]:
# Step 8: Convert prediction to 0/1 (1=fraud, -1=normal)
y_pred_binary = [1 if val == -1 else 0 for val in y_pred]

In [351]:
X_test['PredictedIsFraud'] = y_pred_binary
X_test['ActualIsFraud'] = y_test.values  # Add the actual 'IsFraud' values from y_test

# Step 9: Display the rows with both the actual and predicted values for each record
print(X_test[['Amount', 'MerchantID', 'TransactionType', 'Location', 'ActualIsFraud', 'PredictedIsFraud']].head(10))

        Amount  MerchantID  TransactionType  Location  ActualIsFraud  \
75721  4031.22         503                1         6              1   
80184  3018.81         378                0         8              0   
19864   163.82         406                1         8              0   
76699  2548.43         525                1         0              0   
92991  3292.04         177                0         0              0   
76434  1389.96         517                0         3              0   
84004   323.51         375                0         6              0   
80917  2387.18         216                1         7              0   
60767  2483.98         937                1         8              0   
50074  3371.18         192                0         4              0   

       PredictedIsFraud  
75721                 0  
80184                 0  
19864                 0  
76699                 0  
92991                 0  
76434                 0  
84004                 0  

In [352]:
predicted_fraud_counts = X_test['PredictedIsFraud'].value_counts()

# Print the result
print("\nCount of actual fraud values (0/1) in 'IsFraud':")
print(fraud_counts)

print("\nCount of predicted fraud values (0/1) in 'PredictedIsFraud':")
print(predicted_fraud_counts)


Count of actual fraud values (0/1) in 'IsFraud':
IsFraud
0    99000
1     1000
Name: count, dtype: int64

Count of predicted fraud values (0/1) in 'PredictedIsFraud':
PredictedIsFraud
0    29612
1      388
Name: count, dtype: int64


In [353]:
# Step 8: Ensure consistent lengths between y_test and y_pred_binary
print(f"Length of y_test: {len(y_test)}")
print(f"Length of y_pred_binary: {len(y_pred_binary)}")

# Ensure that y_test and y_pred_binary have the same length
if len(y_test) != len(y_pred_binary):
    raise ValueError(f"Length mismatch: y_test length is {len(y_test)} but y_pred_binary length is {len(y_pred_binary)}")

Length of y_test: 30000
Length of y_pred_binary: 30000


In [354]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train the Isolation Forest model
model = IsolationForest(random_state=42).fit(X_train_scaled)

# Make predictions and convert from -1 (outlier) and 1 (normal) to 0 (normal) and 1 (fraud)
y_train_pred_binary = [1 if val == -1 else 0 for val in model.predict(X_train_scaled)]
y_test_pred_binary = [1 if val == -1 else 0 for val in model.predict(X_test_scaled)]

# Function to calculate and print metrics
def print_metrics(y_true, y_pred, dataset_name):
    print(f"{dataset_name} Metrics:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}\n")

# Print metrics for both train and test sets
print_metrics(y_train, y_train_pred_binary, "Training")
print_metrics(y_test, y_test_pred_binary, "Testing")



Training Metrics:
Accuracy: 0.1676
Precision: 0.0099
Recall: 0.8513
F1 Score: 0.0197

Testing Metrics:
Accuracy: 0.1645
Precision: 0.0097
Recall: 0.7771
F1 Score: 0.0191



In [355]:
from sklearn.metrics import confusion_matrix

# Assuming y_test and y_pred_binary are already defined

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_test_pred_binary)

# Display the confusion matrix
print(f"Confusion Matrix:\n{cm}")


Confusion Matrix:
[[ 4691 24995]
 [   70   244]]
