In [1]:
# 1. Data Preparation:

# a. Load the dataset, and provide an overview of the available features, including transaction

# details, customer information, and labels (fraudulent or non-fraudulent).

import pandas as pd

# Load the dataset
data = pd.read_csv('card_transdata.csv')

# Display a summary of the dataset
data

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


In [2]:
# b. Describe the class distribution of fraudulent and non-fraudulent transactions and discuss the imbalance issue.

# Count the number of fraudulent and non-fraudulent transactions
class_distribution = data['fraud'].value_counts()
print(class_distribution)


0.0    912597
1.0     87403
Name: fraud, dtype: int64


In [3]:
# 2. Initial Logistic Regression Model:

# a. Implement a basic logistic regression model using the raw dataset.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into features (X) and the target variable (y)
X = data.drop(columns=['fraud'])
y = data['fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)


In [4]:
# b. Evaluate the model's performance using standard metrics like accuracy, precision, recall, and F1-score.

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.96
Precision: 0.89
Recall: 0.60
F1 Score: 0.72


In [11]:
# 3. Feature Engineering:

# a. Apply feature engineering techniques to enhance the predictive power of the model. These techniques may include:

# - Creating new features.

# - Scaling or normalizing features.

# - Handling missing values.

# - Encoding categorical variables.

# Example: Create a new feature for transaction frequency
data['transaction_frequency'] = data.groupby('distance_from_home')['distance_from_last_transaction'].transform('count')

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data['normalized_amount'] = scaler.fit_transform(data['distance_from_home'].values.reshape(-1, 1))

In [13]:
# Check the column names in your DataFrame
print(data.columns)

# Replace 'missing_column' with the actual column name
column_name = 'repeat_retailer'  # Replace with the correct column name
data[column_name].fillna(data[column_name].mean(), inplace=True)

Index(['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip',
       'used_pin_number', 'online_order', 'fraud', 'transaction_frequency',
       'normalized_amount'],
      dtype='object')


In [14]:
# Example: One-hot encoding a categorical variable
data = pd.get_dummies(data, columns=['used_chip'], drop_first=True)

In [9]:
# 4. Handling Imbalanced Data:

# a. Discuss the challenges associated with imbalanced datasets in the context of fraud detection.

from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [4]:
# 5. Logistic Regression with Feature-Engineered Data:

# a. Train a logistic regression model using the feature-engineered dataset and the methods for handling imbalanced data.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(solver='lbfgs', max_iter=1000)  # You can try different solvers
model.fit(X_train_scaled, y_resampled)
y_pred = model.predict(X_test_scaled)

NameError: name 'X_resampled' is not defined

In [10]:
6. Model Interpretation:

a. Interpret the coefficients of the logistic regression model and discuss which features have the most influence on fraud detection.

coefficients = model.coef_
feature_names = X.columns

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients[0]})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

print(coef_df)


                          Feature  Coefficient
6                    online_order     6.636253
2  ratio_to_median_purchase_price     0.858778
1  distance_from_last_transaction     0.025417
0              distance_from_home     0.015107
3                 repeat_retailer    -0.613088
4                       used_chip    -1.039363
5                 used_pin_number   -13.213527
