# Credit Card Fraud Detection Project

In [None]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score


## Data Import

In [2]:

# Load the datasets
file_paths = {
    'transactions': 'credit_card_transaction_data_de.parquet',
    'users': 'credit_card_users_de.parquet',
    'cards': 'sd254_cards_de.parquet'
}

# Read the Parquet files
transactions_df = pd.read_parquet(file_paths['transactions'])
users_df = pd.read_parquet(file_paths['users'])
cards_df = pd.read_parquet(file_paths['cards'])

# Display the first few rows of each dataframe to understand their structure
dfs_preview = {
    'Transactions Data': transactions_df.head(),
    'Users Data': users_df.head(),
    'Cards Data': cards_df.head()
}

dfs_preview

{'Transactions Data':       User  Card  Year  Month  Day   Time  Amount          Use Chip  \
 4136     0     0  2016      1    3  10:48  $66.48  Chip Transaction   
 4137     0     0  2016      1    4  06:43  $40.02  Chip Transaction   
 4138     0     0  2016      1    7  09:30  $54.11  Chip Transaction   
 4139     0     0  2016      1    7  16:03  $89.48  Chip Transaction   
 4140     0     0  2016      1   10  06:38  $29.15  Chip Transaction   
 
             Merchant Name  Merchant City Merchant State      Zip   MCC  \
 4136 -3345936507911876459       La Verne             CA  91750.0  7538   
 4137   -34551508091458520       La Verne             CA  91750.0  5912   
 4138  4055257078481058705       La Verne             CA  91750.0  7538   
 4139  3414527459579106770  Monterey Park             CA  91754.0  5651   
 4140 -5475680618560174533  Monterey Park             CA  91755.0  5942   
 
      Errors? Is Fraud?  
 4136    None        No  
 4137    None        No  
 4138    None  

## Data Quality Assessment for Transactions Data

In [3]:
transactions_quality = {
    "Data Types": transactions_df.dtypes,
    "Missing Values": transactions_df.isnull().sum(),
    "Unique Values": transactions_df.nunique()
}

# Statistical Summary for Transactions Data
transactions_stats = transactions_df.describe(include='all', datetime_is_numeric=True)

transactions_quality, transactions_stats


({'Data Types': User                int64
  Card                int64
  Year                int64
  Month               int64
  Day                 int64
  Time               object
  Amount             object
  Use Chip           object
  Merchant Name       int64
  Merchant City      object
  Merchant State     object
  Zip               float64
  MCC                 int64
  Errors?            object
  Is Fraud?          object
  dtype: object,
  'Missing Values': User                    0
  Card                    0
  Year                    0
  Month                   0
  Day                     0
  Time                    0
  Amount                  0
  Use Chip                0
  Merchant Name           0
  Merchant City           0
  Merchant State     860764
  Zip                907586
  MCC                     0
  Errors?           6768768
  Is Fraud?               0
  dtype: int64,
  'Unique Values': User               1610
  Card                  9
  Year                  4


### Statistical Summary

In [5]:
# Statistical Summary for Numerical Columns
numerical_stats = transactions_df.describe()

# Statistical Summary for Categorical Columns
categorical_columns = transactions_df.select_dtypes(include=['object', 'category']).columns
categorical_stats = transactions_df[categorical_columns].describe()

print("\nNumerical Stats:\n", numerical_stats)
print("\nCategorical Stats:\n", categorical_stats)



Numerical Stats:
                User          Card          Year         Month           Day  \
count  6.877837e+06  6.877837e+06  6.877837e+06  6.877837e+06  6.877837e+06   
mean   1.006567e+03  1.279854e+00  2.017503e+03  6.528485e+00  1.572455e+01   
std    5.703017e+02  1.367238e+00  1.117242e+00  3.446495e+00  8.796571e+00   
min    0.000000e+00  0.000000e+00  2.016000e+03  1.000000e+00  1.000000e+00   
25%    5.170000e+02  0.000000e+00  2.017000e+03  4.000000e+00  8.000000e+00   
50%    1.011000e+03  1.000000e+00  2.018000e+03  7.000000e+00  1.600000e+01   
75%    1.487000e+03  2.000000e+00  2.019000e+03  1.000000e+01  2.300000e+01   
max    1.999000e+03  8.000000e+00  2.019000e+03  1.200000e+01  3.100000e+01   

       Merchant Name           Zip           MCC  
count   6.877837e+06  5.970251e+06  6.877837e+06  
mean   -4.896856e+17  5.096169e+04  5.558895e+03  
std     4.765167e+18  2.940545e+04  8.811638e+02  
min    -9.222899e+18  1.001000e+03  1.711000e+03  
25%    -4.5306

### Handling Imbalanced Data

In [9]:
# Defining your features and target variable
X = transactions_df.drop('Is Fraud?', axis=1)  # Assuming you've already encoded categorical variables
y = transactions_df['Is Fraud?'].map({'Yes': 1, 'No': 0})  # Converting to binary

# Applying undersampling
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Now, X_resampled and y_resampled can be used for model training


In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Imputers for numerical and categorical data
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Create transformers for categorical and numerical data
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95))  # Keep 95% of variance
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a preprocessing and modeling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Fit the model with the resampled training data
model.fit(X_train, y_train)

# Predictions and evaluations using the test data
y_pred = model.predict(X_test)


In [11]:
# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2561
           1       0.93      0.94      0.94      2487

    accuracy                           0.94      5048
   macro avg       0.94      0.94      0.94      5048
weighted avg       0.94      0.94      0.94      5048

[[2396  165]
 [ 150 2337]]


## Model Tuning

In [13]:
# Define the parameter grid
param_grid = {
    'preprocessor__num__pca__n_components': [0.95, 0.99],  # Percentage of variance to keep
    'classifier__n_estimators': [100, 200],  # Number of trees in the forest
    'classifier__max_depth': [None, 10, 20]  # Maximum depth of the tree
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='recall', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Use best_model for predictions and further analysis
y_pred = best_model.predict(X_test)

# Evaluation Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 100, 'preprocessor__num__pca__n_components': 0.99}
Accuracy: 0.9072900158478605
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.91      2561
           1       0.88      0.94      0.91      2487

    accuracy                           0.91      5048
   macro avg       0.91      0.91      0.91      5048
weighted avg       0.91      0.91      0.91      5048

Confusion Matrix:
[[2239  322]
 [ 146 2341]]


## Feature Importance

In [14]:
importances = best_model.named_steps['classifier'].feature_importances_

# You might need to adjust the following to properly align with your preprocessed feature names
feature_names = best_model.named_steps['preprocessor'].transformers_[0][2] + \
                best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out().tolist()

feature_importances = pd.Series(importances, index=feature_names)
print(feature_importances.sort_values(ascending=False).head(10))  # Top 10 features


Merchant City_Rome             0.075669
Merchant State_None            0.061567
Use Chip_Chip Transaction      0.047838
Year                           0.046680
Merchant State_TX              0.044045
Use Chip_Online Transaction    0.041655
Merchant State_CA              0.034845
Merchant State_Italy           0.030111
Merchant City_ONLINE           0.028727
Merchant State_FL              0.025445
dtype: float64


## Cross Validation

In [15]:

# Perform cross-validation
cv_scores = cross_val_score(best_model, X_resampled, y_resampled, cv=5, scoring='recall')
print("CV Recall Scores:", cv_scores)


CV Recall Scores: [0.9411415  0.93103448 0.93701723 0.9275104  0.9411415 ]


### Interpretation of Feature Importance
- **Merchant City_Rome and Merchant State_None**: A high importance score for specific locations (like Rome) and transactions without a specified state suggests that the location of the transaction, or the lack thereof, is a significant factor in predicting fraud.
- **Use Chip_Chip Transaction and Use Chip_Online Transaction**: The method of transaction (whether it's a chip transaction or an online transaction) also plays a crucial role. This could indicate different patterns of fraud in physical versus online transactions.
- **Year**: The year of the transaction is significant, which might suggest evolving patterns of fraud over time.
- **Merchant State_TX, CA, Italy, FL**: Specific states and countries appearing as important features indicate that transactions from these locations have distinct patterns that influence the model's fraud prediction.

### Next Steps
1. **Deep Dive into Key Features**:
   - Conduct a more detailed analysis of the top features to understand their specific relationship with fraudulent transactions. For example, explore why transactions in Rome or without a specified state are more likely to be fraudulent.

2. **Model Refinement**:
   - Based on these insights, consider refining your model. For instance, you might develop specialized models for certain high-risk locations or transaction types.
   - Experiment with excluding or giving more focus to certain features and observe the impact on model performance.

3. **Temporal Analysis**:
   - Since 'Year' is an important feature, explore how fraud trends have changed over the years. This could inform strategies for updating the model periodically.

4. **Business Strategy**:
   - Share these insights with relevant stakeholders (e.g., fraud prevention teams). They could help in developing targeted strategies for fraud prevention, especially in the areas identified as high risk.
   - Consider additional measures for transactions that are flagged as high risk by these key features.

5. **Model Deployment and Monitoring**:
   - If the model is to be deployed, ensure there is a system for continuous monitoring and updating, as fraud patterns can evolve.
   - Regularly retrain the model with new data to capture the latest trends in fraudulent activity.

6. **Reporting**:
   - Prepare a detailed report or presentation for stakeholders, highlighting how these key features influence fraud prediction and what actions can be taken based on these insights.