# **CREDIT CARD FRAUD DETECTION**

In [1]:
#Import Necessary Libraries
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the training and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('fraudTrain-Copy1.csv', 'fraudTest-Copy1.csv')
)

In [3]:
# Build the feature frames (X_*) and target series (y_*).
# Features are every column except those in columns_to_drop and the 'is_fraud' target.
columns_to_drop = [
    'Unnamed: 0',
    'trans_date_trans_time',
    'cc_num',
    'first',
    'last',
    'dob',
    'trans_num',
    'unix_time',
    'street',
]
non_feature_cols = columns_to_drop + ['is_fraud']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['is_fraud']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['is_fraud']

In [4]:
# Print the first rows of each feature frame under its own heading
for heading, frame in (
    ("Training data preview after removing irrelevant columns:", X_train),
    ("\nTest data preview after removing irrelevant columns:", X_test),
):
    print(heading)
    print(frame.head())

Training data preview after removing irrelevant columns:
                             merchant       category     amt gender  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97      F   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23      F   
2                fraud_Lind-Buckridge  entertainment  220.11      M   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00      M   
4                 fraud_Keeling-Crist       misc_pos   41.96      M   

             city state    zip      lat      long  city_pop  \
0  Moravian Falls    NC  28654  36.0788  -81.1781      3495   
1          Orient    WA  99160  48.8878 -118.2105       149   
2      Malad City    ID  83252  42.1808 -112.2620      4154   
3         Boulder    MT  59632  46.2306 -112.1138      1939   
4        Doe Hill    VA  24433  38.4207  -79.4629        99   

                                 job  merch_lat  merch_long  
0          Psychologist, counselling  36.011293  -82.048315  
1  Special ed

In [5]:
# Print the column/dtype/non-null summary of each feature frame under its own heading
for heading, frame in (
    ("Training data information:", X_train),
    ("\nTest data information:", X_test),
):
    print(heading)
    frame.info()

Training data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 13 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   merchant    1296675 non-null  object 
 1   category    1296675 non-null  object 
 2   amt         1296675 non-null  float64
 3   gender      1296675 non-null  object 
 4   city        1296675 non-null  object 
 5   state       1296675 non-null  object 
 6   zip         1296675 non-null  int64  
 7   lat         1296675 non-null  float64
 8   long        1296675 non-null  float64
 9   city_pop    1296675 non-null  int64  
 10  job         1296675 non-null  object 
 11  merch_lat   1296675 non-null  float64
 12  merch_long  1296675 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 128.6+ MB

Test data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 13 columns):
 #   Column     

In [6]:
# Partition the training feature names by dtype:
# object-dtype columns are treated as categorical, all others as numerical.
object_dtypes = ['object']
categorical_cols = X_train.select_dtypes(include=object_dtypes).columns
numerical_cols = X_train.select_dtypes(exclude=object_dtypes).columns

In [7]:
# Numerical branch: fill missing values with the column median, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical branch: fill missing values with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps categories unseen during fit from raising
# at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [8]:
# Create preprocessing and training pipeline function
def create_pipeline(model):
    """Build a pipeline that preprocesses the features and then applies ``model``.

    The module-level ``preprocessor`` is cloned for every pipeline. ``Pipeline``
    stores its steps by reference and fits them in place, so reusing the same
    ``preprocessor`` instance would make every pipeline share one transformer
    whose fitted state is overwritten each time any of them is fit.

    Parameters
    ----------
    model : estimator
        Estimator used as the final ``'classifier'`` step. It is stored as-is
        (not cloned), so fitting the pipeline fits this object.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'classifier'``.
    """
    return Pipeline(steps=[
        # Independent, unfitted copy with the same parameters as the template.
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

# Define models to evaluate.
# The decision tree gets a fixed seed so that a fresh "Restart & Run All"
# reproduces the same tree and therefore the same metrics.
RANDOM_STATE = 42
models = {
    'Logistic Regression': LogisticRegression(max_iter=3000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE)
}

In [9]:
# Fit every candidate model on the training data and score it on X_test / y_test.
# results maps model name -> {'classification_report': str, 'roc_auc_score': float}.
results = {}
for model_name, model in models.items():
    pipeline = create_pipeline(model).fit(X_train, y_train)

    # Predicted labels feed the classification report; column 1 of
    # predict_proba feeds the ROC-AUC computation.
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    report = classification_report(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    results[model_name] = dict(classification_report=report, roc_auc_score=auc)

    print(f"Model {model_name} trained and evaluated successfully.")

Model Logistic Regression trained and evaluated successfully.
Model Decision Tree trained and evaluated successfully.


In [10]:
# Print the stored classification report and ROC-AUC score of every evaluated model
for model_name, result in results.items():
    report_text = result['classification_report']
    auc_value = result['roc_auc_score']
    # One print with a newline separator emits the same three lines as separate prints.
    print(f"Model: {model_name}", "Classification Report:", report_text, sep="\n")
    print(f"ROC-AUC Score: {auc_value}\n")

Model: Logistic Regression
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.01      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

ROC-AUC Score: 0.6171699025875703

Model: Decision Tree
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.59      0.57      0.58      2145

    accuracy                           1.00    555719
   macro avg       0.79      0.78      0.79    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC Score: 0.7831537648765337

