<a href="https://colab.research.google.com/github/Meghana-V-B/CODSOFT/blob/main/CREDIT_CARD_FRAUD_DETECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Kaggle dataset handle, kept in one place so the data source is easy to spot.
dataset_handle = "kartik2112/fraud-detection"

# Download the dataset through kagglehub (no version is pinned in this call)
# and keep the returned value, which is printed below as the local files path.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kartik2112/fraud-detection?dataset_version_number=1...


100%|██████████| 202M/202M [00:01<00:00, 161MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/kartik2112/fraud-detection/versions/1


In [None]:
import os
import pandas as pd

# Use the folder reported by kagglehub (`path`, set in the download cell)
# instead of a hardcoded absolute cache path, so the notebook keeps working
# when the cache location or the dataset version differs between machines.
dataset_folder = path
# Paths to train and test CSVs
train_file = os.path.join(dataset_folder, "fraudTrain.csv")
test_file = os.path.join(dataset_folder, "fraudTest.csv")

# Load train and test datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

# Inspect: sizes, a preview of the first rows, and the class balance of the
# target column.
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)
print(train_data.head())
print(train_data['is_fraud'].value_counts())  # target column


Train shape: (1296675, 23)
Test shape: (555719, 23)
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1   

In [None]:

# Columns excluded from the feature matrix (reused later for the test set).
drop_cols = ['Unnamed: 0', 'trans_num', 'cc_num', 'merchant', 'first', 'last',
             'street', 'job', 'dob', 'trans_date_trans_time']

# Features (X) and target (y)
X = train_data.drop(columns=drop_cols + ['is_fraud'])
y = train_data['is_fraud']

# One-hot encode the string-valued 'category' and 'gender' columns, using the
# same call that the test-set preparation applies. Without this step those
# columns stay as strings and the StandardScaler fit further down fails when
# the notebook is run top to bottom on a fresh kernel.
X = pd.get_dummies(X, columns=['category', 'gender'], drop_first=True)


In [None]:
# Report which feature columns still have pandas dtype "object"; these cannot
# be passed to a numeric scaler as they are.
object_features = X.select_dtypes(include=['object'])
categorical_cols = object_features.columns
print("Categorical columns:", categorical_cols)


Categorical columns: Index(['city', 'state'], dtype='object')


In [None]:
# 'city' and 'state' are treated as high-cardinality categoricals and are
# removed from the features instead of being encoded.
# Note: X is reassigned here, so re-running this cell without first rebuilding
# X raises a KeyError because the columns are already gone.
high_cardinality_cols = ['city', 'state']
X = X.drop(columns=high_cardinality_cols)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics from X, then apply them to X.
# Note: this fit uses every row of X, i.e. it happens before any
# train/validation split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features so the scaler can be fit on the training rows
# only. Fitting it on all rows first lets validation rows influence the
# scaling statistics (data leakage). stratify=y keeps the fraud / non-fraud
# ratio the same in both parts; random_state fixes the partition.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only (this replaces any earlier fit on
# the full data) and apply the same transform to both parts. `scaler` is the
# object reused later to transform the test set.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Hyperparameters for the gradient-boosting classifier, gathered in one place.
hgb_params = {
    "max_iter": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "random_state": 42,
}
hgb_model = HistGradientBoostingClassifier(**hgb_params)

# Fit on the training split
print("Training HistGradientBoostingClassifier...")
hgb_model.fit(X_train, y_train)

# Score the validation split: column 1 of predict_proba is the probability of
# the positive class (label 1); predict gives the hard 0/1 labels.
y_val_proba = hgb_model.predict_proba(X_val)[:, 1]
y_val_pred = hgb_model.predict(X_val)

# Per-class precision/recall/F1 from the hard labels, ROC-AUC from the
# probabilities.
print("Classification Report:")
val_report = classification_report(y_val, y_val_pred)
print(val_report)
roc_score = roc_auc_score(y_val, y_val_proba)
print(f"ROC-AUC Score: {roc_score:.4f}")


Training HistGradientBoostingClassifier...
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.61      0.56      0.58      1501

    accuracy                           1.00    259335
   macro avg       0.80      0.78      0.79    259335
weighted avg       1.00      1.00      1.00    259335

ROC-AUC Score: 0.8855


In [None]:
# Drop irrelevant columns (same as training)
X_test = test_data.drop(columns=drop_cols + ['is_fraud'])
y_test = test_data['is_fraud']

# One-hot encode 'category' and 'gender', keeping *every* level here.
# With drop_first=True the dropped reference level would be chosen from the
# test data's own categories; if the test set lacked the training reference
# level, a column the model was trained on would be dropped and then
# zero-filled below. The reindex step selects exactly the training columns,
# so it is what removes the reference level.
X_test = pd.get_dummies(X_test, columns=['category', 'gender'])

# Align test set columns with the training feature columns (X.columns):
# extra columns are discarded, missing ones are added and filled with 0.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Scale with the scaler already fitted during training (transform only, no
# refit on test data)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the test set with the trained model: probability of the positive
# class (column 1 of predict_proba) and the hard 0/1 labels.
y_test_proba = hgb_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = hgb_model.predict(X_test_scaled)

# Per-class precision/recall/F1 on the test set
print("Classification Report on Test Set:")
test_report = classification_report(y_test, y_test_pred)
print(test_report)

# Note: roc_score is reassigned here, replacing the validation value.
roc_score = roc_auc_score(y_test, y_test_proba)
print(f"ROC-AUC Score on Test Set: {roc_score:.4f}")


Classification Report on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.47      0.51      0.49      2145

    accuracy                           1.00    555719
   macro avg       0.74      0.75      0.74    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC Score on Test Set: 0.8737
