# CREDIT CARD FRAUD DETECTION

Task 2

Build a model to detect fraudulent credit card transactions. Use a
dataset containing information about credit card transactions, and
experiment with algorithms like Logistic Regression, Decision Trees,
or Random Forests to classify transactions as fraudulent or legitimate.

In [62]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [63]:
# Load a smaller subset of train and test datasets
train_data = pd.read_csv('fraudTrain.csv').sample(frac=0.1, random_state=42)
test_data = pd.read_csv('fraudTest.csv').sample(frac=0.1, random_state=42)
print("Test data:", train_data.head(10))
print("---------------------------------------------")
print("Train data:", test_data.head(10))

Test data:          Unnamed: 0 trans_date_trans_time               cc_num  \
1045211     1045211   2020-03-09 15:09:26         577588686219   
547406       547406   2019-08-22 15:49:01       30376238035123   
110142       110142   2019-03-04 01:34:16     4658490815480264   
1285953     1285953   2020-06-16 20:04:38     3514897282719543   
271705       271705   2019-05-14 05:54:48     6011381817520024   
391181       391181   2019-06-28 10:46:05  4855488158131690372   
962652       962652   2020-01-23 09:09:59       30427035050508   
974821       974821   2020-01-30 11:23:44      213120463918358   
127056       127056   2019-03-11 10:35:53     6597888193422452   
1122630     1122630   2020-04-12 18:50:07        4809701904914   

                                            merchant        category     amt  \
1045211                              fraud_Towne LLC        misc_pos  194.51   
547406                             fraud_Friesen Ltd  health_fitness   52.32   
110142                

In [64]:
# Extract date-related features for both train and test datasets
for data in [train_data, test_data]:
    data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
    data['trans_hour'] = data['trans_date_trans_time'].dt.hour
    data['trans_day'] = data['trans_date_trans_time'].dt.day
    data['trans_month'] = data['trans_date_trans_time'].dt.month

In [65]:
# Drop unnecessary columns for both train and test datasets
columns_to_drop = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street', 'city', 'state', 'zip',
                   'job', 'dob', 'trans_num', 'unix_time']
for data in [train_data, test_data]:
    data.drop(columns=columns_to_drop, axis=1, inplace=True)

In [66]:
# Encode categorical variables using Label Encoding for both train and test datasets
label_encoder = LabelEncoder()
for data in [train_data, test_data]:
    data['category'] = label_encoder.fit_transform(data['category'])
    data['gender'] = label_encoder.fit_transform(data['gender'])


In [67]:
# Standardize numerical features for both train and test datasets
numerical_features = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'trans_hour', 'trans_day',
                      'trans_month']
scaler = StandardScaler()
for data in [train_data, test_data]:
    data[numerical_features] = scaler.fit_transform(data[numerical_features])



In [68]:
# Separate features (X) and labels (y) for both train and test datasets
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']



In [69]:
# Train Random Forest model
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)



In [70]:
# Make predictions on the test set
predictions_rf = model_rf.predict(X_test)


In [None]:
# Map predictions to 'Fraudulent' or 'Legitimate'
predicted_labels = ['Fraudulent' if prediction == 1 else 'Legitimate' for prediction in predictions_rf]


In [71]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions_rf)
confusion_mat = confusion_matrix(y_test, predictions_rf)
classification_rep = classification_report(y_test, predictions_rf)


In [59]:
print(":----------------------------------------------------------------------------------------------:")
print(f"Random Forest Model")
print(f"Accuracy: {accuracy:.4f}")  # four numbers after the decimal point
print(" ")
print(f"Confusion Matrix:\n{confusion_mat}")
print(" ")
print(f"Classification Report:\n{classification_rep}")
print(" ")
print(f"Predicted Labels:\n{predicted_labels}")


:----------------------------------------------------------------------------------------------:
Random Forest Model
Accuracy: 0.9979
 
Confusion Matrix:
[[55339    17]
 [  100   116]]
 
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55356
           1       0.87      0.54      0.66       216

    accuracy                           1.00     55572
   macro avg       0.94      0.77      0.83     55572
weighted avg       1.00      1.00      1.00     55572

 
Predicted Labels:
['Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitimate', 'Legitim