In [25]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pickle


In [26]:
# Loading the datasets from the /content/ directory in Google Colab (adjust paths if necessary)
train_data = pd.read_csv('/content/fraudTrain.csv')
test_data = pd.read_csv('/content/fraudTest.csv')

# Preview the datasets
print(train_data.head())
print(test_data.head())


   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [27]:
# Check the data types and missing values
print(train_data.info())
print(test_data.info())

# Check for missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())

# Check class balance for fraud and legitimate transactions
print(train_data['is_fraud'].value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [28]:
# Dropping irrelevant columns
columns_to_drop = ['trans_date_trans_time', 'merchant', 'first', 'last', 'cc_num',
                   'street', 'city', 'state', 'zip', 'lat', 'long', 'dob', 'unix_time',
                   'merch_lat', 'merch_long', 'job', 'trans_num']
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)


# Identifying categorical columns that need to be encoded
categorical_columns = ['category', 'gender']
# Label Encoding for categorical columns
le = LabelEncoder()
for col in categorical_columns:
    train_data[col] = le.fit_transform(train_data[col].astype(str))  # Ensure all values are strings before encoding
    test_data[col] = le.transform(test_data[col].astype(str))

# Scale the 'amt' column
scaler = StandardScaler()
train_data['amt'] = scaler.fit_transform(train_data[['amt']])
test_data['amt'] = scaler.transform(test_data[['amt']])

# Check the preprocessed data
print(train_data.head())



   Unnamed: 0  category       amt  gender  city_pop  is_fraud
0           0         8 -0.407826       0      3495         0
1           1         4  0.230039       0       149         0
2           2         0  0.934149       1      4154         0
3           3         2 -0.158132       1      1939         0
4           4         9 -0.177094       1        99         0


In [29]:
# Defining features (X) and target (y)
X_train = train_data.drop(columns=['is_fraud'])  # Features
y_train = train_data['is_fraud']  # Target

X_test = test_data.drop(columns=['is_fraud'])
y_test = test_data['is_fraud']


In [30]:
# Build and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [31]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Define a function to calculate and print evaluation metrics
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')
    print(f'AUC-ROC: {auc:.4f}')

# Evaluate the model performance
evaluate_model(y_test, y_pred)


Accuracy: 0.9976
Precision: 0.7371
Recall: 0.5869
F1-Score: 0.6535
AUC-ROC: 0.7931


In [None]:
# Save the trained model
with open('fraud_detection_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
