In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC, SVR
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor,XGBRFRegressor,XGBClassifier
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder,MinMaxScaler
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge,ElasticNet,RidgeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,cohen_kappa_score,confusion_matrix,mean_squared_error,r2_score,\
root_mean_squared_error,recall_score,roc_auc_score,roc_curve,mean_absolute_error

from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,\
RandomForestClassifier,VotingClassifier,HistGradientBoostingClassifier

In [2]:
df = pd.read_csv('credit_card_fraud_dataset.csv')

In [3]:
df.head()

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.0,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.4,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0


In [4]:
df.columns

Index(['TransactionID', 'TransactionDate', 'Amount', 'MerchantID',
       'TransactionType', 'Location', 'IsFraud'],
      dtype='object')

In [5]:
df = df.rename(columns={'TransactionID':'transaction_id', 'TransactionDate':'transaction_date',
                        'Amount':'amount', 'MerchantID':'merchant_id','TransactionType':'transaction_type',
                        'Location':'location', 'IsFraud':'is_fraud'})

In [6]:
df.head()

Unnamed: 0,transaction_id,transaction_date,amount,merchant_id,transaction_type,location,is_fraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.0,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.4,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_id    100000 non-null  int64  
 1   transaction_date  100000 non-null  object 
 2   amount            100000 non-null  float64
 3   merchant_id       100000 non-null  int64  
 4   transaction_type  100000 non-null  object 
 5   location          100000 non-null  object 
 6   is_fraud          100000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 5.3+ MB


In [8]:
# Convert column to datetime if it's not already
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

In [9]:
# Then format it to your desired string format
df['transaction_date'] = df['transaction_date'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [10]:
# Convert column to datetime again 
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   transaction_id    100000 non-null  int64         
 1   transaction_date  100000 non-null  datetime64[ns]
 2   amount            100000 non-null  float64       
 3   merchant_id       100000 non-null  int64         
 4   transaction_type  100000 non-null  object        
 5   location          100000 non-null  object        
 6   is_fraud          100000 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 5.3+ MB


In [12]:
#df['transaction_type'].value_counts()

In [13]:
transaction = df[['transaction_id','merchant_id', 'transaction_date']]

In [14]:
df.head()

Unnamed: 0,transaction_id,transaction_date,amount,merchant_id,transaction_type,location,is_fraud
0,1,2024-04-03 14:15:35,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35,784.0,394,purchase,New York,0
3,4,2024-04-13 23:50:35,3514.4,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35,369.07,475,purchase,Phoenix,0


In [15]:
dum = pd.get_dummies(df, dtype='int')
dum.head(2)

Unnamed: 0,transaction_id,transaction_date,amount,merchant_id,is_fraud,transaction_type_purchase,transaction_type_refund,location_Chicago,location_Dallas,location_Houston,location_Los Angeles,location_New York,location_Philadelphia,location_Phoenix,location_San Antonio,location_San Diego,location_San Jose
0,1,2024-04-03 14:15:35,4189.27,688,0,0,1,0,0,0,0,0,0,0,1,0,0
1,2,2024-03-19 13:20:35,2659.71,109,0,0,1,0,1,0,0,0,0,0,0,0,0


In [16]:
dum.columns

Index(['transaction_id', 'transaction_date', 'amount', 'merchant_id',
       'is_fraud', 'transaction_type_purchase', 'transaction_type_refund',
       'location_Chicago', 'location_Dallas', 'location_Houston',
       'location_Los Angeles', 'location_New York', 'location_Philadelphia',
       'location_Phoenix', 'location_San Antonio', 'location_San Diego',
       'location_San Jose'],
      dtype='object')

In [17]:
def rename_columns(dataframe):
    """
    Renames columns that start with 'transaction_type_' or 'location_' by removing the prefix.
    
    Parameters:
    dataframe (pandas.DataFrame): The dataframe with columns to rename
    
    Returns:
    pandas.DataFrame: The dataframe with renamed columns
    """
    # Create a dictionary for column renaming
    rename_dict = {}
    # Find columns that start with 'transaction_type_' or 'location_'
    for column in dataframe.columns:
        if column.startswith('transaction_type_'):
            # Extract the part after 'transaction_type_'
            new_name = column.replace('transaction_type_', '')
            rename_dict[column] = new_name
        
        elif column.startswith('location_'):
            new_name = column.replace('location_', '')
            rename_dict[column] = new_name
    
    # Return the dataframe with renamed columns
    return dataframe.rename(columns=rename_dict)

In [18]:
new_df = rename_columns(dum)

In [19]:
new_df.head()

Unnamed: 0,transaction_id,transaction_date,amount,merchant_id,is_fraud,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose
0,1,2024-04-03 14:15:35,4189.27,688,0,0,1,0,0,0,0,0,0,0,1,0,0
1,2,2024-03-19 13:20:35,2659.71,109,0,0,1,0,1,0,0,0,0,0,0,0,0
2,3,2024-01-08 10:08:35,784.0,394,0,1,0,0,0,0,0,1,0,0,0,0,0
3,4,2024-04-13 23:50:35,3514.4,944,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,2024-07-12 18:51:35,369.07,475,0,1,0,0,0,0,0,0,0,1,0,0,0


In [20]:
new_df = new_df.drop(columns=['transaction_id','transaction_date','merchant_id'], axis=1)

In [21]:
new_df.head()

Unnamed: 0,amount,is_fraud,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose
0,4189.27,0,0,1,0,0,0,0,0,0,0,1,0,0
1,2659.71,0,0,1,0,1,0,0,0,0,0,0,0,0
2,784.0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,3514.4,0,1,0,0,0,0,0,0,1,0,0,0,0
4,369.07,0,1,0,0,0,0,0,0,0,1,0,0,0


In [22]:
new_df['is_fraud'].value_counts()

is_fraud
0    99000
1     1000
Name: count, dtype: int64

In [23]:
#new_df['amount'] = new_df['amount'].astype('int')

### Building Models

In [24]:
X = new_df.drop('is_fraud', axis=1) 
y = new_df['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## KNeighborsClassifier

In [40]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)

In [41]:
knn_clf.score(X_test, y_test)

0.9893

In [42]:
knn_clf_pred = knn_clf.predict(X_test)
knn_clf_pred

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
knn_df = pd.DataFrame(X_test)
knn_df['prediction'] = knn_clf_pred
knn_df

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
75721,4031.22,0,1,0,0,0,0,0,0,1,0,0,0,0
80184,3018.81,1,0,0,0,0,0,0,0,0,0,1,0,0
19864,163.82,0,1,0,0,0,0,0,0,0,0,1,0,0
76699,2548.43,0,1,1,0,0,0,0,0,0,0,0,0,0
92991,3292.04,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42648,4268.01,1,0,0,1,0,0,0,0,0,0,0,0,0
86306,4589.96,0,1,0,0,0,0,0,0,0,0,1,0,0
45466,3362.33,1,0,0,1,0,0,0,0,0,0,0,0,0
63724,2845.50,0,1,0,0,0,0,1,0,0,0,0,0,0


In [44]:
knn_df[knn_df['prediction'] == 1]

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
82213,2304.21,1,0,0,0,0,0,0,0,0,1,0,0,1
73557,4227.02,1,0,0,0,0,0,1,0,0,0,0,0,1
25211,463.9,1,0,0,0,0,0,0,1,0,0,0,0,1
59692,3500.62,1,0,0,0,0,0,1,0,0,0,0,0,1
95947,4945.58,1,0,0,0,0,0,0,0,0,0,0,1,1
40191,4728.7,1,0,0,0,1,0,0,0,0,0,0,0,1
56079,4728.75,1,0,0,0,1,0,0,0,0,0,0,0,1


## BaggingClassifier

In [49]:
bag_clf = BaggingClassifier()
bag_clf.fit(X_train, y_train)

In [50]:
bag_clf.score(X_test, y_test)

0.9875

In [51]:
bag_clf_pred = bag_clf.predict(X_test)
bag_clf_pred

array([0, 0, 0, ..., 0, 0, 0])

In [52]:
bag_clf_df = pd.DataFrame(X_test)
bag_clf_df['prediction'] = bag_clf_pred
display(bag_clf_df)

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
75721,4031.22,0,1,0,0,0,0,0,0,1,0,0,0,0
80184,3018.81,1,0,0,0,0,0,0,0,0,0,1,0,0
19864,163.82,0,1,0,0,0,0,0,0,0,0,1,0,0
76699,2548.43,0,1,1,0,0,0,0,0,0,0,0,0,0
92991,3292.04,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42648,4268.01,1,0,0,1,0,0,0,0,0,0,0,0,0
86306,4589.96,0,1,0,0,0,0,0,0,0,0,1,0,0
45466,3362.33,1,0,0,1,0,0,0,0,0,0,0,0,0
63724,2845.50,0,1,0,0,0,0,1,0,0,0,0,0,0


In [53]:
bag_clf_df[bag_clf_df['prediction'] == 1]

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
18292,184.88,1,0,1,0,0,0,0,0,0,0,0,0,1
17972,290.50,0,1,0,1,0,0,0,0,0,0,0,0,1
66053,3812.30,1,0,0,1,0,0,0,0,0,0,0,0,1
72120,262.60,1,0,0,0,1,0,0,0,0,0,0,0,1
79817,4806.53,0,1,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58874,4804.23,0,1,0,0,1,0,0,0,0,0,0,0,1
11359,2363.21,0,1,0,0,0,0,0,0,0,0,1,0,1
90519,1705.01,0,1,0,0,0,0,0,1,0,0,0,0,1
63899,4728.07,0,1,0,0,0,0,1,0,0,0,0,0,1


## GradientBoostingClassifier

In [54]:
grad_boost_clf = GradientBoostingClassifier()
grad_boost_clf.fit(X_train, y_train)

In [55]:
grad_boost_clf.score(X_test, y_test)

0.9873

In [56]:
grad_boost_clf_pred = grad_boost_clf.predict(X_test)
grad_boost_clf_pred

array([0, 0, 0, ..., 0, 0, 0])

In [57]:
grad_boost_clf_df = pd.DataFrame(X_test)
grad_boost_clf_df['prediction'] = grad_boost_clf_pred
display(grad_boost_clf_df)

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
75721,4031.22,0,1,0,0,0,0,0,0,1,0,0,0,0
80184,3018.81,1,0,0,0,0,0,0,0,0,0,1,0,0
19864,163.82,0,1,0,0,0,0,0,0,0,0,1,0,0
76699,2548.43,0,1,1,0,0,0,0,0,0,0,0,0,0
92991,3292.04,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42648,4268.01,1,0,0,1,0,0,0,0,0,0,0,0,0
86306,4589.96,0,1,0,0,0,0,0,0,0,0,1,0,0
45466,3362.33,1,0,0,1,0,0,0,0,0,0,0,0,0
63724,2845.50,0,1,0,0,0,0,1,0,0,0,0,0,0


In [58]:
grad_boost_clf_df[grad_boost_clf_df['prediction'] == 1]

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
16821,4341.81,0,1,0,0,1,0,0,0,0,0,0,0,1
15626,2299.68,1,0,0,0,0,0,0,0,0,0,1,0,1
17972,290.50,0,1,0,1,0,0,0,0,0,0,0,0,1
49019,992.02,1,0,0,0,0,0,0,1,0,0,0,0,1
94057,1761.10,1,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91521,240.94,1,0,0,0,0,0,0,1,0,0,0,0,1
93132,2342.59,0,1,0,0,0,0,0,0,0,1,0,0,1
84396,980.87,1,0,0,0,0,1,0,0,0,0,0,0,1
63684,4767.93,0,1,0,0,0,0,0,0,1,0,0,0,1


## RandomForestClassifier

In [59]:
# Define parameter grid
param_grid = {'n_estimators': [1, 10, 20, 30, 40, 50, 100]}

# Create and fit model
random_forest = RandomForestClassifier()
grid_search = GridSearchCV(random_forest, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

Best parameters: {'n_estimators': 10}
Best score: 0.9841


In [60]:
rf_clf = RandomForestClassifier(n_estimators=10)
rf_clf.fit(X_train, y_train)

In [61]:
rf_clf.score(X_test, y_test)

0.9832

In [62]:
rf_clf_pred = rf_clf.predict(X_test)
rf_clf_pred

array([0, 0, 0, ..., 0, 0, 0])

In [63]:
rf_df = pd.DataFrame(X_test)
rf_df['prediction'] = rf_clf_pred
display(rf_df)

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
75721,4031.22,0,1,0,0,0,0,0,0,1,0,0,0,0
80184,3018.81,1,0,0,0,0,0,0,0,0,0,1,0,0
19864,163.82,0,1,0,0,0,0,0,0,0,0,1,0,0
76699,2548.43,0,1,1,0,0,0,0,0,0,0,0,0,0
92991,3292.04,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42648,4268.01,1,0,0,1,0,0,0,0,0,0,0,0,0
86306,4589.96,0,1,0,0,0,0,0,0,0,0,1,0,0
45466,3362.33,1,0,0,1,0,0,0,0,0,0,0,0,0
63724,2845.50,0,1,0,0,0,0,1,0,0,0,0,0,0


In [64]:
rf_df[rf_df['prediction'] == 1]

Unnamed: 0,amount,purchase,refund,Chicago,Dallas,Houston,Los Angeles,New York,Philadelphia,Phoenix,San Antonio,San Diego,San Jose,prediction
73691,150.12,1,0,0,0,1,0,0,0,0,0,0,0,1
52925,3872.70,1,0,0,1,0,0,0,0,0,0,0,0,1
270,732.16,0,1,0,0,0,0,0,0,0,0,1,0,1
73217,1101.59,0,1,0,0,0,0,0,0,0,0,1,0,1
75961,1682.46,0,1,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93304,3731.52,0,1,0,1,0,0,0,0,0,0,0,0,1
65863,4756.59,0,1,0,0,1,0,0,0,0,0,0,0,1
84907,4183.08,1,0,0,0,0,0,0,0,1,0,0,0,1
96560,1437.63,0,1,0,1,0,0,0,0,0,0,0,0,1


In [74]:
print(f'The best model is: {rf_clf} with a score of {rf_clf.score(X_test, y_test)}')

The best model is: RandomForestClassifier(n_estimators=10) with a score of 0.9832
