In [38]:
import pandas as pd

# Load the CSV
df = pd.read_csv('creditcard.csv')
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [39]:
print(df.info())
print(df.describe())
print(df['Class'].value_counts())  # 1 = Fraud, 0 = Legit


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [40]:
# Separate fraud and non-fraud
fraud = df[df["Class"] == 1]
non_fraud = df[df["Class"] == 0].sample(n=492, random_state=42)  # undersample normal transactions

# Combine to create a balanced dataset
balanced_df = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)  # shuffle

print("New balanced dataset shape:", balanced_df.shape)
print(balanced_df["Class"].value_counts())


New balanced dataset shape: (984, 31)
Class
0    492
1    492
Name: count, dtype: int64


In [41]:
from sklearn.preprocessing import StandardScaler

# Create scaler
scaler = StandardScaler()

# Scale 'Amount' and 'Time'
balanced_df[['scaled_amount', 'scaled_time']] = scaler.fit_transform(balanced_df[['Amount', 'Time']])

# Drop original 'Amount' and 'Time' columns
balanced_df.drop(['Amount', 'Time'], axis=1, inplace=True)

# Optional: Reorder columns (scaled columns at the front)
scaled_cols = ['scaled_amount', 'scaled_time']
other_cols = [col for col in balanced_df.columns if col not in scaled_cols + ['Class']]
balanced_df = balanced_df[scaled_cols + other_cols + ['Class']]

balanced_df.head()


Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
255545,0.084458,1.446363,1.984787,-1.937036,0.486613,-1.245536,-2.518536,-0.544524,-1.819244,-0.074875,...,-0.127858,0.174011,0.889289,0.218248,0.492384,-0.584599,-0.13504,0.070319,-0.000205,0
248296,-0.456932,1.375419,-0.613696,3.698772,-5.534941,5.620486,1.649263,-2.335145,-0.907188,0.706362,...,0.354773,0.319261,-0.471379,-0.07589,-0.667909,-0.642848,0.0706,0.48841,0.292345,1
76163,1.726673,-0.656191,0.319007,-1.072867,-0.216146,1.494709,-0.627063,-0.761867,0.941687,-0.430272,...,0.914995,0.332521,-0.022461,-0.537691,0.452072,0.555495,-0.383543,-0.06852,0.106578,0
239501,0.613485,1.297532,-6.682832,-2.714268,-5.77453,1.449792,-0.661836,-1.14865,0.849686,0.433427,...,-1.928527,0.220526,1.187013,0.335821,0.215683,0.80311,0.044033,-0.054988,0.082337,1
143336,0.684137,-0.054512,-6.713407,3.921104,-9.746678,5.148263,-5.151563,-2.099389,-5.937767,3.57878,...,0.135711,0.954272,-0.451086,0.127214,-0.33945,0.394096,1.075295,1.649906,-0.394905,1


In [42]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = balanced_df.drop('Class', axis=1)
y = balanced_df['Class']

# Train-test split (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)


Training Set Shape: (787, 30)
Testing Set Shape: (197, 30)


In [43]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [44]:
# Fit the model to training data
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Import evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate performance
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[95  4]
 [ 8 90]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94        99
           1       0.96      0.92      0.94        98

    accuracy                           0.94       197
   macro avg       0.94      0.94      0.94       197
weighted avg       0.94      0.94      0.94       197


Accuracy Score: 0.9390862944162437


In [45]:
import numpy as np
import pandas as pd

# Ensure predictions are done
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]  # Probability of being fraud

# Add predictions and risk score to the test set
X_test_copy = X_test.copy()
X_test_copy['Prediction'] = y_pred
X_test_copy['Fraud_Probability'] = y_proba
X_test_copy['Risk_Score'] = y_proba * X_test_copy['scaled_amount']  # You can change 'scaled_amount' if needed

# Export for Tableau
X_test_copy.to_csv("fraud_predictions.csv", index=False)


In [46]:
fraudpsee=pd.read_csv("fraud_predictions.csv")
fraudpsee.head()


Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V22,V23,V24,V25,V26,V27,V28,Prediction,Fraud_Probability,Risk_Score
0,0.174238,-0.960002,-19.139733,9.286847,-20.134992,7.818673,-15.652208,-1.668348,-21.340478,0.6419,...,0.520543,-0.760556,0.662767,-0.948454,0.121796,-3.381843,-1.256524,1,1.0,0.174238
1,-0.452421,0.306859,-2.488363,4.359019,-7.77641,5.364027,-1.823877,-2.44514,-4.964221,1.48489,...,1.021226,-0.266476,-0.37088,0.365535,0.081372,0.184983,-0.211582,1,1.0,-0.452421
2,-0.02382,-1.033406,-0.730779,-0.075013,2.316032,-1.902745,-0.551713,-0.037457,0.541981,-0.058828,...,0.217508,0.000121,0.1955,0.143696,-0.527853,-0.145276,-0.162602,0,0.03,-0.000715
3,2.805888,0.899032,0.46975,-1.237555,-1.767341,4.83349,-0.268715,-0.51276,1.140149,-0.341273,...,-0.647075,-0.373014,0.260801,-0.496566,-0.245973,-0.117858,0.144774,1,0.67,1.879945
4,-0.446601,1.485222,-0.211602,1.172305,-0.685269,-0.473799,0.510636,-0.830758,0.632354,0.243815,...,1.223319,-0.055744,0.691108,-0.743698,-0.245378,0.378114,0.264016,0,0.09,-0.040194


In [47]:
import pickle

# Save the trained model
with open("fraud_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)
