In [189]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [190]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [191]:
# Read the transaction records from the CSV file in the Colab working directory.
DATA_PATH = '/content/onlinefraud.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

# Dataset Description

The dataset's ten main variables are described below (the file also contains an `isFlaggedFraud` column, which is kept as a model feature later in the notebook):

1. **step**: Represents a unit of time where 1 step equals 1 hour.
2. **type**: The type of online transaction.
3. **amount**: The amount of the transaction.
4. **nameOrig**: Customer starting the transaction.
5. **oldbalanceOrg**: Balance of the customer before the transaction.
6. **newbalanceOrig**: Balance of the customer after the transaction.
7. **nameDest**: Recipient of the transaction.
8. **oldbalanceDest**: Initial balance of the recipient before the transaction.
9. **newbalanceDest**: The new balance of the recipient after the transaction.
10. **isFraud**: Indicates whether the transaction was fraudulent (1 for fraud, 0 for non-fraud).


In [None]:
# Show only the transactions labelled as fraudulent.
df.loc[df['isFraud'].eq(1)]

In [None]:
# Print the frame summary: column names, dtypes, non-null counts and memory usage.
df.info()

In [195]:
# Display the first five rows alongside the schema summary above.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [196]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [197]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [198]:
# Flag rows that repeat an earlier row in every column, then count them.
duplicate_rows = df.duplicated()
duplicate_rows.sum()


0

# Univariate Analysis

In [None]:
# Bar chart of the number of transactions per transaction type.
ax = sns.countplot(x='type', data=df)
ax.set_title('Transaction Types')
plt.show()


In [None]:
# Histogram (30 bins) of transaction amounts with a KDE curve overlaid.
ax = sns.histplot(x='amount', data=df, bins=30, kde=True)
ax.set_title('Transaction Amount Distribution')
plt.show()


# Bivariate Analysis

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# At this point `type`, `nameOrig` and `nameDest` are still strings, and
# pandas >= 2.0 raises ValueError when DataFrame.corr() meets non-numeric
# columns. Selecting the numeric columns explicitly gives the same matrix that
# older pandas produced by silently dropping them, and works on every version.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Box plot comparing the spread of transaction amounts across transaction types.
ax = sns.boxplot(x='type', y='amount', data=df)
ax.set_title('Transaction Amount by Type')
plt.show()


In [None]:
# Tally the rows for every (type, isFraud) pair, then plot the tallies with
# one bar per fraud label inside each transaction type.
pair_sizes = df.groupby(['type', 'isFraud']).size()
fraud_counts = pair_sizes.reset_index(name='counts')
ax = sns.barplot(x='type', y='counts', hue='isFraud', data=fraud_counts)
ax.set_title('Fraud Transactions by Type')
plt.show()


In [None]:
# Names of the object-dtype (string) columns; these are label-encoded next.
object_columns = df.select_dtypes(include='object')
categorical_features = object_columns.columns

In [None]:
from sklearn.preprocessing import LabelEncoder
# Step 1: Preprocessing
# Replace each categorical column with integer codes. One fitted encoder is
# kept per column (keyed by column name) so the mapping can be saved later.
label_encoder = {}
for column in categorical_features:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoder[column] = encoder

In [None]:
# Check the first five rows now that the categorical columns hold integer codes.
df.head(n=5)

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# One seed shared by the splitter, the undersampler and both classifiers
seed = 42

# Target label and feature matrix (every column except the label)
y = df['isFraud']
X = df.drop(columns=['isFraud'])

# Five shuffled, stratified folds
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Candidate classifiers keyed by the name used in the printed reports
models = {
    "Logistic Regression": LogisticRegression(random_state=seed),
    "Random Forest": RandomForestClassifier(random_state=seed),
}

# model name -> list of per-fold accuracies
results = {}

# Evaluate every model on every fold
for fold_idx, (train_idx, test_idx) in enumerate(skfold.split(X, y), start=1):
    print(f"--- Fold {fold_idx} ---")

    # Slice out this fold's train / test partitions
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

    # Standardise features; statistics come from the training partition only
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    X_test = pd.DataFrame(scaled_test, index=X_test.index, columns=X_test.columns)
    X_train = pd.DataFrame(scaled_train, index=X_train.index, columns=X_train.columns)

    # Undersample the majority class in the training partition only;
    # the test partition keeps its original class balance
    rus = RandomUnderSampler(sampling_strategy='majority', random_state=seed)
    X_train, y_train = rus.fit_resample(X_train, y_train)

    for model_name, model in models.items():
        print(f"Training {model_name} on Fold {fold_idx}...")

        # Fit on the balanced training data, then predict the held-out fold
        y_pred = model.fit(X_train, y_train).predict(X_test)

        # Report accuracy and the per-class metrics for this fold
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} Accuracy on Fold {fold_idx}: {accuracy:.2f}")
        print(f"Classification Report for {model_name}:\n")
        print(classification_report(y_test, y_pred))
        print("-" * 50)

        # Record the fold accuracy under the model's name
        results.setdefault(model_name, []).append(accuracy)

# Mean accuracy across folds for each model
print("\n--- Final Results ---")
for model_name in results:
    accuracies = results[model_name]
    avg_accuracy = sum(accuracies) / len(accuracies)
    print(f"{model_name} Average Accuracy: {avg_accuracy:.2f}")

In [None]:
# Display the first five rows of the frame after the cross-validation run.
df.head(n=5)

In [None]:
# Final model that is pickled in the next cell.
# It is fitted on X_train / y_train exactly as the last cross-validation fold
# left them (scaled with that fold's `scaler`, then undersampled), so this cell
# must run after the cross-validation cell.
# random_state=seed makes the saved model reproducible, matching the seeded
# splitter, sampler and models used during evaluation.
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, y_train)

In [None]:

# Step 6: Save the Model, Scaler, and Encoder
# Each artifact is pickled to its own file in the current working directory.
artifacts = {
    'model.pkl': model,
    'scaler.pkl': scaler,
    'encoder.pkl': label_encoder,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Model, Scaler, and Encoder saved successfully!")

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

# New data for prediction (replace with actual new data)
new_data = pd.DataFrame({
    'step': [0],
    'type': ['PAYMENT'],  # Must be one of the transaction types seen during training
    'amount': [9839.64],
    'nameOrig': ['C1231006815'],  # Must be a customer name seen during training
    'oldbalanceOrg': [170136.0],
    'newbalanceOrig': [160296.36],
    'nameDest': ['M1979787155'],  # Must be a destination name seen during training
    'oldbalanceDest': [0.0],
    'newbalanceDest': [0.0],
    'isFlaggedFraud': [0]
})

# Feature order the scaler and model were fitted with
# (every training column except the 'isFraud' label)
feature_columns = ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
                   'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']

# Columns that were label-encoded before training
encoded_columns = ['type', 'nameOrig', 'nameDest']

# Step 1: Load the artifacts written by the save cell
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this notebook, never files from an untrusted source.
with open('encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Step 2: Encode categorical columns with the encoders fitted on the training data
# Fitting fresh encoders on the new rows would map every value to 0 and no
# longer agree with the integer codes the model was trained on.
# LabelEncoder.transform raises ValueError for a value it did not see in training.
for column in encoded_columns:
    new_data[column] = label_encoder[column].transform(new_data[column])

# Step 3: Ensure the column order is the same as the training data
new_data = new_data[feature_columns]

# Step 4: Scale the new data using the saved scaler
# Wrapping the result in a DataFrame keeps the column names for the model.
new_data_scaled = pd.DataFrame(
    scaler.transform(new_data),
    index=new_data.index,
    columns=new_data.columns,
)

# Step 5: Predict using the loaded model
predictions = model.predict(new_data_scaled)

# Step 6: The target was never encoded, so predictions are already 0 / 1 labels
decoded_predictions = predictions
print("Predictions for new dataset:", decoded_predictions)
