In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import os
import requests
import pandas as pd

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=acea29fdecf0c96162abab418b403eefaf3dfc14d903b545a083ad02c2b68330
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [45]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m821.5 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.14.3
    Uninstalling openai-1.14.3:
      Successfully uninstalled openai-1.14.3
Successfully installed openai-0.28.0


In [4]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [23]:
os.makedirs("/content/drive/MyDrive/ML_Projects/Credit_Crd_Fraud/data", exist_ok = True)

In [5]:
url = "https://raw.githubusercontent.com/VeryFatBoy/gpt-workshop/main/data/creditcard.csv"
response = requests.get(url)
with open("/content/drive/MyDrive/ML_Projects/Credit_Crd_Fraud/data/creditcard.csv", "wb") as f:
    f.write(response.content)

In [3]:
pdf = pd.read_csv("/content/drive/MyDrive/ML_Projects/Credit_Crd_Fraud/data/creditcard.csv")

In [11]:
pdf.head(2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,69.0,0.298503,2.142722,-1.542143,1.561332,0.938108,-2.145673,1.405569,-0.778459,0.328636,...,-0.290987,-0.000235,0.092174,0.586205,-0.397314,-0.480949,0.251145,-0.242279,0.78,0
1,239.0,1.167527,-0.318288,0.502464,0.033904,-0.584285,0.024592,-0.413738,0.186412,0.605945,...,-0.040544,0.092396,-0.038321,0.115915,0.319733,1.125327,-0.058855,-0.014192,10.0,0


In [15]:
pdf.shape

(3265, 31)

In [16]:
pdf.groupby("Class").size()

Class
0    2773
1     492
dtype: int64

In [17]:
pdf["Amount"].describe()

count    3265.000000
mean       86.715210
std       195.568876
min         0.000000
25%         4.490000
50%        21.900000
75%        80.310000
max      2917.640000
Name: Amount, dtype: float64

In [19]:
fig = px.scatter(
    pdf,
    y = "Amount",
    color = pdf["Class"].astype(str),
    hover_data = ["Amount"]
)

fig.update_layout(
    # yaxis_type = "log",
    title = "Amount and Class"
)

fig.show()

In [21]:
fig = px.histogram(
    pdf,
    x = "Amount",
    nbins = 50
)

fig.update_layout(title="Count of Amounts range")

fig.show()

## Logistic Regression with scikit-learn

In [25]:
# Split the data into features and labels
features = pdf.iloc[:, 1:30]
labels = pdf.iloc[:, 30]

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 0.3,
    random_state = 42
)

# Train the logistic regression model
train_model = LogisticRegression(max_iter = 1000)
train_model.fit(train_features, train_labels)

# Make predictions on the test set
predicted_labels = train_model.predict(test_features)

In [28]:
# Generate and plot the confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

fig = px.imshow(
    cm,
    x = ["Genuine (0)", "Fraudulent (1)"],
    y = ["Genuine (0)", "Fraudulent (1)"],
    color_continuous_scale = "Reds",
    labels = dict(x = "Predicted Label", y = "True Label")
)

# Add annotations to the heatmap
for i in range(len(cm)):
    for j in range(len(cm)):
        fig.add_annotation(
            x = j,
            y = i,
            text = str(cm[i][j]),
            font = dict(color = "white" if cm[i][j] > cm.max() / 2 else "black"),
            showarrow = False
        )

fig.update_layout(
    title = "Confusion Matrix - Logistic Regression (scikit-learn)"
)

fig.show()

In [29]:
# Calculate and print the accuracy, precision, recall and f1 of the model
report = classification_report(test_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       830
           1       0.92      0.88      0.90       150

    accuracy                           0.97       980
   macro avg       0.95      0.93      0.94       980
weighted avg       0.97      0.97      0.97       980




# Install Apache Spark

In [33]:
# Create the spark session
spark = SparkSession.builder.appName("FraudDetection").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sdf = spark.createDataFrame(pdf)

# Select features and labels
features = sdf.columns[1:30]
labels = "Class"

# Assemble features into vector
assembler = VectorAssembler(inputCols = features, outputCol = "features")
sdf = assembler.transform(sdf).select("features", labels)

# Split the data into training and testing sets
train, test = sdf.cache().randomSplit([0.7, 0.3], seed = 42)

# Initialise logistic regression model
lr = LogisticRegression(
    maxIter = 1000,
    featuresCol = "features",
    labelCol = labels
)

In [34]:
# Train the logistic regression model
train_model = lr.fit(train)

# Make predictions on the test set
predictions = train_model.transform(test)

In [35]:
# Calculate the accuracy, precision, recall and f1 of the model
accuracy = predictions.filter(predictions.Class == predictions.prediction).count() / float(test.count())

evaluator = MulticlassClassificationEvaluator(
    labelCol = labels,
    predictionCol = "prediction",
    metricName = "precisionByLabel"
)
precision = evaluator.evaluate(predictions)

evaluator = MulticlassClassificationEvaluator(
    labelCol = labels,
    predictionCol = "prediction",
    metricName = "recallByLabel"
)
recall = evaluator.evaluate(predictions)

evaluator = MulticlassClassificationEvaluator(
    labelCol = labels,
    predictionCol = "prediction",
    metricName = "fMeasureByLabel"
)
f1 = evaluator.evaluate(predictions)

In [36]:
# Create confusion matrix
cm = predictions.select("Class", "prediction")
cm = cm.groupBy("Class", "prediction").count()
cm = cm.toPandas()

# Pivot the confusion matrix
cm = cm.pivot(
    index = "Class",
    columns = "prediction",
    values = "count"
)

# Generate and plot the confusion matrix
fig = px.imshow(
    cm,
    x = ["Genuine (0)", "Fraudulent (1)"],
    y = ["Genuine (0)", "Fraudulent (1)"],
    color_continuous_scale = "Reds",
    labels = dict(x = "Predicted Label", y = "True Label")
)

# Add annotations to the heatmap
for i in range(len(cm)):
    for j in range(len(cm)):
        fig.add_annotation(
            x = j,
            y = i,
            text = str(cm.iloc[i, j]),
            font = dict(color = "white" if cm.iloc[i, j] > cm.values.max() / 2 else "black"),
            showarrow = False
        )
fig.update_layout(title_text = "Confusion Matrix - Logistic Regression (Spark)")

fig.show()

In [37]:
# Print the accuracy, precision, recall and f1 of the model
print("Accuracy: %.4f" % accuracy)
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print("F1: %.4f" % f1)

Accuracy: 0.9817
Precision: 0.9862
Recall: 0.9924
F1: 0.9893


# OpenAI Implementation

In [4]:
new_pdf = pdf.copy()

In [5]:
import getpass
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key:··········


In [6]:
import openai

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

@retry(wait = wait_random_exponential(min = 1, max = 60),
       stop = stop_after_attempt(6)
)

def completion_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

In [7]:
data = new_pdf.values.tolist()

# Split the data into features and labels
features = [[float(cell) for cell in row[1:29]] + [float(row[29])] for row in data]
labels = [int(row[-1]) for row in data]

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 0.3,
    random_state = 42
)

In [8]:
# raise KeyboardInterrupt("Execution stopped manually.")

# Train the model
train_model = "gpt-3.5-turbo-instruct"
train_prompt = (
    f"Train a model to classify transactions as fraudulent (1) or not fraudulent (0) based on their features.\n"
    f"Each transaction is represented by 28 features and labeled as 0 (not fraudulent) or 1 (fraudulent).\n"
    f"Use the GPT-3.5 API to classify the transactions.\n"
)

# Send initial prompt to train the model
train_model_response = completion_with_backoff(
    engine = train_model,
    prompt = train_prompt,
    temperature = 0,
    max_tokens = 500,
    n = 1,
    stop = None,
    timeout = 30,
)


In [9]:
train_model_id = train_model_response.model

# Define batch size for processing multiple transactions at once
batch_size = 10

# Split the transactions into batches
transaction_batches = [train_features[i:i+batch_size] for i in range(0, len(train_features), batch_size)]
label_batches = [train_labels[i:i+batch_size] for i in range(0, len(train_labels), batch_size)]

# Iterate over batches
for features_batch, labels_batch in zip(transaction_batches, label_batches):
    # Construct prompt for batch
    prompt = ""
    for features, label in zip(features_batch, labels_batch):
        prompt += (f"Train the model to classify the transaction with the following \n"
                   f"label: {label}, with features: {features}\n")


In [10]:
    # Send request to the model
    response = completion_with_backoff(
        engine = train_model_id,
        prompt = prompt,
        temperature = 0,
        max_tokens = 30 * len(features_batch),
        n = 1,
        stop = None,
        timeout = 30,
    )

In [12]:
#raise KeyboardInterrupt("Execution stopped manually.")

# Evaluate the model
test_model = train_model
test_model_prompt = (
    f"Classify whether the transaction with the following features is \n"
    f"either not fraudulent or fraudulent."
)

test_model_response = completion_with_backoff(
    engine = train_model_id,
    prompt = test_model_prompt,
    temperature = 0,
    max_tokens = 30,
    n = 1,
    stop = None,
    timeout = 30,
)

test_model_id = test_model_response.model

predicted_labels = []
for i in range(len(test_features)):
    prompt = (f"Classify whether the transaction with the following features is \n"
              f"either not fraudulent or fraudulent: {test_features[i]}"
    )
    response = completion_with_backoff(
        engine = test_model_id,
        prompt = prompt,
        temperature = 0,
        max_tokens = 30,
        n = 1,
        stop = None,
        timeout = 30,
    )

    predicted_label = response.choices[0].text.strip().lower().replace(".", "")
    binary_label = 1 if predicted_label == "fraudulent" else 0
    predicted_labels.append(binary_label)#

In [13]:
# Generate and plot the confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)
fig = px.imshow(
    cm,
    x = ["Genuine (0)", "Fraudulent (1)"],
    y = ["Genuine (0)", "Fraudulent (1)"],
    color_continuous_scale = "Reds",
    labels = dict(x = "Predicted Label", y = "True Label")
)

# Add annotations to the heatmap
for i in range(len(cm)):
    for j in range(len(cm)):
        fig.add_annotation(
            x = j,
            y = i,
            text = str(cm[i][j]),
            font = dict(color = "white" if cm[i][j] > cm.max() / 2 else "black"),
            showarrow = False
        )

fig.update_layout(title = "Confusion Matrix - OpenAI Model")

fig.show()

In [14]:
# Calculate and print the accuracy, precision, recall and f1 of the model
report = classification_report(test_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.35      0.50       830
           1       0.18      0.81      0.30       150

    accuracy                           0.42       980
   macro avg       0.55      0.58      0.40       980
weighted avg       0.80      0.42      0.47       980

