## **Financial Fraud Detection in Transaction Notes**
Course: FA I/II – AI 574, Section 001: Natural Language Processing (2025)
Professor: Youakim Badr, Ph.D.
Team: John J. Pretz (jjp6846@psu.edu), Sahoo, Manas Ranjan (mr.sahoo@psu.edu)

Goal:
Detect fraudulent transactions by analyzing transaction note text using NLP and Deep Learning.

In [1]:
!pip install datasets




In [None]:
!pip install transformers datasets torch

In [3]:
from datasets import load_dataset
import pandas as pd

# Load the dataset
dataset = load_dataset("TheFinAI/fiqa-sentiment-classification")

# Peek at the structure
print(dataset)

# Convert to Pandas DataFrame
df_train = pd.DataFrame(dataset["train"])
df_val = pd.DataFrame(dataset["validation"])
df_test = pd.DataFrame(dataset["test"])

print(df_train.head())


DatasetDict({
    train: Dataset({
        features: ['_id', 'sentence', 'target', 'aspect', 'score', 'type'],
        num_rows: 822
    })
    test: Dataset({
        features: ['_id', 'sentence', 'target', 'aspect', 'score', 'type'],
        num_rows: 234
    })
    valid: Dataset({
        features: ['_id', 'sentence', 'target', 'aspect', 'score', 'type'],
        num_rows: 117
    })
})


KeyError: 'validation'

In [None]:

import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score



# Preprocessing

def clean_text(text):
    """Basic text cleaning for transaction notes."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove URLs
    text = re.sub(r'\S+@\S+', ' ', text)                # remove emails
    text = re.sub(r'[^a-z0-9\s\$\.\,\-\_]', ' ', text)  # keep numbers & $ . , - _
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# Load Dataset

# Example: CSV file must have two columns: "text" (transaction notes), "label" (0=legit, 1=fraud)
df = pd.read_csv("transactions.csv")
df["text"] = df["text"].fillna("").astype(str).apply(clean_text)

print("Dataset shape:", df.shape)
print(df.head())


# Split Data

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)



# Feature Extraction

tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# Model Training

model = LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
model.fit(X_train_tfidf, y_train)



# Evaluation

y_pred = model.predict(X_test_tfidf)
y_proba = model.predict_proba(X_test_tfidf)[:, 1]

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, digits=4))

roc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC: {roc:.4f}")



# Example Predictions

sample_notes = [
    "Payment to grocery store",
    "Refund issued to unknown account",
    "Wire transfer $5000 overseas urgent",
    "Salary credited",
]

sample_clean = [clean_text(t) for t in sample_notes]
sample_vec = tfidf.transform(sample_clean)
preds = model.predict(sample_vec)
print("\n--- Sample Predictions ---")
for note, label in zip(sample_notes, preds):
    print(f"{note} --> {'FRAUD' if label==1 else 'LEGIT'}")


In [None]:
# ========================================
# Financial Sentiment Analysis - FiQA Baseline
# Using TF-IDF + Logistic Regression
# ========================================

import re
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# -------------------------------
# 1. Load FiQA Dataset
# -------------------------------
print("Loading FiQA Sentiment dataset...")
dataset = load_dataset("TheFinAI/fiqa-sentiment-classification")

# Convert to Pandas DataFrames
df_train = pd.DataFrame(dataset["train"])
df_val = pd.DataFrame(dataset["validation"])
df_test = pd.DataFrame(dataset["test"])

print("Train size:", len(df_train))
print("Validation size:", len(df_val))
print("Test size:", len(df_test))
print(df_train.head())

# -------------------------------
# 2. Preprocessing Labels
# Sentiment score ∈ [-1, 1]
# Map to: Negative=0, Neutral=1, Positive=2
# -------------------------------
def map_sentiment(score):
    if score < -0.05:
        return 0  # Negative
    elif score > 0.05:
        return 2  # Positive
    else:
        return 1  # Neutral

df_train["label"] = df_train["sentiment_score"].apply(map_sentiment)
df_val["label"] = df_val["sentiment_score"].apply(map_sentiment)
df_test["label"] = df_test["sentiment_score"].apply(map_sentiment)

# Merge train + validation
df_all_train = pd.concat([df_train, df_val], ignore_index=True)

# -------------------------------
# 3. Feature Extraction with TF-IDF
# -------------------------------
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams + bigrams
    max_features=30000,  # limit vocab size
    stop_words="english"
)

X_train = vectorizer.fit_transform(df_all_train["sentence"])
y_train = df_all_train["label"]

X_test = vectorizer.transform(df_test["sentence"])
y_test = df_test["label"]

print("TF-IDF Features Shape:", X_train.shape)

# -------------------------------
# 4. Train Logistic Regression
# -------------------------------
model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="liblinear"
)
model.fit(X_train, y_train)

# -------------------------------
# 5. Evaluation
# -------------------------------
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, digits=4, target_names=["Negative", "Neutral", "Positive"]))

# Calculate macro ROC-AUC
try:
    roc = roc_auc_score(y_test, y_proba, multi_class="ovr")
    print(f"ROC-AUC: {roc:.4f}")
except Exception as e:
    print("ROC-AUC not available:", e)

# -------------------------------
# 6. Sample Predictions
# -------------------------------
sample_sentences = [
    "The company reported record profits this quarter.",
    "Market volatility continues to concern investors.",
    "Revenue remained stable compared to last year."
]

sample_vec = vectorizer.transform(sample_sentences)
sample_preds = model.predict(sample_vec)

print("\n--- Sample Predictions ---")
for sent, label in zip(sample_sentences, sample_preds):
    label_str = {0: "Negative", 1: "Neutral", 2: "Positive"}[label]
    print(f"{sent} --> {label_str}")


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the FiQA dataset
dataset = load_dataset("TheFinAI/fiqa-sentiment-classification")

# Convert to Pandas DataFrames
df_train = pd.DataFrame(dataset["train"])
df_val = pd.DataFrame(dataset["validation"])
df_test = pd.DataFrame(dataset["test"])

# Map sentiment scores to categories
def map_sentiment(score):
    if score < -0.05:
        return 0  # Negative
    elif score > 0.05:
        return 2  # Positive
    else:
        return 1  # Neutral

df_train["label"] = df_train["sentiment_score"].apply(map_sentiment)
df_val["label"] = df_val["sentiment_score"].apply(map_sentiment)
df_test["label"] = df_test["sentiment_score"].apply(map_sentiment)

# Merge train and validation sets for training
df_all_train = pd.concat([df_train, df_val], ignore_index=True)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Create a sentiment analysis pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


In [None]:
sample_sentences = [
    "The company reported record profits this quarter.",
    "Market volatility continues to concern investors.",
    "Revenue remained stable compared to last year."
]

# Get predictions for sample sentences
sample_preds = nlp(sample_sentences)

for sentence, pred in zip(sample_sentences, sample_preds):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {pred['label']} (Confidence: {pred['score']:.4f})\n")
