In [1]:
import ollama
import pandas as pd
import random
import re
from langchain_community.llms import Ollama
from langchain.schema import AIMessage, HumanMessage
from tqdm import tqdm
import os

In [2]:
# Load the LLM model using LangChain and Ollama.
# MODEL_NAME is the Ollama model tag handed to the LangChain wrapper; the trailing
# comment lists the tags that have been swapped in here.
# NOTE(review): presumably needs a running local Ollama server with this model
# already pulled — confirm before a fresh Restart & Run All.
MODEL_NAME = "deepseek-r1:1.5b"  # "llama3.2" , "deepseek-r1:1.5b" , "deepseek-r1:14b"
# Shared client used by generate_sms() for every generation request.
llm = Ollama(model=MODEL_NAME)

  llm = Ollama(model=MODEL_NAME)


In [3]:
def clean_message(raw_text):
    """Removes the AI's thought process enclosed in <think>...</think> tags.

    Args:
        raw_text: Raw model output, possibly containing one or more
            ``<think>...</think>`` reasoning blocks.

    Returns:
        The text with every reasoning block removed and surrounding whitespace
        stripped. A ``<think>`` tag that is opened but never closed (e.g. the
        generation was cut off mid-thought) is removed together with everything
        after it, so raw reasoning never leaks into the result.
    """
    # Non-greedy so separate blocks are removed one by one; the `\Z` alternative
    # lets an unterminated block run to the end of the text instead of surviving.
    return re.sub(
        r"<think>.*?(?:</think>|\Z)", "", raw_text, flags=re.DOTALL
    ).strip()

In [4]:
# Read the raw SMS corpus, keep only the label/text columns and give them readable names
spam_data = (
    pd.read_csv('../../Data/Raw/english_sms.csv', encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

def load_existing_data():
    """Returns the real messages from ``spam_data`` as ``(ham_list, spam_list)``.

    Any failure while reading the frame is reported on stdout and an empty
    pair of lists is returned instead of raising.
    """
    try:
        labels = spam_data["label"]
        by_label = {
            name: spam_data[labels == name]["message"].tolist()
            for name in ("ham", "spam")
        }
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return [], []
    return by_label["ham"], by_label["spam"]

In [5]:
def _build_sms_prompt(category, example):
    """Builds the generation prompt for one message, seeded with a real example."""
    if category == "ham":
        return (
            "You are an AI assistant that generates a **realistic ham SMS message**. "
            f"The message should be similar to: \"{example}\""
            "\n### Guidelines:\n"
            "1. Keep the message **short** (under 25 words) and **natural**.\n"
            "2. Use **casual, conversational language** that fits a real SMS between friends or family.\n"
            "3. Avoid using placeholders like [Name] or [Date]; instead, use **common names and realistic references**.\n"
            "4. The message should feel like a **genuine** conversation, with personal, relatable content.\n"
            "5. Do **not** include meta-comments, explanations, or unnecessary details.\n"
            "6. Wrap the generated message in **double quotes**, and do not add extra text.\n"
            "\nNow, generate the SMS message."
        )
    if category == "spam":
        return (
            "You are an AI assistant that generates a **realistic spam SMS message**. "
            f"The message should be similar to: \"{example}\""
            "\n### Guidelines:\n"
            "1. Keep the message **short** (under 25 words) and **realistic**.\n"
            "2. Use **casual, convincing language** that fits a real spam SMS someone might receive.\n"
            "3. Avoid using placeholders like [Name] or [Date]; instead, use **common names and realistic references**.\n"
            "4. The message should include a **promotional offer, prize claim, fake urgency, or financial lure**.\n"
            "5. Create a sense of urgency, but avoid using overtly fake or misleading terms like ‘fake’ or ‘scam’.\n"
            "6. Use a **realistic-sounding company name** or a vague sender for authenticity (e.g., ‘XYZ Corp.’ or ‘Promo Team’).\n"
            "7. Wrap the generated message in **double quotes**, and do not add extra text.\n"
            "\nNow, generate the SMS message."
        )
    raise ValueError("Invalid category. Choose 'ham' or 'spam'.")


def generate_sms(category="ham", num_samples=10, temperature=0.65, progress_bar=None):
    """Generates synthetic SMS messages based on real examples.

    Each message is produced by prompting the module-level ``llm`` with a
    randomly chosen real message of the same category, then stripping the
    model's ``<think>`` block via ``clean_message``.

    Args:
        category: ``"ham"`` or ``"spam"``.
        num_samples: Number of messages to generate.
        temperature: Sampling temperature forwarded to ``llm.invoke``.
        progress_bar: Optional object with an ``update(n)`` method; it is
            advanced by ``num_samples`` once the whole batch is done.

    Returns:
        A list of ``num_samples`` strings. A failed generation is reported on
        stdout and represented by the placeholder
        ``"[ERROR] Failed to generate message"`` so the list length always
        matches the request.

    Raises:
        ValueError: If ``category`` is neither ``"ham"`` nor ``"spam"``.
        IndexError: If ``num_samples > 0`` but no real examples of the
            requested category are available to sample from.
    """
    # Validate before doing any work, so a bad category fails fast even when
    # num_samples is 0 or the example pool is empty.
    if category not in ("ham", "spam"):
        raise ValueError("Invalid category. Choose 'ham' or 'spam'.")

    ham_examples, spam_examples = load_existing_data()
    # The pool does not change between iterations, so pick it once.
    examples = spam_examples if category == "spam" else ham_examples

    messages = []
    for _ in range(num_samples):
        prompt = _build_sms_prompt(category, random.choice(examples))
        try:
            response = llm.invoke(prompt, temperature=temperature).strip()
            messages.append(clean_message(response))
        except Exception as e:
            # Best effort: keep the batch going and mark the failed slot.
            print(f"Error generating message: {e}")
            messages.append("[ERROR] Failed to generate message")

    # Update the progress bar after the batch is complete. Compare against None
    # explicitly: a tqdm bar derives its truthiness from its total, so a plain
    # `if progress_bar:` can skip the update or fail for a valid bar.
    if progress_bar is not None:
        progress_bar.update(num_samples)

    return messages

In [6]:
def _generate_category(data, label, target, output_file, batch_size):
    """Tops up `data` with generated `label` messages until `target` rows exist, saving after every batch."""
    failure_marker = "[ERROR] Failed to generate message"  # placeholder emitted by generate_sms
    generated = int((data["label"] == label).sum())

    with tqdm(
        total=target,
        initial=min(generated, target),  # never start the bar beyond its total
        desc=f"Generating {label.capitalize()} Messages",
        unit="msg",
    ) as progress:
        while generated < target:
            try:
                batch = generate_sms(label, min(batch_size, target - generated))
                # Failed generations must not end up as labelled training rows.
                usable = [msg for msg in batch if msg != failure_marker]
                if len(usable) < len(batch):
                    print(f"Discarded {len(batch) - len(usable)} failed {label} generation(s).")
                if not usable:
                    # Nothing was produced; stop instead of retrying forever.
                    print(f"No usable {label} messages in the last batch; stopping early.")
                    break
                new_data = pd.DataFrame({
                    "message": usable,
                    "label": [label] * len(usable)
                })
                data = pd.concat([data, new_data], ignore_index=True)
                data.to_csv(output_file, index=False)  # Save progress incrementally
                generated += len(usable)
                progress.update(len(usable))
            except Exception as e:
                print(f"Error generating {label} messages: {e}")
                break

    return data


def generate_dataset(num_ham=500, num_spam=500, output_file="../../Data/Synthetic/synthetic_english_sms.csv", batch_size=2):
    """Generates a synthetic SMS dataset and saves it incrementally as a CSV file.

    If ``output_file`` already exists it is loaded first and only the missing
    number of ham/spam rows is generated, so an interrupted run can be resumed
    by calling the function again. The file is rewritten after every batch and
    once more, shuffled, at the end.

    Args:
        num_ham: Total number of ham rows the dataset should contain.
        num_spam: Total number of spam rows the dataset should contain.
        output_file: CSV path to load from and save to. Its parent directory
            is created if it does not exist.
        batch_size: Number of messages requested from ``generate_sms`` per
            batch (and therefore per incremental save). Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (an empty batch would
            never advance the generation loop).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    # Make sure the incremental saves have somewhere to go.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Initialize or load existing dataset
    if os.path.exists(output_file):
        data = pd.read_csv(output_file)
        print(f"Loaded existing dataset with {len(data)} samples.")
    else:
        data = pd.DataFrame(columns=["message", "label"])
        print("No existing dataset found. Starting fresh.")

    # Same procedure for both categories; ham first, then spam.
    for label, target in (("ham", num_ham), ("spam", num_spam)):
        data = _generate_category(data, label, target, output_file, batch_size)

    # Shuffle the dataset before saving
    data = data.sample(frac=1).reset_index(drop=True)
    data.to_csv(output_file, index=False)

    print(f"Dataset generation completed. Saved to {output_file}. Total samples: {len(data)}")

In [None]:
# Generate dataset: bring the saved CSV up to 600 ham and 600 spam messages.
# generate_dataset() reloads an existing output file and only generates the
# shortfall, so this long-running cell can be re-run to resume after an interruption.
generate_dataset(num_ham=600, num_spam=600)


Loaded existing dataset with 1020 samples.


Generating Ham Messages: 100%|██████████| 600/600 [08:52<00:00,  5.92s/msg]
Generating Spam Messages:  85%|████████▌ | 512/600 [00:11<08:28,  5.77s/msg]

In [None]:
# Reload the saved synthetic dataset from disk and display it for a quick visual check.
synthetic = pd.read_csv("../../Data/Synthetic/synthetic_english_sms.csv")
synthetic

Unnamed: 0,message,label
0,"""Where have u been hiding?"" \n""I was hiding i...",ham
1,"""I'm meeting you at [park/restaurant] with a l...",ham
2,"""Dear Matthew, call [09063440451] from a landl...",spam
3,"""Where are you? How long until we meet?""",ham
4,"""You have an important customer service announ...",spam
...,...,...
1015,"""Hi there! 🎉 Happy dancing with friends. All s...",ham
1016,"""URGENT! XYZ Corp. trying to contact John. Tod...",spam
1017,"""I just heard from [Friend's Name], and I trul...",ham
1018,"""Hi [Name], I just wanted to check if anyone w...",spam


In [None]:
# # Test improved prompt generation by generating one message per category
# def test_prompt_engineering():
#     print("Testing improved prompt engineering:")
#     ham_message = generate_sms("ham", num_samples=1)[0]
#     spam_message = generate_sms("spam", num_samples=1)[0]
    
#     print("\nGenerated Ham Message:")
#     print(ham_message)
#     print("\nGenerated Spam Message:")
#     print(spam_message)



# test_prompt_engineering()

In [None]:
# for i in range(len(synthetic)):
#     print(f"{i + 1}.[{synthetic['label'][i]}]: {synthetic['message'][i]}")