In [1]:
# Purpose: Import necessary libraries for data manipulation, API calls, and text processing.

# Standard library imports
import os
import random
import re

# Third-party library imports
import pandas as pd
from tqdm import tqdm
from google import genai
from langchain_community.llms import Ollama
from langchain.schema import AIMessage, HumanMessage

In [2]:
# Purpose: Load the Gemini API key from an environment variable.

# Load the API key from an environment variable
YOUR_API_KEY = os.environ.get("GEMINI_API_KEY")

if YOUR_API_KEY is None:
    raise ValueError("GEMINI_API_KEY environment variable not set.")

In [3]:
# Purpose: Initialize the Gemini API client and define the call_gemini function.

from google import genai
from google.genai import types

# Initialize Google Gemini API client
client = genai.Client(api_key=YOUR_API_KEY)


def call_gemini(prompt_text, temperature=0.7, max_output_tokens=500, top_p=0.95, top_k=40):
    """
    Call the Gemini API to generate content with configurable parameters.

    Args:
        prompt_text (str): The prompt text to send to the Gemini API.
        temperature (float, optional): Controls the randomness of the output. Defaults to 0.7.
        max_output_tokens (int, optional): The maximum number of tokens in the output. Defaults to 500.
        top_p (float, optional): Nucleus sampling parameter. Defaults to 0.95.
        top_k (int, optional): Top-K sampling parameter. Defaults to 40.

    Returns:
        str: The generated text response from the Gemini API.
    """
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",  # Use the desired model -lite
        contents=prompt_text,
        config=types.GenerateContentConfig(
            temperature=temperature,          # Controls randomness
            max_output_tokens=max_output_tokens,  # Maximum tokens in the output
            topP=top_p,                       # Nucleus sampling
            topK=top_k                        # Top-K sampling
        )
    )
    return response.text  # Extract the text response

In [4]:
# Purpose: Load and preprocess the existing spam dataset.

import os

# Constants
DATASET_PATH = os.path.join('..', '..', '..', 'Data', 'English', '1. Main', 'english_sms.csv')
TEXT_COLUMN = 'v2'
CLASS_COLUMN = 'v1'
LABEL_COLUMN = 'label'
MESSAGE_COLUMN = 'message'

# Load and preprocess spam dataset
spam_data = pd.read_csv(DATASET_PATH, encoding='latin1')
spam_data = spam_data[["v1","v2"]]
spam_data = spam_data[[TEXT_COLUMN, CLASS_COLUMN]].rename(columns={CLASS_COLUMN: LABEL_COLUMN, TEXT_COLUMN: MESSAGE_COLUMN})


def load_existing_data():
    """
    Loads spam and ham messages from the dataset.

    Returns:
        tuple: A tuple containing two lists:
            - ham_samples (list): A list of ham messages.
            - spam_samples (list): A list of spam messages.
    """
    try:
        ham_samples = spam_data[spam_data[LABEL_COLUMN] == "ham"][MESSAGE_COLUMN].tolist()
        spam_samples = spam_data[spam_data[LABEL_COLUMN] == "spam"][MESSAGE_COLUMN].tolist()
        return ham_samples, spam_samples
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return [], []

In [5]:
# Revised generate_sms_batch function using the previous prompt template

def generate_sms_batch(category: str = "ham", num_samples: int = 10, messages_per_request: int = 5, temperature: float = 0.95):
    """
    Generates synthetic SMS messages in batches using the previous prompt template.
    Note: Text processing is removed and will be applied later on the generated dataset.

    Args:
        category (str, optional): The category of SMS messages ("ham" or "spam"). Defaults to "ham".
        num_samples (int, optional): Total number of messages to generate. Defaults to 10.
        messages_per_request (int, optional): Number of messages per API request. Defaults to 5.
        temperature (float, optional): Temperature parameter for text generation. Defaults to 0.95.

    Returns:
        List[str]: List of generated SMS messages.
    """
    ham_examples, spam_examples = load_existing_data()
    messages = []
    recent_messages = set()  # To prevent immediate repetition

    for i in range(0, num_samples, messages_per_request):
        example = random.choice(spam_examples if category == "spam" else ham_examples)
        show_example = (random.random() < 1.0)
        example_text = f"Here is an example of a {category} message: \"{example}\"" if show_example else "The messages should be natural and relevant."

        base_prompt = (
            f"You are an AI assistant tasked with generating **five realistic and natural SMS messages ({category}) in English**.\n"
            "Only generate the SMS messages without any additional text, numbering, or explanations.\n"
            f"{example_text}\n\n### Instructions:\n"
            "1. Keep each message **natural**.\n"
            "2. Use **conversational and diverse language**.\n"
            "3. Write each message **on a separate line**.\n"
            "4. Do not include any extra text, numbering, or explanations.\n"
        )

        if category == "ham":
            prompt = (
                base_prompt +
                "5. Messages can include **reminders, invitations, jokes, casual conversations, daily updates, motivational quotes, fun facts, announcements, or storytelling snippets**.\n"
                "6. Ensure the messages feel like part of a **real conversation**, not just random sentences.\n"
                "7. Use diverse topics such as **daily life, sports, food, travel, movies, family, technology, work, or school**.\n"
                "\nNow generate five natural SMS messages."
            )
        elif category == "spam":
            prompt = (
                base_prompt +
                "5. Messages should include **smart advertisements, promotional offers, lottery notifications, suspicious banking alerts, or loan offers**.\n"
                "6. Avoid overusing clichés like \"Congratulations, you won!\"; aim for **more creative and persuasive text**.\n"
                "7. Avoid being overly fake or repetitive.\n"
                "8. Use minimal emojis or symbols .\n"
                "\nNow generate five SMS messages."
            )
        else:
            raise ValueError("Invalid category. Choose 'ham' or 'spam'.")

        # Call the Gemini API using the constructed prompt
        raw_response = call_gemini(prompt, temperature=temperature)
        new_messages = raw_response.strip().split("\n")

        # Append the raw messages without processing
        for msg in new_messages:
            stripped_msg = msg.strip()
            if stripped_msg and stripped_msg not in recent_messages:
                messages.append(stripped_msg)
                recent_messages.add(stripped_msg)

    return messages

In [6]:
# Purpose: Generate a synthetic SMS dataset and save it incrementally as a CSV file.

import os
import pandas as pd
from tqdm import tqdm
from typing import List

def generate_dataset(num_ham: int = 500, num_spam: int = 500, output_file: str = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv", batch_size: int = 10) -> None:
    """
    Generates a synthetic SMS dataset and saves it incrementally as a CSV file.

    Args:
        num_ham (int, optional): The number of ham messages to generate. Defaults to 500.
        num_spam (int, optional): The number of spam messages to generate. Defaults to 500.
        output_file (str, optional): The path to the output CSV file. Defaults to "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv".
        batch_size (int, optional): The number of messages to generate in each batch. Defaults to 10.
    """

    # Initialize or load existing dataset
    if os.path.exists(output_file):
        try:
            data = pd.read_csv(output_file)
            print(f"Loaded existing dataset with {len(data)} samples.")
        except Exception as e:
            print(f"Error loading existing dataset: {e}. Starting with an empty dataset.")
            data = pd.DataFrame(columns=["message", "label"])
    else:
        data = pd.DataFrame(columns=["message", "label"])

    # Track progress
    ham_generated = len(data[data["label"] == "ham"])
    spam_generated = len(data[data["label"] == "spam"])

    # Generate ham messages
    with tqdm(total=num_ham, initial=ham_generated, desc="Generating Ham Messages", unit="msg") as ham_progress:
        while ham_generated < num_ham:
            ham_batch = generate_sms_batch(category="ham", num_samples=min(batch_size, num_ham - ham_generated),temperature=0.8)
            new_ham_data = pd.DataFrame({"message": ham_batch, "label": ["ham"] * len(ham_batch)})
            data = pd.concat([data, new_ham_data], ignore_index=True)
            ham_generated += len(ham_batch)
            ham_progress.update(len(ham_batch))

    # Generate spam messages
    with tqdm(total=num_spam, initial=spam_generated, desc="Generating Spam Messages", unit="msg") as spam_progress:
        while spam_generated < num_spam:
            spam_batch = generate_sms_batch(category="spam", num_samples=min(batch_size, num_spam - spam_generated), temperature=0.8)
            new_spam_data = pd.DataFrame({"message": spam_batch, "label": ["spam"] * len(spam_batch)})
            data = pd.concat([data, new_spam_data], ignore_index=True)
            spam_generated += len(spam_batch)
            spam_progress.update(len(spam_batch))

    # Shuffle the dataset before saving
    data = data.sample(frac=1).reset_index(drop=True)

    # Save the dataset to a CSV file
    try:
        data.to_csv(output_file, index=False)
        print(f"Dataset generation completed. Saved to {output_file}. Total samples: {len(data)}")
    except Exception as e:
        print(f"Error saving dataset to {output_file}: {e}")

In [7]:
# Purpose: Generate the synthetic SMS dataset.

# Define the output file path
output_file = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv"
# Check if the output directory exists
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")
    except Exception as e:
        print(f"Error creating output directory: {e}. Please ensure the path is valid.")
        # Optionally, you can exit the script here if directory creation fails
        # return

# Generate dataset
generate_dataset(num_ham=1040, num_spam=1040, output_file=output_file)

Loaded existing dataset with 2011 samples.


Generating Ham Messages: 1044msg [00:18,  3.88msg/s]                         
Generating Spam Messages: 1042msg [00:01,  3.06msg/s]                 

Dataset generation completed. Saved to ../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv. Total samples: 2086





In [8]:
# Purpose: Read the generated synthetic SMS dataset.

# Define the output file path
output_file = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv"

# Read the synthetic dataset
try:
    synthetic = pd.read_csv(output_file)
    print(f"Successfully loaded synthetic dataset from {output_file} with {len(synthetic)} rows.")
    print(synthetic)  # Display the first few rows
except FileNotFoundError:
    print(f"Error: The file {output_file} was not found. Please ensure the dataset has been generated.")
except Exception as e:
    print(f"An error occurred while reading the dataset: {e}")

Successfully loaded synthetic dataset from ../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv with 2086 rows.
                                                message label
0     Hey! Just saw your recent Instagram post. Thin...  spam
1     Urgent! Your loan application is approved. Get...  spam
2     Want to grab pizza and watch the game tonight?...   ham
3     🔥💰 Limited-time offer: Upgrade your phone for ...  spam
4     You've been selected as a finalist for our $10...  spam
...                                                 ...   ...
2081  Crazy weather, huh? Hope you're having a good ...   ham
2082  Hey Chris Martinez, your ticket number has won...  spam
2083   On my way to the gym, wanna join? Leg day today!   ham
2084  Big presentation went well! Time to celebrate ...   ham
2085  Hey, still keen for the game tonight? Kick off...   ham

[2086 rows x 2 columns]


In [9]:
# Purpose: Extract the SMS messages from the synthetic dataset into a list.

# Extract messages from the synthetic dataset
if not synthetic.empty:
    messages = synthetic["message"].tolist()
    print(f"Extracted {len(messages)} messages from the synthetic dataset.")
else:
    print("The synthetic dataset is empty. No messages were extracted.")
    messages = []

Extracted 2086 messages from the synthetic dataset.


.

.

.

.

.


### lets define some function  to prcec the output samples

In [10]:
# Purpose: Consolidate all text processing functions into a single cell.

import re
from typing import List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def clean_message(raw_text: str) -> str:
    """
    Removes the AI's thought process enclosed in <think>...</think> tags.

    Args:
        raw_text (str): The input text that may contain <think>...</think> tags.

    Returns:
        str: The cleaned text with the <think>...</think> tags removed.
    """
    return re.sub(r"<think>.*?</think>", "", raw_text, flags=re.DOTALL).strip()

def clean_numbered_messages(message: str) -> str:
    """
    Remove numbers and dots at the beginning of an English SMS message.

    Args:
        message (str): The input message containing numbers and dots.

    Returns:
        str: The cleaned message without leading numbers and dots.
    """
    return re.sub(r"^\d+\.\s*", "", message).strip()

def remove_numbers_from_messages(message: str) -> str:
    """
    Remove numbers and associated punctuation at the beginning of an English SMS message.

    Args:
        message (str): The input message containing numbers and punctuation.

    Returns:
        str: The cleaned message without leading numbers and punctuation.
    """
    return re.sub(r"^\s*\d+[.\)\-:\s]?\s*", "", message).strip()

def replace_placeholders(message: str) -> str:
    """
    Replace placeholders in the generated English SMS message with realistic values.

    Args:
        message (str): The input message containing placeholders.

    Returns:
        str: The message with placeholders replaced by realistic values.
    """
    import random

    fake_phone = f"+1-{random.randint(200, 999)}-{random.randint(200, 999)}-{random.randint(1000, 9999)}"
    fake_account_number = f"{random.randint(4000, 4999)}-{random.randint(1000, 9999)}-{random.randint(1000, 9999)}-{random.randint(1000, 9999)}"
    fake_url = f"https://www.{random.choice(['example', 'myshop', 'secure', 'bestdeals', 'onlinestore'])}{random.randint(1, 999)}.com"
    fake_website_name = f"{random.choice(['example', 'myshop', 'secure', 'bestdeals', 'onlinestore'])}{random.randint(1, 999)}.com"
    fake_store_name = f"Store {random.choice(['USA', 'Online', 'Global', 'Tech', 'Mart', 'Shop', 'Express', 'Depot', 'Bazaar', 'Outlet'])}"
    fake_name = f"{random.choice(['John', 'Jane', 'Alex', 'Emily', 'Chris', 'Taylor', 'Jordan', 'Morgan', 'Sam', 'Casey'])} {random.choice(['Smith', 'Johnson', 'Brown', 'Williams', 'Jones', 'Garcia', 'Miller', 'Davis', 'Martinez', 'Hernandez'])}"
    fake_telegram_link = f"https://t.me/fakechannel{random.randint(1, 10)}"
    fake_code = f"{random.randint(1000, 9999)}"

    replacements = {
        "[FAKE_LINK]": fake_url,
        "[PHISHING_LINK]": fake_url,
        "[LINK]": fake_url,
        "[Link]": fake_url,
        "[link]": fake_url,
        "[Name]": fake_name,
        "[name]": fake_name,
        "[store name]": fake_store_name,
        "[Store Name]": fake_store_name,
        "[shop name]": fake_store_name,
        "[Shop Name]": fake_store_name,
        "[ACCOUNT_NUMBER]": fake_account_number,
        "[WEBSITE_NAME]": fake_website_name,
        "[website address]": fake_website_name,
        "[Link to Site]": fake_url,
        "[link to site]": fake_url,
        "[website link]": fake_website_name,
        "[STORE_LINK]": fake_url,
        "[WEBSITE_LINK]": fake_url,
        "[SITE_LINK]": fake_url,
        "[INSTITUTE_NAME]": fake_store_name,
        "[SITE_ADDRESS]": fake_url,
        "[REGISTRATION_LINK]": fake_url,
        "[TELEGRAM_LINK]": fake_telegram_link,
        "[FAKE_PHONE_NUMBER]": fake_phone,
        "[PHONE_NUMBER]": fake_phone,
        "[number]": fake_phone,
        "[PHONE]": fake_phone,
        "[SHORT_LINK]": fake_url,
        "[ADDRESS]": fake_url,
        "[LOAN_LINK]": fake_url,
        "[PURCHASE_LINK]": fake_url,
        "[ACTIVATION_CODE]": fake_code,
        "[PROMOTIONAL_LINK]": fake_url,
        "[SHORTENED_LINK]": fake_url,
        "[FAKE_LINK]": fake_url,
        "[REGISTRATION_LINK]": fake_url,
        "[SCAM_LINK]": fake_url,
        "[INVALID_LINK]": fake_url,
        "[FAKE_CODE]": fake_code,
        "[DISCOUNT_CODE]": fake_code,
        "[VERIFICATION_CODE]": fake_code,
        "[AUTHENTICATION_CODE]": fake_code,
        "[ACTIVATION_CODE]": fake_code,
        "[CONTACT_NUMBER]": fake_phone,
        "[PHONE]": fake_phone,
        "[STORE_NAME]": fake_store_name,
        "[ADDRESS/CONTACT]": fake_store_name,
    }

    for placeholder, replacement in replacements.items():
        message = message.replace(placeholder, replacement)

    return message

from typing import List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def identify_similar_messages(messages: List[str], threshold: float = 0.8) -> Tuple[List[str], List[Tuple[str, str, float]], List[int]]:
    """
    Identifies and removes pairs of messages that are too similar to each other based on cosine similarity.

    Args:
        messages (List[str]): A list of SMS messages.
        threshold (float, optional): The similarity threshold. Defaults to 0.8.

    Returns:
        Tuple[List[str], List[Tuple[str, str, float]], List[int]]: A tuple containing:
            - A list of unique messages.
            - A list of similar message pairs with their similarity scores.
            - A list of indices of the unique messages in the original list.
    """
    # Vectorize the messages using TF-IDF
    vectorizer = TfidfVectorizer().fit_transform(messages)
    similarity_matrix = cosine_similarity(vectorizer)

    similar_pairs = []
    unique_messages = []
    unique_indices = []  # Keep track of the indices of unique messages

    for i, message in enumerate(messages):
        is_unique = True
        for unique_index in unique_indices:
            # Calculate cosine similarity between the current message and the unique message
            similarity = similarity_matrix[i, unique_index]

            # If the similarity is above the threshold, mark the current message as not unique
            if similarity > threshold:
                similar_pairs.append((messages[i], messages[unique_index], similarity))
                is_unique = False
                break

        # If the message is unique, add it to the list of unique messages and its index to the list of unique indices
        if is_unique:
            unique_messages.append(messages[i])
            unique_indices.append(i)

    return unique_messages, similar_pairs, unique_indices

.



.



.



.



.


#### now let's execute them on the data

In [11]:
# Revised cell: Process synthetic SMS text with progress bars and detailed deletion logs

import pandas as pd
import numpy as np
from tqdm import tqdm

def process_synthetic_text_with_progress(df):
    """
    Applies text processing functions to the 'message' column with progress bars.
    Handles missing values, identifies similar messages, and saves the cleaned dataset.

    Args:
        df (pd.DataFrame): DataFrame containing the synthetic SMS messages.

    Returns:
        pd.DataFrame: DataFrame with processed messages.
    """
    # Step 1: Handle missing values
    print("Step 1: Handling missing values...")
    with tqdm(total=len(df), desc="Filling NaN values", unit="msg") as pbar:
        df["message"] = df["message"].fillna("")
        pbar.update(len(df))

    # Step 2: Apply cleaning functions
    print("Step 2: Cleaning messages...")
    with tqdm(total=len(df), desc="Cleaning messages", unit="msg") as pbar:
        df["message"] = df["message"].apply(
            lambda x: replace_placeholders(
                remove_numbers_from_messages(
                    clean_message(str(x))
                )
            )
        )
        pbar.update(len(df))

    # Step 3: Identify and remove similar messages
    print("Step 3: Identifying and removing similar messages...")
    messages = df["message"].tolist()
    unique_messages, similar_pairs, unique_indices = identify_similar_messages(messages, threshold=0.75)

    # Log similar messages that will be removed
    if similar_pairs:
        print("Similar messages identified for removal:")
        for msg1, msg2, similarity in similar_pairs:
            print(f"Message 1: {msg1}")
            print(f"Message 2: {msg2}")
            print(f"Similarity: {similarity:.4f}")
            print("-" * 50)

    # Progress bar for removing similar messages
    with tqdm(total=len(messages) - len(unique_messages), desc="Removing similar messages", unit="msg") as pbar:
        df = df.iloc[unique_indices].copy()
        pbar.update(len(messages) - len(unique_messages))

    # Reset index after processing
    df.reset_index(drop=True, inplace=True)
    return df


# Define the path to the synthetic dataset CSV file
input_file = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv"
output_file = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv"  # New output file

# Load the synthetic dataset, process the text, and save it to a new file
try:
    synthetic_df = pd.read_csv(input_file)
    print(f"Loaded synthetic dataset with {len(synthetic_df)} rows.")
    print("Sample before processing:")
    print(synthetic_df.head())

    processed_df = process_synthetic_text_with_progress(synthetic_df.copy())

    # Save the processed dataset to a NEW CSV file
    processed_df.to_csv(output_file, index=False)
    print(f"Synthetic dataset text processed and saved successfully to {output_file}")

except FileNotFoundError:
    print(f"Error: Input file not found: {input_file}")
except Exception as e:
    print(f"Error processing synthetic dataset: {e}")

Loaded synthetic dataset with 2086 rows.
Sample before processing:
                                             message label
0  Hey! Just saw your recent Instagram post. Thin...  spam
1  Urgent! Your loan application is approved. Get...  spam
2  Want to grab pizza and watch the game tonight?...   ham
3  🔥💰 Limited-time offer: Upgrade your phone for ...  spam
4  You've been selected as a finalist for our $10...  spam
Step 1: Handling missing values...


Filling NaN values: 100%|██████████| 2086/2086 [00:00<?, ?msg/s]


Step 2: Cleaning messages...


Cleaning messages: 100%|██████████| 2086/2086 [00:00<00:00, 57574.18msg/s]

Step 3: Identifying and removing similar messages...





Similar messages identified for removal:
Message 1: Did you see the game last night?! Crazy finish!
Message 2: Hey, did you see the game last night?! Crazy finish!
Similarity: 0.9778
--------------------------------------------------
Message 1: Ugh, my phone died. Can you send me the address again?
Message 2: Ugh, my phone died. Can you send me the address again?
Similarity: 1.0000
--------------------------------------------------
Message 1: Hey, did you finish that report yet?
Message 2: Did you finish that report yet?
Similarity: 0.9680
--------------------------------------------------
Message 1: Hey, did you see that crazy play last night?!
Message 2: Hey, did you see that crazy play last night?!
Similarity: 1.0000
--------------------------------------------------
Message 1: Hey, still up for dinner tonight? Thinking Italian!
Message 2: Hey, still up for dinner tonight? Thinking Italian?
Similarity: 1.0000
--------------------------------------------------
Message 1: Heads up, th

Removing similar messages: 100%|██████████| 36/36 [00:00<?, ?msg/s]

Synthetic dataset text processed and saved successfully to ../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv





In [12]:
synthetic = pd.read_csv("../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv")
synthetic

Unnamed: 0,message,label
0,Hey! Just saw your recent Instagram post. Thin...,spam
1,Urgent! Your loan application is approved. Get...,spam
2,Want to grab pizza and watch the game tonight?...,ham
3,🔥💰 Limited-time offer: Upgrade your phone for ...,spam
4,You've been selected as a finalist for our $10...,spam
...,...,...
2045,"Crazy weather, huh? Hope you're having a good ...",ham
2046,"Hey Chris Martinez, your ticket number has won...",spam
2047,"On my way to the gym, wanna join? Leg day today!",ham
2048,Big presentation went well! Time to celebrate ...,ham


In [13]:
for i in range(len(synthetic)):
    print(f"{i + 1}.[{synthetic['label'][i]}]: {synthetic['message'][i]}")

1.[spam]: Hey! Just saw your recent Instagram post. Thinking of upgrading your phone? Check out our deals: https://www.onlinestore276.com
2.[spam]: Urgent! Your loan application is approved. Get funds today. Apply here: https://www.onlinestore256.com
3.[ham]: Want to grab pizza and watch the game tonight? My treat!
4.[spam]: 🔥💰 Limited-time offer: Upgrade your phone for only $99! Shop now before they're gone: https://www.onlinestore256.com
5.[spam]: You've been selected as a finalist for our $1000 shopping spree! Complete your registration to claim your prize: [fake prize website]
6.[ham]: Hey, did you remember to DVR the game tonight? I'm gonna be late getting home.
7.[ham]: That meeting got pushed back to Thursday, FYI.
8.[spam]: Claim your free gift card worth $100! Click here to redeem: https://www.secure893.com Offer ends soon! 🔥
9.[spam]: You've been selected for a prize draw! Claim your reward: https://www.onlinestore91.com
10.[ham]: OMG, you won't believe what happened at the c

In [14]:
input_file = "../../../Data/English/2. Synthetic/synthetic_english_sms_gemini.csv"

synthetic = pd.read_csv(input_file, encoding='latin1')

In [15]:
synthetic

Unnamed: 0,message,label
0,Hey! Just saw your recent Instagram post. Thin...,spam
1,Urgent! Your loan application is approved. Get...,spam
2,Want to grab pizza and watch the game tonight?...,ham
3,ð¥ð° Limited-time offer: Upgrade your phon...,spam
4,You've been selected as a finalist for our $10...,spam
...,...,...
2045,"Crazy weather, huh? Hope you're having a good ...",ham
2046,"Hey Chris Martinez, your ticket number has won...",spam
2047,"On my way to the gym, wanna join? Leg day today!",ham
2048,Big presentation went well! Time to celebrate ...,ham
