In [1]:
import pandas as pd
import openai
from getpass import getpass
import time
import json
import re

In [2]:
# Load the SMS dataset; the labeling loop later reads its "Message Content" column.
df = pd.read_csv('../Data/combined_sms.csv')

In [3]:
# Ask for the key interactively with getpass so it is never written into the notebook source.
api_key = getpass("Enter your OpenAI API key: ")
# The openai module reads the key from this module-level attribute.
openai.api_key = api_key

Enter your OpenAI API key:  ········


In [4]:
MODEL = "gpt-4o-mini"  # chat model name passed as `model=` on every request in get_labels

In [13]:
# System prompt sent as the "system" message on every classification request
# (see get_labels). It defines the label taxonomy, the transaction and date
# fields to extract, the importance/spam rules, and the five-key JSON object
# the model is asked to return.
# NOTE(review): the few-shot examples emit "Appointment" and "Test/Exam"
# labels, which are not in the category list of section 1, and Example 3 omits
# the "Income" subcategory while Example 5 includes "Expense" — confirm which
# label set is intended before relying on `category_labels` downstream.
system_prompt = """
You are an advanced multilingual SMS classification system capable of processing and analyzing messages. Your task is to classify SMS messages, extract transaction details when present, and determine their importance and spam status. Additionally, extract appointment dates and test dates if mentioned.

### **Instructions**:
1. **Classification**:
   - Classify the message into one or more categories:
     - **Money/Financial**: Messages related to financial transactions. Subcategories include:
       - **Income**: Salary, bonuses, or other earnings.
       - **Expense**: Payments, bills, or purchases.
       - **Savings**: Deposits into savings accounts or investments.
       - **Investments**: Stock market, mutual funds, or other investments.
       - **Loans**: Loan disbursements, repayments, or related updates.
     - **Notification**: Messages that provide alerts, reminders, or updates (e.g., appointment reminders, delivery notifications).
     - **Promotion**: Messages offering discounts, deals, or special offers.
     - **Advertising**: Messages promoting products, services, or brands.
     - **Health**: Messages related to healthcare, appointments, prescriptions, or wellness.
     - **Travel**: Messages related to flights, hotels, or travel bookings.
     - **Education**: Messages related to courses, exams, or educational updates.
     - **Government**: Messages related to government services, public announcements, or official notifications.
     - **Other**: Messages that do not fit into any of the above categories.

2. **Transaction Extraction**:
   - If the message mentions a financial transaction, extract:
     - **Amount**: The financial amount mentioned in the message.
     - **Type**: Either "income," "expense," "savings," "investment," or "loan."
     - **Account**: Partial or full account number mentioned in the message.
     - **Date**: Extract the transaction date.
   - Return these details in a new key, `"transactions"`, in the output JSON.

3. **Appointment and Test Date Extraction**:
   - If the message mentions an appointment or test/exam date, extract:
     - **Appointment Date**: The date mentioned for appointments.
     - **Test Date**: The date mentioned for tests or exams.
   - Return these details in a new key, `"dates"`, in the output JSON.

4. **Importance and Spam**:
   - **Importance**: Mark as important (1) if the message contains financial transactions, appointments, or test dates. Otherwise, mark as not important (0).
   - **Spam**: Mark as spam (1) if the message is promotional or advertising. Otherwise, mark as not spam (0).

5. **Output Format**:
   - Always return the result in **valid JSON format** with these keys:
     - `"category_labels"`: List of categories.
     - `"is_important"`: Binary value (1 or 0).
     - `"is_spam"`: Binary value (1 or 0).
     - `"transactions"`: Transaction details if applicable; otherwise, `null`.
     - `"dates"`: Appointment or test dates if applicable; otherwise, `null`.

---

### **Examples**:
#### Example 1:
Input: "Your doctor's appointment is scheduled for 15-10-2023 at 10:00 AM."
Output: {
  "category_labels": ["Notification", "Appointment"],
  "is_important": 1,
  "is_spam": 0,
  "transactions": null,
  "dates": {"appointment_date": "15-10-2023"}
}

#### Example 2:
Input: "Your final exam for Math 101 is on 20-12-2023."
Output: {
  "category_labels": ["Notification", "Test/Exam"],
  "is_important": 1,
  "is_spam": 0,
  "transactions": null,
  "dates": {"test_date": "20-12-2023"}
}

#### Example 3:
Input: "Your account ******4521 has been credited with $1000 as salary. Date: 05-05-2023. Your next appointment is on 10-05-2023."
Output: {
  "category_labels": ["Money/Financial", "Notification", "Appointment"],
  "is_important": 1,
  "is_spam": 0,
  "transactions": {"amount": "$1000", "type": "income", "account": "******4521", "date": "05-05-2023"},
  "dates": {"appointment_date": "10-05-2023"}
}

#### Example 4:
Input: "Your lab test results are ready. Test date: 01-11-2023."
Output: {
  "category_labels": ["Notification", "Test/Exam"],
  "is_important": 1,
  "is_spam": 0,
  "transactions": null,
  "dates": {"test_date": "01-11-2023"}
}

#### Example 5:
Input: "Your bill payment of $50 is due tomorrow. Your next appointment is on 12-12-2023."
Output: {
  "category_labels": ["Money/Financial", "Expense", "Notification", "Appointment"],
  "is_important": 1,
  "is_spam": 0,
  "transactions": {"amount": "$50", "type": "expense", "account": null, "date": null},
  "dates": {"appointment_date": "12-12-2023"}
}

---

### **Advanced Notes**:
- If a message contains both a transaction date and an appointment/test date, both will be extracted and included in the output.
- If no date is found for appointments or tests, the `dates` field will be `null`.
- Use multilingual support to handle both English and Arabic content effectively.
"""

In [16]:
# Matches either a complete JSON string literal (left untouched) or a bare,
# unquoted object key that directly follows `{` or `,`. Consuming string
# literals first keeps text such as "10:00 AM" inside values from being
# mistaken for a key.
_STRING_OR_BARE_KEY = re.compile(
    r'"(?:\\.|[^"\\])*"|([{,]\s*)([^\W\d]\w*)(\s*:)'
)


def _quote_bare_key(match):
    """Wrap a bare object key in double quotes; return string literals as-is."""
    if match.group(2) is None:
        # The string-literal alternative matched: nothing to repair.
        return match.group(0)
    return f'{match.group(1)}"{match.group(2)}"{match.group(3)}'


def parse_json(json_str):
    """Parse a JSON document, repairing unquoted object keys if needed.

    Args:
        json_str: Text expected to contain a JSON document.

    Returns:
        The decoded Python object, or None when the input is not a string
        or cannot be decoded even after quoting bare keys.
    """
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass  # fall through to the repair attempt below
    except TypeError:
        # json.loads rejects non-text input such as None.
        return None

    if not isinstance(json_str, str):
        return None

    # Quote bare keys only; string values (which may contain colons) are
    # skipped by the regex instead of being rewritten.
    repaired = _STRING_OR_BARE_KEY.sub(_quote_bare_key, json_str)
    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        return None

def _default_labels():
    """Return a fresh fallback record for a message that could not be labeled."""
    return {
        "category_labels": [],
        "is_important": 0,
        "is_spam": 0,
        "transactions": None,
        "dates": None
    }


def _normalize_labels(parsed):
    """Project the model's JSON onto the expected keys, in a fixed order.

    Missing keys take the fallback value and unexpected keys are dropped, so
    every record has the same shape no matter how the model ordered or
    extended its output.
    """
    return {
        key: parsed.get(key, fallback)
        for key, fallback in _default_labels().items()
    }


def get_labels(message):
    """Classify one SMS message with the chat model and return its labels.

    Args:
        message: The SMS text to classify; it is interpolated into the user
            prompt as-is.

    Returns:
        A dict with exactly the keys "category_labels", "is_important",
        "is_spam", "transactions" and "dates", in that order. When the
        request fails or the reply holds no usable JSON object, the fallback
        record (empty labels, zeros, None) is returned instead of raising.
    """
    try:
        # Create the user prompt
        user_prompt = f"Now classify this message: {message}"

        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 client
        # interface — confirm the installed openai version still provides it.
        completion = openai.ChatCompletion.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=200,
            temperature=0
        )

        # Get the raw response
        raw_response = completion.choices[0].message.content
        print(f"Raw response: {raw_response}")

        # The reply may wrap the JSON in extra text, so take the span from
        # the first "{" to the last "}".
        json_match = re.search(r'\{[\s\S]*\}', raw_response)
        if json_match:
            parsed_json = parse_json(json_match.group(0))
            # An empty or non-dict result is treated as an invalid reply.
            if isinstance(parsed_json, dict) and parsed_json:
                return _normalize_labels(parsed_json)

        print(f"Invalid JSON response for message: {message}")
        return _default_labels()
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a batch
        # run, so report it and return the fallback record.
        print(f"Error processing message: {message}. Error: {e}")
        return _default_labels()

In [None]:
# Label every message in row order, collecting one result dict per row.
total_messages = len(df)
results = []
for index, message in df["Message Content"].items():
    print(f"Processing message {index + 1}/{total_messages}: {message}")
    labels = get_labels(message)
    results.append(labels)
    # Pause one second between requests (presumably to throttle API calls).
    time.sleep(1)

In [19]:
# Assigning a DataFrame to a list of columns pairs them by position, not by
# name, so build the frame with an explicit column order. Otherwise the result
# depends on the key order of the first result dict and breaks on extra keys.
label_columns = ["category_labels", "is_important", "is_spam", "transactions", "dates"]
df[label_columns] = pd.DataFrame(results, columns=label_columns)

In [20]:
# Preview the first rows to sanity-check the newly added label columns.
df.head()

Unnamed: 0,Sender,Message Content,category_labels,is_important,is_spam,transactions,dates
0,samba.,تم خصم مبلغ ٥٠٫٠٠ من حساب ******٤٥٢١ في ٠٥-٠٥-...,"[Money/Financial, Expense]",1,0,"{'amount': '٥٠٫٠٠', 'type': 'expense', 'accoun...",
1,607941,Your WhatsApp code is 614-968 but you can simp...,[Other],0,0,,
2,neqaty,"Dear Member, you have not redeemed any Neqaty ...","[Promotion, Advertising]",0,1,,
3,neqaty,عزيزي العميل، لقد مر 17 شهر و لم تقم باي عملية...,"[Notification, Promotion]",0,1,,
4,606006,"BIG SAVINGS! Now get 200 Mobily Minutes, 200 M...","[Promotion, Advertising]",0,1,,


In [57]:
# Keep the path in one place so the confirmation message always names the
# file that was actually written.
output_path = "../Data/labeled_combined_sms_v2.csv"
df.to_csv(output_path, index=False)
print(f"Labeling complete! Dataset saved to '{output_path}'.")

Labeling complete! Dataset saved to 'labeled_combined_sms.csv'.
