# Data Processing

This document outlines the data processing for mapping categories, identifying financial transactions, and extracting dates from messages.


In [None]:
import pandas as pd


# Load the labelled SMS dataset that every later step reads and extends.
df = pd.read_csv(filepath_or_buffer='labeled_combined_sms_v2.csv')


## Step 1: Mapping `category_labels` to Parent Categories

Map specific `category_labels` values to generalized parent categories such as Financial, Promotional, and Governmental.

1. **Unique Labels Extraction:**
Extracts all unique values in the category_labels column for analysis.

In [19]:
# Collect every distinct raw label value found in the category_labels column.
unique_labels = pd.unique(df['category_labels'])

# Print one label per line so the full set can be scanned when building the mapping.
for raw_label in unique_labels:
    print(raw_label)

['Money/Financial', 'Expense']
['Other']
['Promotion', 'Advertising']
['Notification', 'Promotion']
['Money/Financial', 'Income']
['Notification']
['Notification', 'Health']
['Advertising']
['Notification', 'Other']
['Money/Financial', 'Expense', 'Notification']
['Notification', 'Promotion', 'Advertising']
['Notification', 'Advertising']
['Advertising', 'Promotion']
['Money/Financial', 'Expense', 'Other']
['Notification', 'Education']
['Promotion', 'Education']
['Government', 'Notification']
['Education']
['Notification', 'Education', 'Test/Exam']
['Advertising', 'Other']
['Health', 'Promotion', 'Advertising']
['Notification', 'Test/Exam']
['Health']
['Promotion', 'Other']
['Education', 'Notification']
['Notification', 'Security']
['Health', 'Promotion']
['Notification', 'Appointment']
['Notification', 'Event']
['Education', 'Promotion']
['Notification', 'Government']
['Notification', 'Test/Exam', 'Education']
['Health', 'Other']
['Education', 'Advertising']
['Government', 'Other']
['P

2. **Category Mapping:**
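A dictionary `category_mapping` maps each parent category to the list of raw `category_labels` strings that belong to it. Labels are matched as exact strings; a label that appears in none of the lists is assigned the category `Unknown`.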

In [20]:
# Parent category -> list of raw `category_labels` strings that belong to it.
# Each entry is the exact text stored in the column (a stringified list), so
# bracket, quote and spacing differences all matter when matching.
#
# NOTE(review): a few entries are listed under two parents. With a lookup that
# returns the first parent (in the order written here) whose list contains the
# label, the later occurrence can never be selected. They are flagged inline
# and left in place so the intended parent can be confirmed.
category_mapping = {
    "Financial": [
        "['Money/Financial']", "['Money/Financial', 'Expense']",
        "['Money/Financial', 'Income']", "['Money/Financial', 'Savings']",
        "['Money/Financial', 'Investments']", "['Money/Financial', 'Transaction']",
        "['Money/Financial', 'Expense', 'Notification']", "['Money/Financial', 'Expense', 'Travel']",
        "['Money/Financial', 'Donation']", "['Money/Financial', 'Expense', 'Government']",
        "['Money/Financial', 'Expense', 'Other']",
        "['Money/Financial', 'Notification']", "['Money/Financial', 'Income', 'Notification']", "['Money/Financial', 'Transfer']", "['Money/Financial', 'Expense', 'Advertising']",
        "['Money/Financial', 'Income', 'Advertising']", "['Money/Financial', 'Expense', 'Promotion']", "['Money/Financial', 'Income', 'Promotion']", "['Money/Financial', 'Expense', 'Notification', 'Advertising']", "['Money/Financial', 'Promotion', 'Advertising']",
        "['Money/Financial', 'Expense', 'Promotion', 'Advertising']", "['Money/Financial', 'Notification', 'Promotion']", "['Money/Financial', 'Promotion']",
        "['Notification', 'Money/Financial']", "['Notification', 'Money/Financial', 'Loans']", "['Money/Financial', 'Investment']", "['Money/Financial', 'Expense', 'Notification', 'Government']",
        "['Money/Financial', 'Notification', 'Government']"
    ],
    "Promotional": [
        "['Promotion', 'Advertising']", "['Notification', 'Promotion']",
        "['Promotion', 'Notification']", "['Notification', 'Advertising']",
        "['Promotion', 'Event']", "['Promotion', 'Travel']",
        "['Promotion', 'Education']", "['Promotion', 'Health', 'Education']", "['Notification', 'Promotion', 'Advertising']", "['Advertising', 'Promotion']",
        "['Advertising', 'Other']", "['Promotion', 'Advertising', 'Notification']", "['Promotion', 'Other']", "['Notification', 'Event']", "['Promotion', 'Notification', 'Expense']", "['Notification', 'Advertising', 'Promotion']", "['Advertising', 'Notification']",
        "['Notification', 'Promotion', 'Other']", "['Promotion', 'Government']", "['Promotion', 'Advertising', 'Government']", "['Promotion', 'Notification', 'Event']",
        "['Notification', 'Promotion', 'Travel']", "['Promotion', 'Notification', 'Education']", "['Promotion', 'Advertising', 'Travel']", "['Promotion', 'Investment']"
    ],
    "Governmental": [
        "['Government']", "['Government', 'Notification']",
        "['Government', 'Health']", "['Government', 'Other']",
        "['Government', 'Notification', 'Other']", "['Government', 'Notification', 'Money/Financial', 'Expense']", "['Notification', 'Government']", "['Government', 'Notification', 'Test/Exam']",
        "['Notification', 'Government', 'Other']", "['Advertising', 'Government']", "['Government', 'Advertising']"
    ],
    "Telecommunications": [
        "['Notification']", "['Notification', 'Other']",
        "['Notification', 'Travel']", "['Notification', 'Emergency']", "['Notification', 'Security']", "['Notification', 'Appointment']", "['Notification', 'Delivery']"
    ],
    "Education": [
        "['Education']", "['Education', 'Notification']",
        "['Education', 'Promotion']", "['Education', 'Notification', 'Advertising']",
        "['Education', 'Notification', 'Test/Exam']", "['Education', 'Notification', 'Appointment']", "['Notification', 'Education']", "['Notification', 'Test/Exam']",
        # A copy of the next entry without its opening "[" used to sit here; it
        # could never equal a real label, so only the well-formed entry is kept.
        "['Notification', 'Education', 'Test/Exam']", "['Notification', 'Test/Exam', 'Education']", "['Education', 'Advertising']",
        "['Education', 'Promotion', 'Advertising']", "['Notification', 'Government', 'Education']", "['Notification', 'Education', 'Appointment']",
        "['Advertising', 'Education']", "['Education', 'Other']", "['Notification', 'Appointment', 'Education']"
    ],
    "Travel": [
        # NOTE(review): "['Notification', 'Travel']" is also listed under
        # "Telecommunications" above, which comes first.
        "['Travel', 'Promotion']", "['Notification', 'Travel']",
        "['Promotion', 'Travel', 'Advertising']", "['Notification', 'Travel', 'Promotion']", "['Notification', 'Travel', 'Other']",
        "['Travel', 'Notification']"
    ],
    "Health": [
        "['Health']", "['Health', 'Promotion']", "['Notification', 'Health']",
        "['Health', 'Notification']", "['Health', 'Government']",
        "['Health', 'Advertising']", "['Health', 'Government', 'Notification', 'Advertising']", "['Health', 'Promotion', 'Advertising']", "['Health', 'Other']",
        "['Notification', 'Health', 'Appointment']", "['Promotion', 'Education', 'Health']", "['Government', 'Health', 'Notification']", "['Advertising', 'Health']", "['Health', 'Government', 'Other']", "['Health', 'Notification', 'Advertising']", "['Government', 'Health', 'Notification', 'Advertising']",
        "['Health', 'Government', 'Notification']", "['Notification', 'Health', 'Test/Exam']", "['Health', 'Notification', 'Test/Exam']", "['Health', 'Notification', 'Education']",
        "['Health', 'Notification', 'Government']"
    ],
    "Other": [
        # NOTE(review): "['Notification', 'Emergency']" is also listed under
        # "Telecommunications" above, which comes first.
        "['Other']", "['Other', 'Notification']",
        "['Other', 'Advertising']", "['Notification', 'Emergency']", "['Other', 'Warning']",
    ],
    "Services or Stores": [
        # NOTE(review): "['Promotion', 'Advertising']" is also under
        # "Promotional" and "['Other']" is also under "Other"; both come first,
        # leaving "['Advertising']" as the only entry unique to this parent.
        "['Promotion', 'Advertising']", "['Advertising']",
        "['Other']"
    ]
}

# Resolve a raw category_labels value to its parent category
def map_category(category_labels):
    """Return the parent category whose label list contains ``category_labels``.

    Parents are checked in the order they appear in ``category_mapping`` and
    the first one that lists the value wins. The comparison is plain list
    membership, so the value has to equal a listed string exactly.

    Returns the string 'Unknown' when no parent lists the value.
    """
    matching_parents = (
        parent
        for parent, members in category_mapping.items()
        if category_labels in members
    )
    # next() yields the first match, or the fallback when the generator is empty.
    return next(matching_parents, 'Unknown')

# Derive the parent category of every row from its raw label value
df['Category'] = df['category_labels'].map(map_category)


## Step 2: Classifying Financial Transactions

Classify financial messages as **Income** or **Expense** based on keywords in the message content.

In [21]:
# Arabic and English keywords whose presence marks a financial message as
# income or expense. Matching ignores letter case (see classify_transaction).
income_keywords = ['ايداع', 'deposit', 'credited', 'Incoming', 'واردة',]
expense_keywords = ['الخصم','خصم',  'سحب', 'شراء', 'سداد', 'deduction', 'debited', 'withdrawn', 'debit', 'Purchase', 'صادرة']


# Function to classify messages
def classify_transaction(row):
    """Classify one row as 'Income', 'Expense' or 'None'.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Category' and 'Message Content'.

    Returns
    -------
    str
        'Income' if the row's category is 'Financial' and the message contains
        an income keyword; otherwise 'Expense' if it contains an expense
        keyword; otherwise the string 'None'. Rows whose message content is not
        a string (e.g. a missing value) are also 'None'.

    Notes
    -----
    Income keywords are tested first, so a message containing both kinds of
    keyword is reported as 'Income'. Keywords are matched as substrings and
    without regard to letter case, so 'Credited' and 'PURCHASE' are recognised.
    """
    if row['Category'] != 'Financial':
        return 'None'

    content = row['Message Content']
    # Missing or non-text content cannot be searched; treat it as unclassified
    # instead of letting the substring test raise TypeError.
    if not isinstance(content, str):
        return 'None'

    text = content.lower()
    if any(keyword.lower() in text for keyword in income_keywords):
        return 'Income'
    if any(keyword.lower() in text for keyword in expense_keywords):
        return 'Expense'
    return 'None'

# Classify row by row: axis='columns' passes each row to the function in turn
df['transaction_type'] = df.apply(classify_transaction, axis='columns')

## Step 3: Extracting Dates from Messages

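Extract dates from the **Message Content** column and save them into a new column named **Date**. Only the first date written as two digits, two digits and four digits separated by hyphens (for example `25-12-2023`) is picked up; messages without such a date get an empty value.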

In [22]:
import dateparser
import re

# Two digits, two digits and four digits separated by hyphens. The lookarounds
# keep the pattern from matching inside a longer run of digits (for example an
# account or reference number such as 123-45-67890).
_DATE_PATTERN = re.compile(r'(?<!\d)\d{2}-\d{2}-\d{4}(?!\d)')


# Function to extract date using regex
def extract_date_with_regex(content):
    """Return the first NN-NN-NNNN date found in ``content``, parsed by dateparser.

    Parameters
    ----------
    content : str
        Message text to search. Any non-string value (e.g. a missing value)
        yields ``None`` instead of raising.

    Returns
    -------
    The result of ``dateparser.parse`` for the first matching substring, or
    ``None`` when ``content`` is not a string or contains no match.
    """
    if not isinstance(content, str):
        return None

    match = _DATE_PATTERN.search(content)
    if match is None:
        return None

    # NOTE(review): for an ambiguous match such as 05-11-2023 the day/month
    # order is decided by dateparser for the given languages; confirm that it
    # agrees with how dates are written in the source messages.
    return dateparser.parse(match.group(), languages=['en', 'ar'])

# Run the extractor over every message and store the results as column 'Date'
df['Date'] = df['Message Content'].map(extract_date_with_regex)

## Save the data

In [None]:
# Write the enriched frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="Preprocessed_Data.csv", index=False)