In [2]:
import pandas as pd
import csv
import re

In [3]:
# File locations used by this notebook (relative paths):
#   INPUT_CSV  - raw delivery-analytics export that gets read and labeled
#   OUTPUT_CSV - labeled NLU training set that gets written
INPUT_CSV = "../data/Ecommerce_Delivery_Analytics_New.csv"
OUTPUT_CSV = "../data/nlu_training_data.csv"

In [4]:
# Load the raw delivery data into `df`.
# A failed load prints a hint and then re-raises, so the notebook stops here
# instead of carrying on with `df` undefined (or left over from an earlier
# kernel run), which would only surface later as a confusing error.
try:
    df = pd.read_csv(INPUT_CSV)
except FileNotFoundError:
    print(f"Error: Input file '{INPUT_CSV}' not found.")
    print("Please check that the path is correct relative to the notebook's working directory.")
    raise
except Exception as e:
    print(f"An error occurred while reading the CSV: {e}")
    raise

In [5]:
def get_sentiment(rating):
    """
    Translate a service rating into a sentiment label.

    Ratings 1-2 map to 'negative', 3 to 'neutral' and 4-5 to 'positive'.
    Any other value (missing, out of range, non-numeric) yields 'unknown'.
    """
    # Buckets are checked in order; each pairs a label with the ratings it covers.
    buckets = (
        ('negative', (1, 2)),
        ('neutral', (3,)),
        ('positive', (4, 5)),
    )
    for label, scores in buckets:
        if rating in scores:
            return label
    # Nothing matched: unexpected or missing rating.
    return 'unknown'

# Ordered (intent, keywords) rules. The first rule with a keyword hit wins,
# so the most specific intents are listed first:
#   1. order content problems, 2. product quality,
#   3. delivery/service experience, 4. app/website experience.
_INTENT_KEYWORDS = (
    ('report_order_content_issue',
     ('missing', 'wrong item', 'damaged', 'expired')),
    ('comment_on_product_quality',
     ('quality', 'fresh', 'tasty', 'disappointed')),
    ('provide_feedback_on_service',
     ('delivery', 'service', 'fast', 'slow', 'quick', 'reliable', 'late', 'rude')),
    ('comment_on_platform_experience',
     ('easy to order', 'app', 'website', 'easy')),
)


def _keyword_pattern(keywords):
    """Compile one regex that matches any of `keywords` at the start of a word."""
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    # The leading \b stops keywords from matching in the middle of an unrelated
    # word ('app' in 'happy', 'fast' in 'breakfast', 'late' in 'chocolate'),
    # while still accepting inflected forms such as 'faster' or 'apps'.
    return re.compile(r'\b(?:' + alternatives + r')')


# Compiled once at definition time so each row costs at most four searches.
_INTENT_PATTERNS = tuple(
    (intent, _keyword_pattern(keywords)) for intent, keywords in _INTENT_KEYWORDS
)


def get_intent(row):
    """
    Assign a single, primary intent to a piece of feedback using keyword
    rules (heuristics).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys 'Customer Feedback' and 'Refund Requested'.
        Both values are converted with ``str()`` and lower-cased, so
        non-string values (such as NaN) are tolerated.

    Returns
    -------
    str
        'report_order_content_issue' if a refund was requested or a
        content keyword is present; otherwise the first matching of
        'comment_on_product_quality', 'provide_feedback_on_service',
        'comment_on_platform_experience'; otherwise
        'generic_unspecified_feedback'.

    Notes
    -----
    Keywords must start at a word boundary. This is still a heuristic:
    words that merely begin with a keyword (e.g. 'apple' for 'app') match.
    """
    feedback = str(row['Customer Feedback']).lower()
    # Strip so values like ' Yes ' are still recognised as a refund request.
    refund_requested = str(row['Refund Requested']).strip().lower()

    # A refund request always means an order-content issue, whatever the text says.
    if refund_requested == 'yes':
        return 'report_order_content_issue'

    for intent, pattern in _INTENT_PATTERNS:
        if pattern.search(feedback):
            return intent

    # Fallback: no specific intent was detected.
    return 'generic_unspecified_feedback'

In [8]:
def create_labeled_dataset(df):
    """
    Label each feedback row of `df` and save the result to OUTPUT_CSV.

    For every row the sentiment is derived from 'Service Rating' and the
    intent from the row itself; the feedback text has runs of whitespace
    collapsed to single spaces. Rows with empty/non-string feedback or an
    'unknown' sentiment are left out. Progress and any write error are
    reported via print; nothing is returned.
    """
    print(f"Starting labeling process for '{INPUT_CSV}'...")

    records = []
    for _, record in df.iterrows():
        sentiment_label = get_sentiment(record['Service Rating'])
        intent_label = get_intent(record)

        # Non-string feedback (e.g. NaN) is treated as empty text.
        raw_text = record['Customer Feedback']
        usable_text = raw_text if isinstance(raw_text, str) else ""

        # Collapse newlines/tabs/repeated spaces into single spaces.
        cleaned = re.sub(r'\s+', ' ', usable_text).strip()

        # Skip rows that carry no feedback or no usable sentiment.
        if not cleaned or sentiment_label == 'unknown':
            continue

        records.append({
            'text': cleaned,
            'intent': intent_label,
            'sentiment': sentiment_label
        })

    print(f"Processed {len(df)} rows.")
    print(f"Created {len(records)} labeled training examples.")

    # Persist the labeled examples as CSV with a header row.
    columns = ['text', 'intent', 'sentiment']
    try:
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            for entry in records:
                csv_writer.writerow(entry)
        print(f"\nSuccessfully created labeled dataset: '{OUTPUT_CSV}'")
    except Exception as exc:
        print(f"An error occurred while writing the new CSV: {exc}")

In [9]:
# Label the loaded DataFrame and write the training examples to OUTPUT_CSV.
create_labeled_dataset(df)

Starting labeling process for '../data/Ecommerce_Delivery_Analytics_New.csv'...
Processed 100000 rows.
Created 100000 labeled training examples.

Successfully created labeled dataset: '../data/nlu_training_data.csv'
