In [None]:
import os
import random
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

warnings.filterwarnings("ignore")
random.seed(42)
np.random.seed(42)

# Matches "{slot}" and "{slot.method()}" anywhere in a template, including
# placeholders glued to punctuation such as "{noun_service}," or "{adj_pos}.".
_PLACEHOLDER_RE = re.compile(r"\{(\w+)(?:\.(\w+)\(\))?\}")

# Zero-argument string transforms a template may request,
# e.g. "{adj_neg.capitalize()}".
_SLOT_TRANSFORMS = {
    "capitalize": str.capitalize,
    "lower": str.lower,
    "title": str.title,
    "upper": str.upper,
}


def generate_sentence(templates, vocab):
    """Pick a random template and fill its ``{slot}`` placeholders from *vocab*.

    Args:
        templates: Non-empty sequence of template strings. A placeholder is
            written ``{slot}`` or ``{slot.method()}`` where ``method`` is one
            of ``capitalize``, ``lower``, ``title`` or ``upper``.
        vocab: Mapping from slot name to a list of candidate words.

    Returns:
        The chosen template with every recognised placeholder replaced.

    Notes:
        * A slot that is missing from *vocab* (or maps to an empty list) is
          replaced by the empty string.
        * Every occurrence of the same slot within one sentence receives the
          same word.
        * An unrecognised ``method`` leaves the chosen word untransformed.
        * Placeholders are processed left to right, so for a seeded ``random``
          module the output does not depend on string hash randomisation.
    """
    template = random.choice(templates)
    chosen = {}  # slot name -> word drawn for this sentence

    def _fill(match):
        slot, transform = match.group(1), match.group(2)
        if slot not in chosen:
            # `or [""]` covers both an unknown slot and an empty word list.
            chosen[slot] = random.choice(vocab.get(slot) or [""])
        word = chosen[slot]
        apply = _SLOT_TRANSFORMS.get(transform)
        return apply(word) if apply else word

    return _PLACEHOLDER_RE.sub(_fill, template)

def create_airline_dataset(n_rows):
    """Build a synthetic airline-feedback frame with ``n_rows`` labelled rows.

    For each row a sentiment label is drawn from positive / neutral / negative
    with weights 0.4 / 0.2 / 0.4, then the template list for that label and
    the shared vocabulary are handed to ``generate_sentence`` to produce the
    text.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` and ``sentiment``.
    """
    word_bank = {
        "adj_pos": ["amazing", "smooth", "comfortable", "excellent", "friendly"],
        "adj_neg": ["delayed", "terrible", "cramped", "rude", "frustrating"],
        "noun_service": ["check-in", "boarding", "meal", "crew", "baggage"],
        "verb_pos": ["loved", "enjoyed", "appreciated", "liked"],
        "verb_neg": ["hated", "disliked", "was upset about", "couldn't stand"],
    }

    # One template list per label; looked up by the label drawn for each row.
    templates_by_label = {
        "positive": [
            "I {verb_pos} the {noun_service}, it was {adj_pos}.",
            "Such a {adj_pos} flight! The {noun_service} staff were great.",
        ],
        "negative": [
            "The {noun_service} was {adj_neg}, I {verb_neg} it.",
            "{adj_neg.capitalize()} experience with {noun_service}, truly awful.",
        ],
        "neutral": [
            "Flight was okay, nothing special about the {noun_service}.",
            "Average journey, service level felt neutral.",
        ],
    }

    labels = ["positive", "neutral", "negative"]
    label_weights = [0.4, 0.2, 0.4]

    rows = []
    for _ in range(n_rows):
        # random.choices returns a one-element list; unpack the single label.
        (label,) = random.choices(labels, weights=label_weights)
        text = generate_sentence(templates_by_label[label], word_bank)
        rows.append((text, label))

    return pd.DataFrame(rows, columns=["text", "sentiment"])

def create_ecommerce_dataset(n_rows):
    """Build a synthetic product-review frame with ``n_rows`` labelled rows.

    Each row is produced by three consecutive random draws: a category, an
    item belonging to that category, and a review template into which the
    item name is formatted.

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` and ``category``.
    """
    catalog = {
        "Electronics": ["headphones", "smartphone", "laptop", "tablet", "camera"],
        "Clothing": ["t-shirt", "jeans", "jacket", "dress", "sneakers"],
        "Home_Appliances": ["vacuum", "blender", "air-fryer", "microwave", "kettle"],
    }
    review_templates = [
        "This {item} works perfectly for me.",
        "Quality of the {item} could be better.",
        "I use this {item} every day and it hasn't let me down.",
        "Not satisfied with my new {item}.",
        "The {item} is decent for the price.",
    ]
    category_names = list(catalog)

    def _one_review():
        # Draw order (category, item, template) is kept fixed so a seeded
        # generator yields the same rows.
        label = random.choice(category_names)
        product = random.choice(catalog[label])
        template = random.choice(review_templates)
        return template.format(item=product), label

    records = [_one_review() for _ in range(n_rows)]
    return pd.DataFrame(records, columns=["text", "category"])

def save_dataframe(df, path):
    """Write *df* to *path* as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=path, index=False)

def train_and_predict(train_df, test_df, label_column, output_prefix, *, output_dir="synthetic_data"):
    """Fit a TF-IDF + logistic-regression classifier and save test predictions.

    The model is trained on ``train_df["text"]`` against
    ``train_df[label_column]`` and evaluated on ``test_df``. Accuracy and a
    classification report are printed. The test texts, true labels and
    predicted labels are written to
    ``<output_dir>/<output_prefix lower-cased>_predictions.csv``.

    Args:
        train_df: Frame with a ``text`` column and a ``label_column`` column.
        test_df: Frame with the same two columns, used for evaluation.
        label_column: Name of the target column.
        output_prefix: Label used in the printed accuracy line and (lower-cased)
            in the predictions file name.
        output_dir: Directory for the predictions CSV. Created if missing, so
            the function no longer depends on the caller having made it.
    """
    X_train, y_train = train_df["text"], train_df[label_column]
    X_test, y_test = test_df["text"], test_df[label_column]

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=4000, ngram_range=(1, 2))),
        ("clf", LogisticRegression(max_iter=1000))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"{output_prefix} Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    result_df = pd.DataFrame({
        "text": X_test,
        f"true_{label_column}": y_test,
        f"pred_{label_column}": y_pred
    })

    # An empty output_dir means "current directory"; os.makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    save_dataframe(
        result_df,
        os.path.join(output_dir, f"{output_prefix.lower()}_predictions.csv"),
    )

# The output folder must exist before any CSV is written into it.
os.makedirs("synthetic_data", exist_ok=True)

# Airline feedback: generate the train split, then the test split, and persist both.
airline_train, airline_test = (
    create_airline_dataset(5000),
    create_airline_dataset(1000),
)
save_dataframe(airline_train, "synthetic_data/airline_feedback_train.csv")
save_dataframe(airline_test, "synthetic_data/airline_feedback_test.csv")

# E-commerce reviews: same procedure, generated after the airline data.
ecommerce_train, ecommerce_test = (
    create_ecommerce_dataset(5000),
    create_ecommerce_dataset(1000),
)
save_dataframe(ecommerce_train, "synthetic_data/ecommerce_reviews_train.csv")
save_dataframe(ecommerce_test, "synthetic_data/ecommerce_reviews_test.csv")

# Train one classifier per dataset and write its test-set predictions.
train_and_predict(airline_train, airline_test, "sentiment", "airline_feedback")
train_and_predict(ecommerce_train, ecommerce_test, "category", "ecommerce_reviews")


airline_feedback Test Accuracy: 1.0000
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       398
     neutral       1.00      1.00      1.00       192
    positive       1.00      1.00      1.00       410

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

ecommerce_reviews Test Accuracy: 1.0000
                 precision    recall  f1-score   support

       Clothing       1.00      1.00      1.00       364
    Electronics       1.00      1.00      1.00       327
Home_Appliances       1.00      1.00      1.00       309

       accuracy                           1.00      1000
      macro avg       1.00      1.00      1.00      1000
   weighted avg       1.00      1.00      1.00      1000

