loading csv file

In [14]:
import pandas as pd
import numpy as np


df = pd.read_csv('sofmattress_train.csv', sep='\t')
df[['sentence', 'label']] = df.iloc[:, 0].str.split(',', expand=True)
print(df)

                                        sentence,label  \
0                     You guys provide EMI option?,EMI   
1    Do you offer Zero Percent EMI payment options?...   
2                                          0% EMI.,EMI   
3                                              EMI,EMI   
4                            I want in installment,EMI   
..                                                 ...   
323          May I please know about the offers,OFFERS   
324                            Available offers,OFFERS   
325                          Is offer available,OFFERS   
326                  Want to know the discount ,OFFERS   
327             Tell me about the latest offers,OFFERS   

                                           sentence   label  
0                      You guys provide EMI option?     EMI  
1    Do you offer Zero Percent EMI payment options?     EMI  
2                                           0% EMI.     EMI  
3                                               EMI    

processing text

In [15]:
import re

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text


df['clean_sentence'] = df['sentence'].apply(preprocess_text)
print(df['clean_sentence'])
    

0                        you guys provide emi option
1      do you offer zero percent emi payment options
2                                              0 emi
3                                                emi
4                              i want in installment
                           ...                      
323               may i please know about the offers
324                                 available offers
325                               is offer available
326                        want to know the discount
327                  tell me about the latest offers
Name: clean_sentence, Length: 328, dtype: object


training

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


X = df['clean_sentence']
y = df['label']


vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      0.75      0.86         4
   ABOUT_SOF_MATTRESS       0.60      1.00      0.75         3
         CANCEL_ORDER       0.67      1.00      0.80         2
        CHECK_PINCODE       1.00      1.00      1.00         1
                  COD       0.50      1.00      0.67         2
           COMPARISON       0.50      1.00      0.67         1
    DELAY_IN_DELIVERY       0.00      0.00      0.00         2
         DISTRIBUTORS       0.75      0.75      0.75         8
                  EMI       1.00      0.80      0.89         5
        ERGO_FEATURES       1.00      0.75      0.86         4
             LEAD_GEN       1.00      0.75      0.86         4
        MATTRESS_COST       1.00      1.00      1.00         3
               OFFERS       1.00      0.67      0.80         3
         ORDER_STATUS       0.50      1.00      0.67         1
       ORTHO_FEATURES       1.00      1.00      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test the model

In [17]:
# Function to predict single input
def predict_intent(text, model, vectorizer):
    # Clean the input text using the same cleaning function used during training
    cleaned_text = preprocess_text(text)
    
    # Transform the text using the same vectorizer
    text_vectorized = vectorizer.transform([cleaned_text])
    
    # Get prediction
    prediction = model.predict(text_vectorized)
    
    # Get prediction probabilities
    probabilities = model.predict_proba(text_vectorized)
    confidence = np.max(probabilities) * 100
    
    return prediction[0], confidence

# Test with different examples
test_examples = [
    "Do you have EMI options?",
    "What are the current offers?",
    "I want to know about warranty",
    "How can I cancel my order?",
    "What is the cost of mattress?"
]

# Print predictions
print("\nPredictions for test examples:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)

# Interactive testing
while True:
    user_input = input("\nEnter your text (or 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break
    
    intent, confidence = predict_intent(user_input, model, vectorizer)
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")



Predictions for test examples:
--------------------------------------------------
Text: Do you have EMI options?
Predicted Intent: EMI
Confidence: 16.53%
--------------------------------------------------
Text: What are the current offers?
Predicted Intent: OFFERS
Confidence: 22.37%
--------------------------------------------------
Text: I want to know about warranty
Predicted Intent: WARRANTY
Confidence: 23.81%
--------------------------------------------------
Text: How can I cancel my order?
Predicted Intent: CANCEL_ORDER
Confidence: 43.80%
--------------------------------------------------
Text: What is the cost of mattress?
Predicted Intent: MATTRESS_COST
Confidence: 26.04%
--------------------------------------------------


In [5]:
# Get unique values from column E
unique_values = df['label'].unique()

# Display the unique values
print(unique_values)

['EMI' 'COD' 'ORTHO_FEATURES' 'ERGO_FEATURES' 'COMPARISON' 'WARRANTY'
 '100_NIGHT_TRIAL_OFFER' 'SIZE_CUSTOMIZATION' 'WHAT_SIZE_TO_ORDER'
 'LEAD_GEN' 'CHECK_PINCODE' 'DISTRIBUTORS' 'MATTRESS_COST'
 'PRODUCT_VARIANTS' 'ABOUT_SOF_MATTRESS' 'DELAY_IN_DELIVERY'
 'ORDER_STATUS' 'RETURN_EXCHANGE' 'CANCEL_ORDER' 'PILLOWS' 'OFFERS']


try creating more training data by llm

In [1]:
from transformers import pipeline, logging
import pandas as pd
import torch

# Clear GPU cache and setup logging
torch.cuda.empty_cache()
logging.set_verbosity_info()

# Initialize generator pipeline
generator = pipeline(
    'text-generation',
    model='gpt2',
    device=0,
    truncation=True,
    max_new_tokens=50,
    batch_size=1,
    pad_token_id=50256  # Set padding token explicitly
)
def generate_examples(base_prompt, example_queries, num_examples=15):
    prompt = f"""Task: Generate a customer service questions asked by customers to a mattresses selling company.
Topic: {base_prompt}
Rules: 
- Question should be less than 10 words
- Must be related to {base_prompt}
- Should be similar to these examples:
{example_queries}

Generate a question:"""
    
    examples = []
    while len(examples) < num_examples:
        try:
            result = generator(
                prompt,
                max_new_tokens=50,
                num_return_sequences=1,
                do_sample=True,
                temperature=0.7
            )
            generated = result[0]['generated_text']
            if len(generated.split()) <= 10 and '?' in generated:
                examples.append(generated)
        except Exception as e:
            print(f"Error: {e}")
            continue
    return examples


# Example prompts with sample queries
class_examples = {
    'Monthly installment': {
        'prompt': " Equated Monthly Installment EMI and payment options for mattress purchase",
        'examples': """
- Do you have EMI options?
- I want in installment
- Down payments"""
    },
    'COD': {
        'prompt': "Cash on Delivery or COD payment for mattress",
        'examples': """
- Is COD available?
- Do you accept cash on delivery?
- Can I pay later on delivery """
    },
    'ORTHO_FEATURES': {
        'prompt': "Orthopedic features and benefits of mattress",
        'examples': """
- Is it good for back pain?
- What orthopedic features does it have?
- Does it support spine alignment?"""
    },
    'ERGO_FEATURES': {
        'prompt': "Ergonomic features and comfort of mattress",
        'examples': """
- What ergonomic features are included?
- How does it support body posture?
- Tell me about comfort features"""
    },
    'COMPARISON': {
        'prompt': "Compare different mattress models",
        'examples': """
- Which available model is better?
- Compare soft vs firm mattress
- What's the difference between your available models?"""
    },
    'WARRANTY': {
        'prompt': "Warranty terms and coverage for mattress",
        'examples': """
- What's the warranty period?
- What's covered in warranty?
- How long is the warranty valid?"""
    },
    '100_NIGHT_TRIAL_OFFER': {
        'prompt': "100-night trial period for mattress",
        'examples': """
- How does trial period work?
- Can I return during trial?
- What's the trial period policy?"""
    },
    'SIZE_CUSTOMIZATION': {
        'prompt': "Custom size options for mattress",
        'examples': """
- Can you make custom size?
- Is size customization possible?
- Do you make special sizes?"""
    },
    'WHAT_SIZE_TO_ORDER': {
        'prompt': "Help choosing mattress size",
        'examples': """
- Which size should I buy?
- What size fits my bed?
- Recommend size for couple?"""
    },
    'LEAD_GEN': {
        'prompt': "Request for mattress information and contact",
        'examples': """
- Please contact me about details
- Need more information
- Request callback for details"""
    },
    'CHECK_PINCODE': {
        'prompt': "Delivery availability check by pincode address",
        'examples': """
- Do you deliver to my pincode?
- Check delivery availability
- Is delivery possible here?"""
    },
    'DISTRIBUTORS': {
        'prompt': "Find nearby stores and distributors",
        'examples': """
- Where is your nearest store?
- Dealer in my city?
- Local distributor contact?"""
    },
    'MATTRESS_COST': {
        'prompt': "Price and cost information for mattress",
        'examples': """
- What's the price?
- How much for queen size?
- Cost of single mattress?"""
    },
    'PRODUCT_VARIANTS': {
        'prompt': "Different types and variants of mattresses",
        'examples': """
- What types are available?
- Different variants you have?
- Show all mattress options"""
    },
    'ABOUT_SOF_MATTRESS': {
        'prompt': "Information about SOF mattress company",
        'examples': """
- Tell me about your company
- Company background details?
- About SOF mattress brand?"""
    },
    'DELAY_IN_DELIVERY': {
        'prompt': "Delayed delivery inquiries",
        'examples': """
- Why is delivery delayed?
- When will order arrive?
- Reason for delivery delay?"""
    },
    'ORDER_STATUS': {
        'prompt': "Check status of mattress order",
        'examples': """
- Where is my order?
- Track order status
- Current delivery status?"""
    },
    'RETURN_EXCHANGE': {
        'prompt': "Return and exchange policies",
        'examples': """
- How to return mattress?
- What's the return policy?
- Exchange possible?"""
    },
    'CANCEL_ORDER': {
        'prompt': "Cancel mattress order",
        'examples': """
- How to cancel order?
- Cancel my booking
- Order cancellation process?"""
    },
    'PILLOWS': {
        'prompt': "Information about pillows",
        'examples': """
- Do you sell pillows?
- Pillow types available?
- Price of pillows?"""
    },
    'OFFERS': {
        'prompt': "Current offers and discounts",
        'examples': """
- Any current offers?
- What discounts available?
- Ongoing promotional deals?"""
    }
}


# Generate data
generated_data = []
for label, content in class_examples.items():
    print(f"\nGenerating for {label}...")
    examples = generate_examples(content['prompt'], content['examples'])
    
    for example in examples:
        generated_data.append({
            'sentence': example,
            'label': label
        })

# Save results
df = pd.DataFrame(generated_data)
df.to_csv('llm_generated_queries.csv', index=False)

# Print statistics
print(f"\nTotal generated queries: {len(df)}")
print("\nQueries per label:")
print(df['label'].value_counts())


loading configuration file config.json from cache at /home/manasa/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transforme


Generating for Monthly installment...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 

Logistic Regression on augmented data

In [6]:
df_augmented = pd.read_csv('augmented_training_data.csv')

print(df_augmented)

                            sentence   label
0        Do you provide EMI options?     EMI
1    What are the EMI payment terms?     EMI
2    Is zero interest EMI available?     EMI
3              EMI duration options?     EMI
4       Monthly EMI payment details?     EMI
..                               ...     ...
310              Promotional offers?  OFFERS
311           Any schemes available?  OFFERS
312                   Current deals?  OFFERS
313                Discount seasons?  OFFERS
314            Offer period details?  OFFERS

[315 rows x 2 columns]


In [9]:
df_original = pd.read_csv('sofmattress_train.csv')

print(df_original)

                                           sentence   label
0                      You guys provide EMI option?     EMI
1    Do you offer Zero Percent EMI payment options?     EMI
2                                           0% EMI.     EMI
3                                               EMI     EMI
4                             I want in installment     EMI
..                                              ...     ...
323              May I please know about the offers  OFFERS
324                                Available offers  OFFERS
325                              Is offer available  OFFERS
326                      Want to know the discount   OFFERS
327                 Tell me about the latest offers  OFFERS

[328 rows x 2 columns]


# Combine datasets

In [11]:
df_combined = pd.concat([df_original, df_augmented], ignore_index=True)
print(df_combined)

                                           sentence   label
0                      You guys provide EMI option?     EMI
1    Do you offer Zero Percent EMI payment options?     EMI
2                                           0% EMI.     EMI
3                                               EMI     EMI
4                             I want in installment     EMI
..                                              ...     ...
638                             Promotional offers?  OFFERS
639                          Any schemes available?  OFFERS
640                                  Current deals?  OFFERS
641                               Discount seasons?  OFFERS
642                           Offer period details?  OFFERS

[643 rows x 2 columns]


In [18]:
# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

df_combined['clean_sentence'] = df_combined['sentence'].apply(preprocess_text)

# Prepare for modeling
X = df_combined['clean_sentence']
y = df_combined['label']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# Train model
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      1.00      1.00         5
   ABOUT_SOF_MATTRESS       0.75      0.75      0.75         4
         CANCEL_ORDER       1.00      1.00      1.00         8
        CHECK_PINCODE       1.00      1.00      1.00         4
                  COD       1.00      1.00      1.00         3
           COMPARISON       0.75      0.75      0.75         4
    DELAY_IN_DELIVERY       0.75      1.00      0.86         3
         DISTRIBUTORS       0.73      1.00      0.84         8
                  EMI       1.00      1.00      1.00         6
        ERGO_FEATURES       0.71      0.71      0.71         7
             LEAD_GEN       0.60      0.50      0.55         6
        MATTRESS_COST       1.00      0.90      0.95        10
               OFFERS       1.00      1.00      1.00         9
         ORDER_STATUS       0.88      0.78      0.82         9
       ORTHO_FEATURES       0.83      0.71      0.77  

In [21]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(text.split())

# Load and preprocess your data
df = pd.DataFrame(df_combined)  # Replace with your data loading method
df['clean_sentence'] = df['sentence'].apply(preprocess_text)

# Prepare features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_sentence'])
y = df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest Implementation
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# SVM Implementation
svm_model = SVC(
    kernel='linear',
    probability=True,
    class_weight='balanced',
    random_state=42,
    C=1.0
)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Prediction function for both models
def predict_intent(text, model, vectorizer):
    cleaned_text = preprocess_text(text)
    text_vectorized = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vectorized)
    probabilities = model.predict_proba(text_vectorized)
    confidence = np.max(probabilities) * 100
    return prediction[0], confidence

# Print classification reports
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Test examples
test_examples = [
    "Do you have EMI options?",
    "What are the current offers?",
    "I want to know about warranty",
    "How can I cancel my order?",
    "What is the cost of mattress?"
]

# Test both models
print("\nRandom Forest Predictions:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, rf_model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)

print("\nSVM Predictions:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, svm_model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)


Random Forest Classification Report:
                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       0.60      0.60      0.60         5
   ABOUT_SOF_MATTRESS       0.40      0.50      0.44         4
         CANCEL_ORDER       1.00      0.25      0.40         8
        CHECK_PINCODE       0.25      0.25      0.25         4
                  COD       0.50      0.33      0.40         3
           COMPARISON       1.00      0.25      0.40         4
    DELAY_IN_DELIVERY       0.40      0.67      0.50         3
         DISTRIBUTORS       0.35      0.88      0.50         8
                  EMI       0.80      0.67      0.73         6
        ERGO_FEATURES       0.80      0.57      0.67         7
             LEAD_GEN       0.67      0.67      0.67         6
        MATTRESS_COST       0.46      0.60      0.52        10
               OFFERS       0.86      0.67      0.75         9
         ORDER_STATUS       0.50      0.44      0.47         9
       ORTHO_FEAT