# loading csv file and Data Preparation

In [22]:
import pandas as pd
import numpy as np
df = pd.read_csv('sofmattress_train.csv', sep='\t')
df[['sentence', 'label']] = df.iloc[:, 0].str.split(',', expand=True)
print(df)

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text


df['clean_sentence'] = df['sentence'].apply(preprocess_text)
print(df['clean_sentence'])
    

                                        sentence,label  \
0                     You guys provide EMI option?,EMI   
1    Do you offer Zero Percent EMI payment options?...   
2                                          0% EMI.,EMI   
3                                              EMI,EMI   
4                            I want in installment,EMI   
..                                                 ...   
323          May I please know about the offers,OFFERS   
324                            Available offers,OFFERS   
325                          Is offer available,OFFERS   
326                  Want to know the discount ,OFFERS   
327             Tell me about the latest offers,OFFERS   

                                           sentence   label  
0                      You guys provide EMI option?     EMI  
1    Do you offer Zero Percent EMI payment options?     EMI  
2                                           0% EMI.     EMI  
3                                               EMI    

# Data Split, training and Evaluate

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


X = df['clean_sentence']
y = df['label']


vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      0.75      0.86         4
   ABOUT_SOF_MATTRESS       0.60      1.00      0.75         3
         CANCEL_ORDER       0.67      1.00      0.80         2
        CHECK_PINCODE       1.00      1.00      1.00         1
                  COD       0.50      1.00      0.67         2
           COMPARISON       0.50      1.00      0.67         1
    DELAY_IN_DELIVERY       0.00      0.00      0.00         2
         DISTRIBUTORS       0.75      0.75      0.75         8
                  EMI       1.00      0.80      0.89         5
        ERGO_FEATURES       1.00      0.75      0.86         4
             LEAD_GEN       1.00      0.75      0.86         4
        MATTRESS_COST       1.00      1.00      1.00         3
               OFFERS       1.00      0.67      0.80         3
         ORDER_STATUS       0.50      1.00      0.67         1
       ORTHO_FEATURES       1.00      1.00      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Model Testing

In [17]:
# Function to predict single input
def predict_intent(text, model, vectorizer):
    # Clean the input text using the same cleaning function used during training
    cleaned_text = preprocess_text(text)
    
    # Transform the text using the same vectorizer
    text_vectorized = vectorizer.transform([cleaned_text])
    
    # Get prediction
    prediction = model.predict(text_vectorized)
    
    # Get prediction probabilities
    probabilities = model.predict_proba(text_vectorized)
    confidence = np.max(probabilities) * 100
    
    return prediction[0], confidence

# Test with different examples
test_examples = [
    "Do you have EMI options?",
    "What are the current offers?",
    "I want to know about warranty",
    "How can I cancel my order?",
    "What is the cost of mattress?"
]

# Print predictions
print("\nPredictions for test examples:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)

# Interactive testing
while True:
    user_input = input("\nEnter your text (or 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break
    
    intent, confidence = predict_intent(user_input, model, vectorizer)
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")



Predictions for test examples:
--------------------------------------------------
Text: Do you have EMI options?
Predicted Intent: EMI
Confidence: 16.53%
--------------------------------------------------
Text: What are the current offers?
Predicted Intent: OFFERS
Confidence: 22.37%
--------------------------------------------------
Text: I want to know about warranty
Predicted Intent: WARRANTY
Confidence: 23.81%
--------------------------------------------------
Text: How can I cancel my order?
Predicted Intent: CANCEL_ORDER
Confidence: 43.80%
--------------------------------------------------
Text: What is the cost of mattress?
Predicted Intent: MATTRESS_COST
Confidence: 26.04%
--------------------------------------------------


In [5]:
# Get unique values from column E
unique_values = df['label'].unique()

# Display the unique values
print(unique_values)

['EMI' 'COD' 'ORTHO_FEATURES' 'ERGO_FEATURES' 'COMPARISON' 'WARRANTY'
 '100_NIGHT_TRIAL_OFFER' 'SIZE_CUSTOMIZATION' 'WHAT_SIZE_TO_ORDER'
 'LEAD_GEN' 'CHECK_PINCODE' 'DISTRIBUTORS' 'MATTRESS_COST'
 'PRODUCT_VARIANTS' 'ABOUT_SOF_MATTRESS' 'DELAY_IN_DELIVERY'
 'ORDER_STATUS' 'RETURN_EXCHANGE' 'CANCEL_ORDER' 'PILLOWS' 'OFFERS']


# Retry Logistic Regression with augmented data from llm

In [6]:
df_augmented = pd.read_csv('augmented_training_data.csv')

print(df_augmented)

                            sentence   label
0        Do you provide EMI options?     EMI
1    What are the EMI payment terms?     EMI
2    Is zero interest EMI available?     EMI
3              EMI duration options?     EMI
4       Monthly EMI payment details?     EMI
..                               ...     ...
310              Promotional offers?  OFFERS
311           Any schemes available?  OFFERS
312                   Current deals?  OFFERS
313                Discount seasons?  OFFERS
314            Offer period details?  OFFERS

[315 rows x 2 columns]


In [9]:
df_original = pd.read_csv('sofmattress_train.csv')

print(df_original)

                                           sentence   label
0                      You guys provide EMI option?     EMI
1    Do you offer Zero Percent EMI payment options?     EMI
2                                           0% EMI.     EMI
3                                               EMI     EMI
4                             I want in installment     EMI
..                                              ...     ...
323              May I please know about the offers  OFFERS
324                                Available offers  OFFERS
325                              Is offer available  OFFERS
326                      Want to know the discount   OFFERS
327                 Tell me about the latest offers  OFFERS

[328 rows x 2 columns]


# Combine datasets both datasets

In [11]:
df_combined = pd.concat([df_original, df_augmented], ignore_index=True)
print(df_combined)

                                           sentence   label
0                      You guys provide EMI option?     EMI
1    Do you offer Zero Percent EMI payment options?     EMI
2                                           0% EMI.     EMI
3                                               EMI     EMI
4                             I want in installment     EMI
..                                              ...     ...
638                             Promotional offers?  OFFERS
639                          Any schemes available?  OFFERS
640                                  Current deals?  OFFERS
641                               Discount seasons?  OFFERS
642                           Offer period details?  OFFERS

[643 rows x 2 columns]


# **Logistic Regression results**

In [18]:
# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

df_combined['clean_sentence'] = df_combined['sentence'].apply(preprocess_text)

# Prepare for modeling
X = df_combined['clean_sentence']
y = df_combined['label']

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# Train model
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      1.00      1.00         5
   ABOUT_SOF_MATTRESS       0.75      0.75      0.75         4
         CANCEL_ORDER       1.00      1.00      1.00         8
        CHECK_PINCODE       1.00      1.00      1.00         4
                  COD       1.00      1.00      1.00         3
           COMPARISON       0.75      0.75      0.75         4
    DELAY_IN_DELIVERY       0.75      1.00      0.86         3
         DISTRIBUTORS       0.73      1.00      0.84         8
                  EMI       1.00      1.00      1.00         6
        ERGO_FEATURES       0.71      0.71      0.71         7
             LEAD_GEN       0.60      0.50      0.55         6
        MATTRESS_COST       1.00      0.90      0.95        10
               OFFERS       1.00      1.00      1.00         9
         ORDER_STATUS       0.88      0.78      0.82         9
       ORTHO_FEATURES       0.83      0.71      0.77  

# SVM and Random Forest model evaluation

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(text.split())

# Load and preprocess your data
df = pd.DataFrame(df_combined)  # Replace with your data loading method
df['clean_sentence'] = df['sentence'].apply(preprocess_text)

# Prepare features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_sentence'])
y = df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest Implementation
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# SVM Implementation
svm_model = SVC(
    kernel='linear',
    probability=True,
    class_weight='balanced',
    random_state=42,
    C=1.0
)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Prediction function for both models
def predict_intent(text, model, vectorizer):
    cleaned_text = preprocess_text(text)
    text_vectorized = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vectorized)
    probabilities = model.predict_proba(text_vectorized)
    confidence = np.max(probabilities) * 100
    return prediction[0], confidence

# Print classification reports
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Test examples
test_examples = [
    "Do you have EMI options?",
    "What are the current offers?",
    "I want to know about warranty",
    "How can I cancel my order?",
    "What is the cost of mattress?"
]

# Test both models
print("\nRandom Forest Predictions:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, rf_model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)

print("\nSVM Predictions:")
print("-" * 50)
for text in test_examples:
    intent, confidence = predict_intent(text, svm_model, vectorizer)
    print(f"Text: {text}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.2f}%")
    print("-" * 50)


Random Forest Classification Report:
                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       0.60      0.60      0.60         5
   ABOUT_SOF_MATTRESS       0.40      0.50      0.44         4
         CANCEL_ORDER       1.00      0.25      0.40         8
        CHECK_PINCODE       0.25      0.25      0.25         4
                  COD       0.50      0.33      0.40         3
           COMPARISON       1.00      0.25      0.40         4
    DELAY_IN_DELIVERY       0.40      0.67      0.50         3
         DISTRIBUTORS       0.35      0.88      0.50         8
                  EMI       0.80      0.67      0.73         6
        ERGO_FEATURES       0.80      0.57      0.67         7
             LEAD_GEN       0.67      0.67      0.67         6
        MATTRESS_COST       0.46      0.60      0.52        10
               OFFERS       0.86      0.67      0.75         9
         ORDER_STATUS       0.50      0.44      0.47         9
       ORTHO_FEAT