In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



nltk.download('wordnet')
nltk.download('punkt')
# nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fast\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fast\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import pandas as pd
df=pd.read_csv("final_data.csv")
df.head()

Unnamed: 0,sentence,label
0,Have you seen the dog water bottle,pointing
1,Is the doorbell ringing,pointing
2,Bravo thermostat set right,good sign
3,Good night sir assistant,hello sign
4,Perfect AC set for night,good sign


In [10]:
df.columns

Index(['sentence', 'label'], dtype='object')

In [11]:
df["label"].value_counts()

label
pointing      400
good sign     400
hello sign    400
Name: count, dtype: int64

In [12]:
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  # For lemmatization
# from nltk.stem import PorterStemmer    # Only if you want stemming

def preprocess_text(text):
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in text]
    
    return lemmatized_tokens

In [13]:
df['sentence_preprocessed']=df['sentence'].apply(preprocess_text)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y_labels=encoder.fit_transform(df['label'])
xtrain, xtest, ytrain, ytest = train_test_split(df['sentence_preprocessed'],y_labels, test_size=0.2, random_state=42)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
xtrain_tfidf = vectorizer.fit_transform(xtrain)
xtest_tfidf = vectorizer.transform(xtest)



In [16]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
# -----------------------------
# 3. Train LinearSVC
# -----------------------------
clf = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
clf.fit(xtrain_tfidf.toarray(), ytrain)

# -----------------------------
# 4. Calibrate LinearSVC for probabilities
# -----------------------------
calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=3)  # cv can be 3 or 5
calibrated_clf.fit(xtrain_tfidf.toarray(), ytrain)

In [17]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = calibrated_clf.predict(xtest_tfidf.toarray())
accuracy = accuracy_score(ytest, y_pred)
print(f"Accuracy of calibrated SVM: {accuracy * 100:.2f}%")
print(classification_report(ytest, y_pred))

Accuracy of calibrated SVM: 97.92%
              precision    recall  f1-score   support

           0       0.99      0.96      0.97        75
           1       0.99      0.99      0.99        71
           2       0.97      0.99      0.98        94

    accuracy                           0.98       240
   macro avg       0.98      0.98      0.98       240
weighted avg       0.98      0.98      0.98       240



In [18]:
import numpy as np

def predict_texts(texts, vectorizer, model, encoder):
    # Preprocess all texts
    processed_texts = [preprocess_text(text) for text in texts]
    # Transform using the same vectorizer
    X_tfidf = vectorizer.transform(processed_texts)
    
    # Predict classes
    predictions = model.predict(X_tfidf.toarray())
    predictions = encoder.inverse_transform(predictions)
    
    # Predict probabilities
    probabilities = model.predict_proba(X_tfidf.toarray())
    
    # Combine predictions and probabilities nicely
    results = []
    for i, text in enumerate(texts):
        class_probs = dict(zip(encoder.classes_, probabilities[i]))
        results.append({
            "text": text,
            "prediction": predictions[i],
            "probabilities": class_probs
        })
    
    return results

# Example usage
sample_texts = ["OPEN THE DOOR", "Set an alarm for 7 AM tomorrow","hi"]
results = predict_texts(sample_texts, vectorizer, calibrated_clf, encoder)

for res in results:
    print(res)


{'text': 'OPEN THE DOOR', 'prediction': 'good sign', 'probabilities': {'good sign': 0.9715139867542425, 'hello sign': 0.028486010511440743, 'pointing': 2.734316728783928e-09}}
{'text': 'Set an alarm for 7 AM tomorrow', 'prediction': 'good sign', 'probabilities': {'good sign': 0.997348361842814, 'hello sign': 0.002484793147298271, 'pointing': 0.00016684500988784132}}
{'text': 'hi', 'prediction': 'hello sign', 'probabilities': {'good sign': 0.05105365982675739, 'hello sign': 0.9488232286943027, 'pointing': 0.00012311147893978206}}


In [19]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
# -----------------------------
# 3. Train LinearSVC
# -----------------------------
clf = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
clf.fit(xtrain_tfidf.toarray(), ytrain)

In [20]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = clf.predict(xtest_tfidf.toarray())
accuracy = accuracy_score(ytest, y_pred)
print(f"Accuracy OF SVM: {accuracy * 100:.2f}%")
print(classification_report(ytest, y_pred))

Accuracy OF SVM: 98.33%
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        75
           1       0.99      0.99      0.99        71
           2       0.97      1.00      0.98        94

    accuracy                           0.98       240
   macro avg       0.98      0.98      0.98       240
weighted avg       0.98      0.98      0.98       240



In [21]:
from sklearn.ensemble import RandomForestClassifier
CLF_Randomforest=RandomForestClassifier(n_estimators=100,random_state=42)
CLF_Randomforest.fit(xtrain_tfidf.toarray(), ytrain)


In [22]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = CLF_Randomforest.predict(xtest_tfidf.toarray())
accuracy = accuracy_score(ytest, y_pred)
print(f"Accuracy OF Randomforest: {accuracy * 100:.2f}%")
print(classification_report(ytest, y_pred))

Accuracy OF Randomforest: 97.08%
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        75
           1       1.00      0.99      0.99        71
           2       0.97      0.96      0.96        94

    accuracy                           0.97       240
   macro avg       0.97      0.97      0.97       240
weighted avg       0.97      0.97      0.97       240



In [23]:
print("Train Accuracy of Calibrated SVM:", calibrated_clf.score(xtrain_tfidf.toarray(), ytrain))
print("Test Accuracy of Calibrated SVM:", calibrated_clf.score(xtest_tfidf.toarray(), ytest))

Train Accuracy of Calibrated SVM: 0.9885416666666667
Test Accuracy of Calibrated SVM: 0.9791666666666666


In [24]:
print("Train Accuracy of SVM:", clf.score(xtrain_tfidf.toarray(), ytrain))
print("Test Accuracy of SVM:", clf.score(xtest_tfidf.toarray(), ytest))

Train Accuracy of SVM: 0.9885416666666667
Test Accuracy of SVM: 0.9833333333333333


In [25]:
print("Train Accuracy of Randomforest:", CLF_Randomforest.score(xtrain_tfidf.toarray(), ytrain))
print("Test Accuracy of Randomforest:", CLF_Randomforest.score(xtest_tfidf.toarray(), ytest))

Train Accuracy of Randomforest: 1.0
Test Accuracy of Randomforest: 0.9708333333333333


In [26]:
import joblib

# Fix the lambda problem → replace any lambda with a real function
def dummy(x):
    return x

vectorizer.preprocessor = dummy
vectorizer.tokenizer = dummy
vectorizer.analyzer = dummy   # if you used lambda there too

# Now save – it will work 100%
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(calibrated_clf, 'calibrated_clf.joblib')
joblib.dump(encoder,'encoder.joblib')

['encoder.joblib']