In [14]:
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Part 1: Generating the dataset from templates

In [15]:
# Adjective pools used to fill the {adj1} / {adj2} placeholders of the
# templates defined below (one pool per sentiment class).
good_adjectives = [
    "great", "excellent", "superb",
    "fantastic", "amazing", "satisfied",
    "pleased", "happy", "reliable",
    "efficient", "durable", "user-friendly",
    "top-notch", "smooth", "perfect",
]

bad_adjectives = [
    "terrible", "horrible", "bad",
    "awful", "disappointing", "frustrating",
    "broken", "unreliable", "slow",
    "complicated", "cheap", "useless",
    "defective", "rough", "flawed",
]

# Sentence skeletons for positive reviews. Some use only {adj1}; the unused
# keyword passed to str.format is simply ignored for those.
good_templates = [
    "This product is {adj1} and {adj2}.",
    "Absolutely {adj1}! Highly recommend it.",
    "Very {adj1} performance, I'm {adj2} with it.",
    "A {adj1} purchase, truly {adj2}.",
    "Works {adj1}! So {adj2}.",
    "I'm {adj1} with the quality, it's very {adj2}.",
    "Excellent value and {adj1} features.",
    "The customer service was {adj1} and the item arrived {adj2}.",
    "Simply {adj1}, exceeds expectations.",
    "Totally {adj1}, would buy again.",
]

# Sentence skeletons for negative reviews. Five of them are word-for-word the
# same as in good_templates, so for those the sentiment comes from the
# adjectives alone.
bad_templates = [
    "This product is {adj1} and {adj2}.",
    "Absolutely {adj1}! Do not recommend it.",
    "Very {adj1} performance, I'm {adj2} with it.",
    "A {adj1} purchase, truly {adj2}.",
    "Doesn't work {adj1}! So {adj2}.",
    "I'm {adj1} with the quality, it's very {adj2}.",
    "Poor value and {adj1} features.",
    "The customer service was {adj1} and the item arrived {adj2}.",
    "Simply {adj1}, far below expectations.",
    "Totally {adj1}, would not buy again.",
]

In [16]:
def generate_feedback(label_type):
    """Generate one synthetic product review for the requested sentiment.

    A template is drawn at random from the template list matching
    ``label_type``; its ``{adj1}`` / ``{adj2}`` placeholders are then filled
    with two adjectives drawn independently (so they may be equal) from the
    matching adjective list.

    Args:
        label_type: Either ``"good"`` or ``"bad"``.

    Returns:
        The formatted review text.

    Raises:
        ValueError: If ``label_type`` is neither ``"good"`` nor ``"bad"``.
    """
    # Validate first: a typo such as "Good" used to fall through to the "bad"
    # branch and silently yield a negative review.
    if label_type not in ("good", "bad"):
        raise ValueError(
            f"label_type must be 'good' or 'bad', got {label_type!r}"
        )

    if label_type == "good":
        templates, adjectives = good_templates, good_adjectives
    else:
        templates, adjectives = bad_templates, bad_adjectives

    # Draw order (template, adj1, adj2) is kept so that a seeded RNG yields
    # the same sequence of reviews as the original implementation.
    template = random.choice(templates)
    adj1 = random.choice(adjectives)
    adj2 = random.choice(adjectives)
    return template.format(adj1=adj1, adj2=adj2)

# Seed the stdlib RNG so a fresh "Restart & Run All" regenerates the very same
# dataset; 42 matches the random_state used for the split and model later on.
random.seed(42)

# 50 'good' reviews followed by 50 'bad' ones: a perfectly balanced dataset.
feedback_data = [
    {"text": generate_feedback(label), "label": label}
    for label in ("good", "bad")
    for _ in range(50)
]

# Shuffle the combined list so the two classes are interleaved
random.shuffle(feedback_data)
df = pd.DataFrame(feedback_data)
print("--- Synthetic Product Feedback Dataset Created ---")
print(df)
print("\nLabel Distribution:")
print(df['label'].value_counts())

--- Synthetic Product Feedback Dataset Created ---
                                                text label
0             Simply reliable, exceeds expectations.  good
1                  Totally durable, would buy again.  good
2              Simply pleased, exceeds expectations.  good
3   I'm excellent with the quality, it's very great.  good
4                A broken purchase, truly defective.   bad
..                                               ...   ...
95       Simply frustrating, far below expectations.   bad
96             Excellent value and amazing features.  good
97           Simply efficient, exceeds expectations.  good
98                       Doesn't work awful! So bad.   bad
99                      Works top-notch! So amazing.  good

[100 rows x 2 columns]

Label Distribution:
label
good    50
bad     50
Name: count, dtype: int64


Vectorizing the dataset as required by the question, i.e. TF-IDF with a maximum of 300 features.

In [17]:
# TF-IDF representation of the review text: lowercased, English stop words
# removed, and at most 300 features kept.
vectorizer = TfidfVectorizer(
    max_features=300,
    stop_words='english',
    lowercase=True,
)
X = vectorizer.fit_transform(df["text"])  # TF-IDF matrix, one row per review
y = df["label"]                           # 'good' / 'bad' target labels

Part 2:

Splitting the dataset into a 75% training sample and a 25% testing sample, then training a logistic regression model.

In [18]:
# Split the raw review text rather than the pre-computed TF-IDF matrix, so the
# vectorizer can be fitted on the training portion only. Fitting it on every
# review before splitting lets the held-out reviews influence the vocabulary
# and the IDF weights (data leakage), which can inflate the test scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.25, random_state=42, stratify=y
)

# Re-fit the existing vectorizer on the training text, then only *transform*
# the test text with the vocabulary / IDF learned from the training set.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



Part 3: Evaluating the model on the test set (accuracy, precision, recall, F1-score)

In [19]:
# Accuracy is computed over both classes; precision, recall and F1 are
# reported for the 'good' label, which is passed as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label='good')
    for scorer in (precision_score, recall_score, f1_score)
)

# Accuracy and precision are printed as percentages, recall and F1 as
# fractions in [0, 1].
print(
    "\n--- Logistic Regression Model Training Complete ---",
    f"Accuracy on the test set: {accuracy*100:.2f}%",
    f"Precision (for 'good' label): {precision*100:.2f}%",
    f"Recall (for 'good' label): {recall:.2f}",
    f"F1-Score (for 'good' label): {f1:.2f}",
    sep="\n",
)


--- Logistic Regression Model Training Complete ---
Accuracy on the test set: 100.00%
Precision (for 'good' label): 100.00%
Recall (for 'good' label): 1.00
F1-Score (for 'good' label): 1.00


Part 4:

Using a set of manual examples to check the accuracy of the model.

In [20]:
def text_preprocess_vectorize(texts, fitted_vectorizer):
    """Convert raw text into feature vectors using an already-fitted vectorizer.

    No preprocessing is done here beyond what ``fitted_vectorizer`` itself
    applies inside its ``transform`` method.

    Args:
        texts: An iterable of raw text documents, or a single string, which
            is treated as one document.
        fitted_vectorizer: An object exposing ``transform(documents)`` that
            has already been fitted (e.g. the notebook's TF-IDF vectorizer).

    Returns:
        Whatever ``fitted_vectorizer.transform`` returns for the documents.
    """
    # A bare string is itself an iterable (of characters); wrap it so callers
    # can pass a single review without building a list first.
    if isinstance(texts, str):
        texts = [texts]
    return fitted_vectorizer.transform(texts)

print("\n--- Testing the text_preprocess_vectorize function and prediction ---")

# Hand-written reviews, annotated with the sentiment a reader would expect.
new_texts = [
    "This is an amazing product, I love it!",            # expected: good
    "The battery life is terrible and it's very slow.",  # expected: bad
    "It's okay, not great but not bad either.",          # ambiguous: mixes 'great' and 'bad'
    "Wonderful customer support and fast delivery."      # expected: good
]

# Vectorize with the already-fitted vectorizer, then classify.
new_texts_vectorized = text_preprocess_vectorize(new_texts, vectorizer)
new_predictions = model.predict(new_texts_vectorized)

# Show each review next to the label the model assigned to it.
for text, predicted in zip(new_texts, new_predictions):
    print(f"Text: '{text}' -> Predicted Label: {predicted}")


--- Testing the text_preprocess_vectorize function and prediction ---
Text: 'This is an amazing product, I love it!' -> Predicted Label: good
Text: 'The battery life is terrible and it's very slow.' -> Predicted Label: bad
Text: 'It's okay, not great but not bad either.' -> Predicted Label: good
Text: 'Wonderful customer support and fast delivery.' -> Predicted Label: good
