In [None]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import string


In [None]:
import kagglehub
import os

# Download the latest version of the dataset and report where it was stored.
path = kagglehub.dataset_download("vishweshsalodkar/customer-feedback-dataset")

print("Path to dataset files:", path)

# Pick the first CSV in the download folder. The listing is sorted so the
# choice is reproducible, and a missing CSV fails loudly here: previously
# `csv_file` was simply left undefined (or stale from an earlier run) and the
# read below failed with an unrelated-looking NameError.
csv_file = next(
    (
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith(".csv")
    ),
    None,
)
if csv_file is None:
    raise FileNotFoundError(f"No .csv file found in {path!r}")

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

print(df.head())


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/jasmikawadhwa/.cache/kagglehub/datasets/vishweshsalodkar/customer-feedback-dataset/versions/1
  Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score
0  "I love this product!", Positive, Twitter, 202...                     
1  "The service was terrible.", Negative, Yelp Re...                     
2  "This movie is amazing!", Positive, IMDb, 2023...                     
3  "I'm so disappointed with their customer suppo...                     
4  "Just had the best meal of my life!", Positive...                     


In [6]:
# Display the column labels with surrounding whitespace stripped. The result is
# only shown, not assigned, so df.columns itself is left unchanged.
df.columns.str.strip()

Index(['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'], dtype='object')

In [19]:
def preprocess_text(review):
    """Normalise one review string before vectorisation.

    Steps, in order: lowercase the text, delete http(s)/www URLs, delete every
    character that is not an ASCII letter, digit or whitespace, then collapse
    whitespace runs to a single space and trim both ends.

    Args:
        review: the raw review text (must be a ``str``).

    Returns:
        The cleaned, lowercase string.
    """
    lowered = review.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', '', without_urls)
    return re.sub(r'\s+', ' ', alnum_only).strip()
# Build the model input column: cast every 'Text' cell to str, then normalise
# it with preprocess_text. Requires df to already have a 'Text' column.
df['cleaned_review'] = df['Text'].astype(str).apply(preprocess_text)



In [23]:
# Report how many missing values each modelling column contains.
for label, column in (
    ("NaNs in cleaned_review:", 'cleaned_review'),
    ("NaNs in Sentiment:", 'Sentiment'),
):
    print(label, df[column].isna().sum())


NaNs in cleaned_review: 0
NaNs in Sentiment: 2


In [26]:
# Check the label column for both real NaNs and strings that are empty once
# surrounding whitespace is removed.
sentiment = df['Sentiment']
print("NaN count in Sentiment:", sentiment.isna().sum())
print("Blank/Empty strings in Sentiment:", (sentiment.str.strip() == '').sum())


NaN count in Sentiment: 0
Blank/Empty strings in Sentiment: 0


In [27]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a new column.
df = df.reset_index(drop=True)


In [28]:
# Show how many rows carry each distinct Sentiment label.
print(df['Sentiment'].value_counts())


Sentiment
Positive     53
Negative     43
Sentiment     1
Name: count, dtype: int64


In [None]:
# Tally the rows per Sentiment label and display the tally.
sentiment_counts = df['Sentiment'].value_counts()
print("Sentiment class counts:\n", sentiment_counts)

# Labels that occur at least twice are kept; singleton labels are discarded.
valid_classes = [label for label, count in sentiment_counts.items() if count > 1]

# Keep only rows with a retained label and renumber them from zero.
df = df.loc[df['Sentiment'].isin(valid_classes)].reset_index(drop=True)


Sentiment class counts:
 Sentiment
Positive     53
Negative     43
Sentiment     1
Name: count, dtype: int64


In [30]:
# Features are the cleaned review text; the target is the sentiment label.
X, y = df['cleaned_review'], df['Sentiment']

# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Display the whitespace-stripped column labels. Nothing is assigned, so the
# DataFrame's columns are not modified by this cell.
df.columns.str.strip()

Index(['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'], dtype='object')

In [None]:
# Model input (cleaned text) and target (sentiment label).
y = df['Sentiment']
X = df['cleaned_review']

# 80/20 split with a fixed seed; stratify=y keeps the label proportions the
# same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
# Peek at the raw first line to see how the header is actually written on disk.
# Decode explicitly as UTF-8 (the encoding pandas.read_csv assumes by default)
# instead of relying on the platform's locale encoding.
with open(csv_file, 'r', encoding='utf-8') as f:
    print(f.readline())


"Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score"



In [13]:
# Read every physical line of the file as a data row (header=None), so the
# header line becomes row 0 of the frame. The earlier
# read_csv(sep=",", quotechar='"') call was removed: its result was
# overwritten immediately, so it only cost an extra pass over the file.
df = pd.read_csv(csv_file, header=None)


In [12]:
# Print the column labels as a plain Python list to see how the CSV was parsed.
print(df.columns.tolist())


['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score']


In [14]:
# Preview the first five rows of the current DataFrame.
print(df.head())


                                                   0
0  Text, Sentiment, Source, Date/Time, User ID, L...
1  "I love this product!", Positive, Twitter, 202...
2  "The service was terrible.", Negative, Yelp Re...
3  "This movie is amazing!", Positive, IMDb, 2023...
4  "I'm so disappointed with their customer suppo...


In [15]:
# Break each raw line (column 0) into its comma-separated fields, then strip
# the blank that follows every comma in the file so values come out as
# 'Positive' rather than ' Positive'.
# NOTE: a comma inside the review text itself is still treated as a separator.
df = (
    df[0]
    .str.split(",", expand=True)
    .apply(lambda field: field.str.strip())
)


In [16]:
# Assign the seven field names. pandas raises a ValueError here if df does not
# currently have exactly seven columns.
df.columns = ['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location', 'Confidence Score']


In [17]:
# Add the normalised text column: every 'Text' value is cast to str and passed
# through preprocess_text.
df['cleaned_review'] = df['Text'].astype(str).apply(preprocess_text)


In [8]:
# Diagnostic only: check whether skipinitialspace=True makes pandas split the
# header into separate columns. The result goes into a scratch frame rather
# than `df`, so this probe no longer overwrites the working DataFrame that the
# following cells read 'Text' and 'Sentiment' from.
df_probe = pd.read_csv(csv_file, skipinitialspace=True)
df_probe.columns = df_probe.columns.str.strip()  # remove leading/trailing whitespace
print(df_probe.columns)


Index(['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'], dtype='object')


In [33]:
def preprocess_text(text):
  """Normalise one review string before vectorisation.

  Lowercases the text, removes http(s)/www URLs, removes every character that
  is not an ASCII letter, digit or whitespace, then collapses whitespace runs
  to one space and trims the ends.

  Args:
    text: the raw review text (must be a ``str``).

  Returns:
    The cleaned, lowercase string.
  """
  text = text.lower()
  text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
  # This statement had been swallowed into the trailing comment of the line
  # above, so punctuation was never actually removed.
  text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # remove punctuation
  text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespaces
  return text
# Recompute the cleaned text column with the preprocess_text defined in this
# cell; 'Text' is cast to str first so every value supports .lower().
df['cleaned_review'] = df['Text'].astype(str).apply(preprocess_text) 

In [35]:
# Features are the freshly cleaned review text; the target is the label.
# The feature series must be bound to `X` (capital): the previous lowercase `x`
# was never passed to train_test_split, which silently reused whatever stale
# `X` an earlier cell had left in the kernel.
X = df['cleaned_review']
y = df['Sentiment']

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

In [37]:
# TF-IDF features: vocabulary capped at 5000 terms, English stop words dropped.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# The vectorizer is fitted on the training split only; the test split is then
# transformed with that same fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [38]:
# Logistic-regression classifier with the solver's iteration cap set to 1000,
# trained on the TF-IDF features of the training split.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_tfidf, y=y_train)

In [39]:
# Predict a sentiment label for every row of the transformed test split.
y_pred = model.predict(X_test_tfidf)

In [40]:
# Compute the three evaluation summaries first, then print them in order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

Accuracy: 0.95

Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      0.89      0.94         9
    Positive       0.92      1.00      0.96        11

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20


Confusion Matrix:
 [[ 8  1]
 [ 0 11]]


In [41]:
# Two hand-written reviews used as a quick sanity check of the trained model.
test_examples = [
    "I loved the product! Great experience.",
    "Worst service ever. Very disappointed.",
]

# Same pipeline as the training data: clean, vectorise with the already fitted
# TF-IDF vectorizer, then classify.
test_examples_clean = list(map(preprocess_text, test_examples))
test_examples_tfidf = tfidf.transform(test_examples_clean)
pred_examples = model.predict(test_examples_tfidf)

for review, pred in zip(test_examples, pred_examples):
    print(f"Review: {review}\nPredicted Sentiment: {pred}\n")


Review: I loved the product! Great experience.
Predicted Sentiment:  Positive

Review: Worst service ever. Very disappointed.
Predicted Sentiment:  Negative

