In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [10]:
# Shared text-processing resources used by the preprocessing step.
stemmer = PorterStemmer()

# The stop-word corpus has to be available locally before it can be read.
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shail\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Read the feedback dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "feedback.csv",
    encoding="iso-8859-1",
)

In [12]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Reviews,Feedback
0,teacher are punctual but they should also give...,3
1,University teaching here is very much dependen...,3
2,"Talk about punctuality, it's really good but t...",3
3,Punctuality is good. Most of the teachers are ...,3
4,"Interaction is very good,accurate supplement m...",3


In [13]:
def preprocess(text):
    """Normalise one review before it is vectorised.

    The text has literal ``<br />`` tags replaced by a space, is lowercased,
    split on whitespace, stripped of English stop words, and each remaining
    token is reduced to its Porter stem.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and float ``NaN`` are treated as an empty
        review; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (``""`` for a missing
        review).
    """
    # A missing value has no ``.replace`` method and would raise
    # AttributeError below. NaN is the only float that is unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Remove unnecessary characters
    text = text.replace("<br />", " ")
    # Convert to lowercase
    text = text.lower()

    # Build the lookup once per call: set membership is O(1), whereas the
    # stop-word list would be scanned again for every token.
    stop_set = set(stop_words)

    # Remove stop words and stem words
    words = [stemmer.stem(word) for word in text.split() if word not in stop_set]
    return " ".join(words)

In [14]:
# Clean every review, then write the result back over the raw text column.
stemmed_reviews = df["Reviews"].apply(preprocess)
df["Reviews"] = stemmed_reviews

In [15]:
# Inspect the first few cleaned reviews (lowercased, stop words removed, stemmed).
df["Reviews"].head()

0    teacher punctual also give us practic knowledg...
1    univers teach much depend upon slides,though e...
2    talk punctuality, realli good lectur deliveri ...
3            punctual good. teacher interact student .
4    interact good,accur supplement materi provid r...
Name: Reviews, dtype: object

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Reviews"],
    df["Feedback"],
    test_size=0.2,
    random_state=42,
)


In [17]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
# The vocabulary and IDF weights are fitted on the training reviews only ...
X_train_vect = vectorizer.fit_transform(X_train)
# ... and the test reviews are only transformed with that fitted vectorizer,
# so nothing is learned from the held-out data.
X_test_vect = vectorizer.transform(X_test)

In [18]:
# Printing the matrix lists each stored entry as "(row, column)<TAB>weight".
print(X_test_vect)

  (0, 4141)	0.21319907113011285
  (0, 4137)	0.1996988784832687
  (0, 4066)	0.16847430785386142
  (0, 3900)	0.30742343491568025
  (0, 3876)	0.11854077477861863
  (0, 3873)	0.25603757605439065
  (0, 3806)	0.09049251406009626
  (0, 3753)	0.09156001995122072
  (0, 3681)	0.150823052439513
  (0, 3659)	0.20751006334185337
  (0, 3424)	0.21999554143486674
  (0, 3379)	0.16905291939053302
  (0, 3227)	0.17023965002008515
  (0, 3082)	0.1646715584175756
  (0, 2396)	0.14145281103586763
  (0, 2365)	0.17406219318094052
  (0, 2279)	0.12171213730754792
  (0, 2108)	0.13493826333658332
  (0, 1940)	0.2354619857228718
  (0, 1522)	0.19573178507859812
  (0, 1490)	0.16416052450993895
  (0, 1295)	0.2739191296741922
  (0, 942)	0.17474192717879247
  (0, 847)	0.18893531477384423
  (0, 773)	0.19630153686064136
  :	:
  (689, 3753)	0.19830379704240023
  (689, 3745)	0.19750449301997577
  (689, 3674)	0.17361377977823322
  (689, 3114)	0.24163430979811248
  (689, 2806)	0.1794572814870563
  (689, 2765)	0.1660515917957274
 

In [19]:
# Train the model using LinearSVC.
# The seed is fixed (same value as the train/test split) so that re-running
# the notebook fits the same model; LinearSVC uses random_state when shuffling
# data in its dual coordinate-descent solver.
model = LinearSVC(random_state=42)
model.fit(X_train_vect, y_train)



In [20]:
# Evaluate the model on the testing set
y_pred = model.predict(X_test_vect)
accuracy = accuracy_score(y_test, y_pred)

# average="weighted" averages the per-class scores weighted by class support.
# zero_division=0 states explicitly what the default ("warn") already does:
# a class with an undefined score (e.g. one the model never predicts) scores 0.
# The reported values are unchanged; only the UndefinedMetricWarning goes away.
precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.5224312590448625
Precision: 0.4977553099654219
Recall: 0.5224312590448625
F1-score: 0.5060354056557651


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Score one review that is not part of the dataset.
# This cell needs `vectorizer`, `model` and `preprocess` from the cells above.
external_review = "Greatest Trip Ever"
cleaned_review = preprocess(external_review)

# The vectorizer expects an iterable of documents, hence the one-element list.
external_review_vect = vectorizer.transform([cleaned_review])
predicted_labels = model.predict(external_review_vect)
external_feedback = predicted_labels[0]
print("Feedback value for external review:", external_feedback)

NameError: name 'vectorizer' is not defined

In [32]:
import pickle

# Persist the fitted classifier and the fitted vectorizer side by side;
# both are needed later to score new text.
for artefact, filename in (
    (model, "feedback_predict.pkl"),
    (vectorizer, "feedback_vectorizer.pkl"),
):
    with open(filename, "wb") as sink:
        pickle.dump(artefact, sink)

Manual Testing

In [2]:
import pickle

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
# Shared text-processing resources used by the preprocessing step.
stemmer = PorterStemmer()

# The stop-word corpus has to be available locally before it can be read.
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sujal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def preprocess(text):
    """Normalise one review before it is vectorised.

    The text has literal ``<br />`` tags replaced by a space, is lowercased,
    split on whitespace, stripped of English stop words, and each remaining
    token is reduced to its Porter stem. For ordinary strings the output is
    the same as before this guard was added, so it stays compatible with a
    vectorizer fitted on text cleaned the original way.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and float ``NaN`` are treated as an empty
        review; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (``""`` for a missing
        review).
    """
    # A missing value has no ``.replace`` method and would raise
    # AttributeError below. NaN is the only float that is unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Remove unnecessary characters
    text = text.replace("<br />", " ")
    # Convert to lowercase
    text = text.lower()

    # Build the lookup once per call: set membership is O(1), whereas the
    # stop-word list would be scanned again for every token.
    stop_set = set(stop_words)

    # Remove stop words and stem words
    words = [stemmer.stem(word) for word in text.split() if word not in stop_set]
    return " ".join(words)

In [10]:
# Restore the persisted classifier and vectorizer.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open("feedback_predict.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("feedback_vectorizer.pkl", "rb") as vectorizer_file:
    vector = pickle.load(vectorizer_file)

# Score one review that is not part of the dataset.
external_review = "worst turf"
cleaned_review = preprocess(external_review)

# The vectorizer expects an iterable of documents, hence the one-element list.
external_review_vect = vector.transform([cleaned_review])
predicted_labels = model.predict(external_review_vect)
external_feedback = predicted_labels[0]
print("Feedback value for external review:", external_feedback)

Feedback value for external review: 2
