In [1]:
#libraries
import numpy as np
import pandas as pd
from google.colab import drive
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import joblib


In [2]:
drive.mount('/content/gdrive')
data_set = pd.read_csv('/content/gdrive/MyDrive/McDonald_s_Food_Reviews.csv',encoding='latin-1')

Mounted at /content/gdrive


In [3]:
# Print first 5 rows
print("Top 5 rows of dataset")
print(data_set.describe())

Top 5 rows of dataset
        reviewer_id     latitude      longitude
count  33396.000000  32736.000000  32736.000000
mean   16698.500000     34.442546    -90.647033
std     9640.739131      5.344116     16.594844
min        1.000000     25.790295   -121.995421
25%     8349.750000     28.655350    -97.792874
50%    16698.500000     33.931261    -81.471414
75%    25047.250000     40.727401    -75.399919
max    33396.000000     44.981410    -73.459820


In [4]:
# Print random 10 rows
print("Random 10 rows")
print(data_set.sample(10))

Random 10 rows
       reviewer_id  store_name              category  \
12800        12801  McDonald's  Fast food restaurant   
18860        18861  McDonald's  Fast food restaurant   
23680        23681  McDonald's  Fast food restaurant   
29498        29499  McDonald's  Fast food restaurant   
10882        10883  McDonald's  Fast food restaurant   
31128        31129  McDonald's  Fast food restaurant   
23992        23993  McDonald's  Fast food restaurant   
5784          5785  McDonald's  Fast food restaurant   
20005        20006  McDonald's  Fast food restaurant   
31284        31285  McDonald's  Fast food restaurant   

                                           store_address  latitude   \
12800  501 W Imperial Hwy, Los Angeles, CA 90044, Uni...  33.931261   
18860  1100 N US Hwy 377, Roanoke, TX 76262, United S...  33.009318   
23680  1415 E State Rd, Fern Park, FL 32730, United S...  28.655350   
29498  1415 E State Rd, Fern Park, FL 32730, United S...  28.655350   
10882  1650 W

In [5]:
# Print total number of rows and columns in dataset
print("Total rows and columns in dataset")
print(data_set.shape)

Total rows and columns in dataset
(33396, 10)


In [6]:
# Print number of rows and columns in dataset separately
print("Total Rows =", data_set.shape[0])
print("Total Columns =", data_set.shape[1])

Total Rows = 33396
Total Columns = 10


In [7]:
print("Columns in dataset")
print(data_set.columns)

Columns in dataset
Index(['reviewer_id', 'store_name', 'category', 'store_address', 'latitude ',
       'longitude', 'rating_count', 'review_time', 'review', 'rating'],
      dtype='object')


In [8]:
#Check information of dataset
print("Dataset information")
print(data_set.info())

Dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reviewer_id    33396 non-null  int64  
 1   store_name     33396 non-null  object 
 2   category       33396 non-null  object 
 3   store_address  33396 non-null  object 
 4   latitude       32736 non-null  float64
 5   longitude      32736 non-null  float64
 6   rating_count   33396 non-null  object 
 7   review_time    33396 non-null  object 
 8   review         33396 non-null  object 
 9   rating         33396 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 2.5+ MB
None


In [9]:
# Drop unnecessary columns

print("Dropping extra columns")
columns_to_drop = ['store_name', 'category', 'latitude ', 'longitude']
data_set.drop(columns=columns_to_drop, inplace=True)  # Use inplace=True to modify data_set in place

Dropping extra columns


In [10]:
data_set.describe()

Unnamed: 0,reviewer_id
count,33396.0
mean,16698.5
std,9640.739131
min,1.0
25%,8349.75
50%,16698.5
75%,25047.25
max,33396.0


In [11]:
data_set.head()

Unnamed: 0,reviewer_id,store_address,rating_count,review_time,review,rating
0,1,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [12]:
#Check for duplicate values
print("Checking for duplicate data")
print("Total Duplicated values =", data_set.duplicated().sum())

Checking for duplicate data
Total Duplicated values = 0


In [13]:
#Check for null values
print("Checking for null values")
print("Total NULL values =\n\n",data_set.isnull().sum())

Checking for null values
Total NULL values =

 reviewer_id      0
store_address    0
rating_count     0
review_time      0
review           0
rating           0
dtype: int64


In [14]:
print("Sort by no of ratings")
print(data_set['rating'].value_counts())

Sort by no of ratings
5 stars    10274
1 star      9431
4 stars     5787
3 stars     4818
2 stars     3086
Name: rating, dtype: int64


In [None]:
nltk.download('all')

Text -> Convert Accented Characters -> Tokenization -> Stopwords Removal -> Removing Numbers and Extra Whitespaces -> Lemmatization -> Stemming -> Removing Single Letters

In [18]:
stopwords_set = set(stopwords.words('english'))
def processing(text):

    # Step 1: Remove Accented Characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Step 2: Tokenization
    tokens = word_tokenize(text)

    # Step 3: Stopwords Removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Step 4: Remove Numbers and Extra Whitespaces
    filtered_tokens = [re.sub(r'\d+', '', word) for word in filtered_tokens]
    filtered_tokens = [word.strip() for word in filtered_tokens if word.strip()]

    # Step 5: Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Step 6: Stemming (optional)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]

    # Step 7: Remove Single Letters
    filtered_tokens = [word for word in stemmed_tokens if len(word) > 1]

    # Join the tokens back into a clean text string
    clean_text = ' '.join(filtered_tokens)


    return clean_text

In [19]:
# Example text with unreadable characters
# text = "Worst McDonaldï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿"
# clean_text="Worst McDonald"

# def convert_time_to_days(time_str):
#     time_str=str(time_str)
#     if "month" in time_str:
#       if time_str[0] == "a":
#         return 30
#       else:
#         return 30*int(time_str[0])
#     elif "year" in time_str:
#       if time_str[0] == "a":
#         return 365
#       else:
#         return 365*int(time_str[0])
#     elif "week" in time_str:
#       if time_str[0] == "a":
#         return 7
#       else:
#         return 7*int(time_str[0])
#     elif "day" in time_str:
#       if time_str[0] == "a":
#         return 1
#       else:
#         return int(time_str[0])
#     else:
#         return None

i=0
for reviews in data_set['review']:
  data_set.at[i,'review'] = processing(reviews)
  # data_set.at[i,'rating']= data_set.at[i,'rating'][0]
  # data_set.at[i,'review_time'] = convert_time_to_days(data_set.at[i,'review_time'])
  # print(convert_time_to_days(data_set.at[i,'review_time']))
  # if i==5:
  #   break
  i+=1
data_set.head()



Unnamed: 0,reviewer_id,store_address,rating_count,review_time,review,rating
0,1,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,3 months ago,look like someon spit food normal transact eve...,1 star
1,2,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,5 days ago,'d mcdonald far food atmospher go staff make d...,4 stars
2,3,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,5 days ago,made mobil order got speaker check line move l...,1 star
3,4,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,a month ago,mc crispi chicken sandwich iiiiiiiiiiiiiiiiiii...,5 stars
4,5,"13749 US-183 Hwy, Austin, TX 78750, United States",1240,2 months ago,repeat order time drive thru still manag mess ...,1 star


In [20]:
random_sample = data_set.sample(n=20, random_state=42)
random_sample

Unnamed: 0,reviewer_id,store_address,rating_count,review_time,review,rating
28477,28478,"5725 W Irlo Bronson Memorial Hwy, Kissimmee, F...",5567,2 years ago,somehow forgot give order ... big deal need ex...,2 stars
20807,20808,"2400 Alliance Gateway Fwy, Fort Worth, TX 7617...",957,2 years ago,neutral,3 stars
23801,23802,"1415 E State Rd, Fern Park, FL 32730, United S...",1617,a year ago,drive thru littl slow friendli peopl food par,5 stars
4534,4535,"724 Broadway, New York, NY 10003, United States",1670,4 years ago,normal mcdonald 's much els say,3 stars
27250,27251,"10901 Riverside Dr, North Hollywood, CA 91602,...",1794,4 years ago,good servic,5 stars
866,867,"1698 US-209, Brodheadsville, PA 18322, United ...",1028,3 years ago,went drive-through order doubl mac meal receiv...,1 star
30348,30349,"9814 International Dr, Orlando, FL 32819, Unit...",5468,5 years ago,stop januari th order counter signatur classic...,3 stars
25313,25314,"632 S R L Thornton Freeway Service Rd, Dallas,...",2657,4 months ago,realli enjoy breakfast iiiiiiiiiiiiiiiiiiiiiii...,4 stars
7729,7730,"550 Lawrence Expy, Sunnyvale, CA 94086, United...",1562,2 years ago,neutral,3 stars
28389,28390,"5725 W Irlo Bronson Memorial Hwy, Kissimmee, F...",5567,4 years ago,terribl .. employe name andrea wors custom ser...,1 star


In [21]:
sia = SentimentIntensityAnalyzer()
sentiments = []

# text_weight = 0.6
# time_weight = 0.2
# count_weight = 0.1
# store_weight = 0.1

# for index, row in data_set.iterrows():
#     compound_score = 0.0  # Initialize the compound score for each row

#     # Iterate through the columns dynamically
#     for column in data_set.columns:
#         if column == 'review':
#             text = row[column]
#             sentiment = sia.polarity_scores(text)
#             compound_score += (text_weight * sentiment['compound'])
#         elif column == 'review_time':
#             time = row[column]
#             sentiment = sia.polarity_scores(time)
#             compound_score += (time_weight * sentiment['compound'])
#         elif column == 'rating_count':
#             count = row[column]
#             sentiment = sia.polarity_scores(count)
#             compound_score += (count_weight * sentiment['compound'])
#         elif column == 'store_address':
#             address = row[column]
#             sentiment = sia.polarity_scores(address)
#             compound_score += (store_weight * sentiment['compound'])
#         else:
#             # You can handle other columns as needed
#             pass

#     sentiments.append(compound_score)


for review in data_set['review']:
    sentiment = sia.polarity_scores(review)
    sentiments.append(sentiment)

In [22]:
i=0
while i!=10:
  print(sentiments[i])
  i+=1

{'neg': 0.047, 'neu': 0.769, 'pos': 0.184, 'compound': 0.5541}
{'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'compound': 0.7003}
{'neg': 0.059, 'neu': 0.941, 'pos': 0.0, 'compound': -0.25}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.2, 'neu': 0.8, 'pos': 0.0, 'compound': -0.7184}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.025, 'neu': 0.951, 'pos': 0.023, 'compound': -0.0423}
{'neg': 0.034, 'neu': 0.813, 'pos': 0.153, 'compound': 0.7003}
{'neg': 0.125, 'neu': 0.875, 'pos': 0.0, 'compound': -0.25}
{'neg': 0.024, 'neu': 0.636, 'pos': 0.34, 'compound': 0.9759}


In [23]:
sentiment_labels = []

for sentiment in sentiments:
    compound_score = sentiment['compound']
    # compound_score = sentiment
    if compound_score >= 0.05:
        sentiment_labels.append(1)
    elif compound_score <= -0.05:
        sentiment_labels.append(-1)
    else:
        sentiment_labels.append(0)

In [24]:
# Add the sentiment labels to the DataFrame

#PPNONPPNPP
data_set['sentiment'] = sentiment_labels
data_set[['review', 'sentiment']]

Unnamed: 0,review,sentiment
0,look like someon spit food normal transact eve...,1
1,'d mcdonald far food atmospher go staff make d...,1
2,made mobil order got speaker check line move l...,-1
3,mc crispi chicken sandwich iiiiiiiiiiiiiiiiiii...,0
4,repeat order time drive thru still manag mess ...,-1
...,...,...
33391,treat badli,1
33392,servic good,1
33393,remov hunger enough,-1
33394,'s good late becom expens,1


In [25]:
X = data_set['review']
y = data_set['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [27]:
# Create a LinearSVC model

linear_svc_model = LinearSVC()

# Fit the model to the TF-IDF training data
linear_svc_model.fit(X_train_tfidf, y_train)

# Predict on the TF-IDF test data
linear_svc_y_pred = linear_svc_model.predict(X_test_tfidf)

# Calculate accuracy and print the results
linear_svc_accuracy = accuracy_score(y_test, linear_svc_y_pred)
print("LinearSVC Accuracy:", linear_svc_accuracy*100)
print("Classification Report:")
print(classification_report(y_test, linear_svc_y_pred))

LinearSVC Accuracy: 93.83233532934132
Classification Report:
              precision    recall  f1-score   support

          -1       0.90      0.86      0.88      1384
           0       0.94      0.96      0.95      1745
           1       0.95      0.96      0.95      3551

    accuracy                           0.94      6680
   macro avg       0.93      0.93      0.93      6680
weighted avg       0.94      0.94      0.94      6680



In [28]:
# Create a MultinomialNB model
naive_bayes_model = MultinomialNB()

# Fit the model to the TF-IDF training data
naive_bayes_model.fit(X_train_tfidf, y_train)

# Predict on the TF-IDF test data
naive_bayes_y_pred = naive_bayes_model.predict(X_test_tfidf)

# Calculate accuracy and print the results
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_y_pred)
print("Multinomial Naive Bayes Accuracy:", naive_bayes_accuracy*100)
print("Classification Report:")
print(classification_report(y_test, naive_bayes_y_pred))

Multinomial Naive Bayes Accuracy: 74.8502994011976
Classification Report:
              precision    recall  f1-score   support

          -1       0.85      0.61      0.71      1384
           0       0.96      0.40      0.56      1745
           1       0.70      0.98      0.81      3551

    accuracy                           0.75      6680
   macro avg       0.84      0.66      0.69      6680
weighted avg       0.80      0.75      0.73      6680



In [29]:
# Create a Logistic Regression model
logistic_regression_model = LogisticRegression()

# Fit the model to the TF-IDF training data
logistic_regression_model.fit(X_train_tfidf, y_train)

# Predict on the TF-IDF test data
logistic_regression_y_pred = logistic_regression_model.predict(X_test_tfidf)

# Calculate accuracy and print the results
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_y_pred)
print("Logistic Regression Accuracy:", logistic_regression_accuracy*100)
print("Classification Report:")
print(classification_report(y_test, logistic_regression_y_pred))

Logistic Regression Accuracy: 91.96107784431138
Classification Report:
              precision    recall  f1-score   support

          -1       0.89      0.80      0.84      1384
           0       0.89      0.96      0.92      1745
           1       0.95      0.95      0.95      3551

    accuracy                           0.92      6680
   macro avg       0.91      0.90      0.90      6680
weighted avg       0.92      0.92      0.92      6680



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# Create a Random Forest model
random_forest_model = RandomForestClassifier()

# Fit the model to the TF-IDF training data
random_forest_model.fit(X_train_tfidf, y_train)

# Predict on the TF-IDF test data
random_forest_y_pred = random_forest_model.predict(X_test_tfidf)

# Calculate accuracy and print the results
random_forest_accuracy = accuracy_score(y_test, random_forest_y_pred)
print("Random Forest Accuracy:", random_forest_accuracy*100)
print("Classification Report:")
print(classification_report(y_test, random_forest_y_pred))

Random Forest Accuracy: 92.09580838323353
Classification Report:
              precision    recall  f1-score   support

          -1       0.92      0.79      0.85      1384
           0       0.90      0.94      0.92      1745
           1       0.93      0.96      0.94      3551

    accuracy                           0.92      6680
   macro avg       0.92      0.90      0.91      6680
weighted avg       0.92      0.92      0.92      6680



In [31]:
model1 = LinearSVC()
model2 = MultinomialNB()
model3 = LogisticRegression()

ensemble_model = VotingClassifier(
    estimators=[('model1', model1), ('model2', model2), ('model3', model3)],
    voting='hard'  # You can choose 'hard' or 'soft' voting depending on your use case
)


# Fit the ensemble model on the training data
ensemble_model.fit(X_train_tfidf, y_train)

# Make predictions using the ensemble model
y_pred = ensemble_model.predict(X_test_tfidf)

# Evaluate the performance of the ensemble model (you can use different metrics)
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble Model Accuracy: {accuracy:.2f}')

# Finally, you can use the trained ensemble model to make predictions on your test data
y_test_pred = ensemble_model.predict(X_test_tfidf)

Ensemble Model Accuracy: 0.94


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Save the ensemble model
joblib.dump(ensemble_model, 'ensemble_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [33]:
loaded_model = joblib.load('ensemble_model.joblib')
loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')
Input="The delectable aroma of the freshly baked bread filled the air as I stepped into the cozy bakery. The warm, crusty baguette I ordered was a delightful treat. It had a perfect crunch on the outside and a soft, airy interior. Absolutely delicious!"
input_vector = loaded_vectorizer.transform([processing(Input)])
predicted_rating = loaded_model.predict(input_vector)
print("Predicted Rating:", predicted_rating)

Predicted Rating: [1]


##gpt prompt

give me food review inputs to check if my model is good or not give one positive one neutral and one negative inputs and make inputs so hard to guess so i can check my classifier


Sure, here are some food review inputs with varying sentiments for testing your classifier:

Positive Review:

Input: "The delectable aroma of the freshly baked bread filled the air as I stepped into the cozy bakery. The warm, crusty baguette I ordered was a delightful treat. It had a perfect crunch on the outside and a soft, airy interior. Absolutely delicious!"


Neutral Review:

Input: "I visited the restaurant last night. The service was decent, and the ambiance was pleasant. However, the food was just average. It didn't stand out, but it wasn't terrible either."


Negative Review:

Input: "My experience at the restaurant was highly disappointing. The waitstaff seemed disinterested, and it took forever to get our orders. When the food finally arrived, it was cold, tasteless, and overpriced. I won't be returning."
These reviews cover a range of sentiments from positive to neutral to negative. You can use these inputs to check how your classifier predicts the sentiment of each review and evaluate its performance.

In [34]:
# #Vectorizing the Words
# tf_vec = TfidfVectorizer()
# #['store address','rating_count','review_time','review']
# features = tf_vec.fit_transform(data_set['store address','rating_count','review_time','review'])
# X = features
# y = data_set['rating']


# vect=TfidfVectorizer()
# StoreAddress=vect.fit_transform(data_set['store_address'])
# Review=vect.fit_transform(data_set['review'])

# X_numerical = data_set[['rating_count', 'review_time']].values

# X_combined = np.hstack((X_numerical, StoreAddress.toarray(), Review.toarray()))


# X = np.asarray(data_set[['store_address','rating_count','review_time','review']])
# Y = np.asarray(data_set['rating'])

In [35]:
# label_encoder = LabelEncoder()
# data_set['rating'] = label_encoder.fit_transform(data_set['rating'])

In [36]:
# #Splitting the Dataset into Train and Test Sets

# X = data_set[['store_address', 'rating_count', 'review_time', 'review']]
# y = data_set['rating']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
# # LinearSVC
# linear_svc_model = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', LinearSVC())
# ])

# # Naive Bayes
# naive_bayes_model = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB())
# ])

# # Logistic Regression
# logistic_regression_model = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', LogisticRegression(max_iter=1000))
# ])

# random_forest_model = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', RandomForestClassifier())  # Use RandomForestClassifier here
# ])

# decision_tree_model = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', DecisionTreeClassifier())  # Use DecisionTreeClassifier here
# ])

In [38]:
# # Fit and evaluate the LinearSVC model
# linear_svc_model.fit(X_train['review'], y_train)
# linear_svc_predictions = linear_svc_model.predict(X_test['review'])
# linear_svc_accuracy = accuracy_score(y_test, linear_svc_predictions)
# linear_svc_confusionmatrix = confusion_matrix(y_test, linear_svc_predictions)
# linear_svc_classificationreport = classification_report(y_test, linear_svc_predictions)

# # Fit and evaluate the Naive Bayes model
# naive_bayes_model.fit(X_train['review'], y_train)
# naive_bayes_predictions = naive_bayes_model.predict(X_test['review'])
# naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_predictions)
# naive_bayes_confusionmatrix = confusion_matrix(y_test, naive_bayes_predictions)
# naive_bayes_classificationreport = classification_report(y_test, naive_bayes_predictions)

# # Fit and evaluate the Logistic Regression model
# logistic_regression_model.fit(X_train['review'], y_train)
# logistic_regression_predictions = logistic_regression_model.predict(X_test['review'])
# logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predictions)
# logistic_regression_confusionmatrix = confusion_matrix(y_test, logistic_regression_predictions)
# logistic_regression_classificationreport = classification_report(y_test, logistic_regression_predictions)

# print("LinearSVC Metrics:")
# print(f"Accuracy: {linear_svc_accuracy}")
# print("Confusion Matrix:")
# print(linear_svc_confusionmatrix)
# print("Classification Report:")
# print(linear_svc_classificationreport)
# print("\n")

# print("Naive Bayes Metrics:")
# print(f"Accuracy: {naive_bayes_accuracy}")
# print("Confusion Matrix:")
# print(naive_bayes_confusionmatrix)
# print("Classification Report:")
# print(naive_bayes_classificationreport)
# print("\n")

# print("Logistic Regression Metrics:")
# print(f"Accuracy: {logistic_regression_accuracy}")
# print("Confusion Matrix:")
# print(logistic_regression_confusionmatrix)
# print("Classification Report:")
# print(logistic_regression_classificationreport)


In [39]:
# # Fit and evaluate the Random Forest model
# random_forest_model.fit(X_train['review'], y_train)
# random_forest_predictions = random_forest_model.predict(X_test['review'])
# random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)
# random_forest_confusionmatrix = confusion_matrix(y_test, random_forest_predictions)
# random_forest_classificationreport = classification_report(y_test, random_forest_predictions)

# # Print the metrics for the Random Forest model
# print("Random Forest Metrics:")
# print(f"Accuracy: {random_forest_accuracy}")
# print("Confusion Matrix:")
# print(random_forest_confusionmatrix)
# print("Classification Report:")
# print(random_forest_classificationreport)

In [40]:
# # Fit and evaluate the Decision Tree model
# decision_tree_model.fit(X_train['review'], y_train)
# decision_tree_predictions = decision_tree_model.predict(X_test['review'])
# decision_tree_accuracy = accuracy_score(y_test, decision_tree_predictions)
# decision_tree_confusionmatrix = confusion_matrix(y_test, decision_tree_predictions)
# decision_tree_classificationreport = classification_report(y_test, decision_tree_predictions)

# # Print the metrics for the Decision Tree model
# print("Decision Tree Metrics:")
# print(f"Accuracy: {decision_tree_accuracy}")
# print("Confusion Matrix:")
# print(decision_tree_confusionmatrix)
# print("Classification Report:")
# print(decision_tree_classificationreport)

In [41]:
# # Save the models to a file
# joblib.dump(linear_svc, 'linear_svc_model.joblib')
# joblib.dump(naive_bayes, 'naive_bayes_model.joblib')
# joblib.dump(logistic_regression, 'logistic_regression_model.joblib')

In [42]:
# loaded_model = joblib.load('naive_bayes_model.joblib')
# Input=input("Enter an input  : ")
# input_vector = tf_vec.transform([processing(Input)])
# predicted_rating = loaded_model.predict(input_vector)[0]
# print("Predicted Rating:", predicted_rating)

In [43]:
# loaded_model = joblib.load('linear_svc_model.joblib')
# Input=input("Enter an input  : ")
# input_vector = tf_vec.transform([processing(Input)])
# predicted_rating = loaded_model.predict(input_vector)[0]
# print("Predicted Rating:", predicted_rating)

In [44]:
# loaded_model = joblib.load('logistic_regression_model.joblib')
# Input=input("Enter an input  : ")
# input_vector = tf_vec.transform([processing(Input)])
# predicted_rating = loaded_model.predict(input_vector)[0]
# print("Predicted Rating:", predicted_ratin  g)