In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK corpora used below: the stopword list and WordNet
# (WordNet backs the textblob lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

file_path = "/content/Restaurant_Reviews 1.tsv"  # Adjust path as needed
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews stay literal text.
data = pd.read_csv(file_path, delimiter='\t', quoting=3)

# Load the stopword list ONCE and keep it as a set. Calling
# stopwords.words('english') inside the per-word comprehension rebuilt the
# list and scanned it linearly for every single token; a set gives the same
# membership answers with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# Adding a column that lists the stopwords found in each review.
# NOTE(review): matching runs on the raw (not lowercased) text, so it is
# case-sensitive; capitalized tokens such as "The" are presumably not
# counted -- confirm whether that is intended.
data['Stopwords'] = data['Review'].apply(
    lambda review: ' '.join(
        word for word in review.split() if word in english_stopwords
    )
)

# Cleaning the data
data['Cleaned Data'] = data['Review'].str.lower()  # Lowercase all text
# Every non-letter (digits, punctuation) becomes a space.
data['Cleaned Data'] = data['Cleaned Data'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Drop stopwords; split()/join also collapses the runs of spaces left above.
data['Cleaned Data'] = data['Cleaned Data'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word not in english_stopwords
    )
)
# Lemmatize each remaining token with textblob's Word.lemmatize().
data['Cleaned Data'] = data['Cleaned Data'].apply(
    lambda text: ' '.join(Word(word).lemmatize() for word in text.split())
)

# Vectorizing: bag-of-words counts, capped at 2000 features.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary is built from test reviews as well.
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(data['Cleaned Data']).toarray()
Y = data['Liked']

# 80/20 split; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Training the model
model = LogisticRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Adding a column for analysis.
# NOTE(review): this maps the ground-truth 'Liked' label to text; it does
# not contain the model's predictions.
data['Analysis'] = data['Liked'].apply(lambda x: 'Positive' if x == 1 else 'Negative')

# Saving to Excel
output_file = "Results.xlsx"  # Output file name
data.to_excel(output_file, index=False)

# Printing accuracy (on the held-out 20%) and confirmation
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy of model: {accuracy * 100:.2f}%")
print(f"Results saved as {output_file}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Accuracy of model: 76.50%
Results saved as Results.xlsx
