In [1]:
!pip install -q numpy pandas scikit-learn nltk
# Install the Python packages needed for training and running the demo.
# - numpy, pandas: data handling
# - scikit-learn: vectorizer & classifier
# - nltk: tokenization, stopwords, preprocessing helpers


In [2]:
# Import libraries and download NLTK corpora needed for tokenization and stopwords.
# Keep downloads in the notebook so it runs on a fresh Colab instance.
import os, pickle, re, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
import nltk
# Download required NLTK data (punkt for tokenization, stopwords for filtering)
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from google.colab import files
uploaded = files.upload()
# Opens a file upload dialog in Colab so you can upload 'Restaurant_Reviews.tsv' from your machine.
# The uploaded file will be available in the Colab working directory.
# After running, select the tsv file to upload.


Saving Restaurant_Reviews.tsv to Restaurant_Reviews (1).tsv


In [4]:
# Read the uploaded TSV file into a pandas DataFrame; tab ('\t') as the delimiter.
# This expects a file named 'Restaurant_Reviews.tsv' uploaded in the previous cell.
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')
# Show a quick preview to confirm the data loaded correctly.
df.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Prepare stopword set and stemmer for text normalization.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
# Preprocess function:
# 1. Keep only letters (remove numbers/punctuation).
# 2. Tokenize into words.
# 3. Lowercase, remove stopwords and 1-letter tokens.
# 4. Apply Porter stemming.
def preprocess(text):
# Replace non-letter characters with spaces
    text = re.sub('[^a-zA-Z]', ' ', str(text))
    # Tokenize into words using NLTK
    tokens = nltk.word_tokenize(text.lower())
    # Remove stopwords and short tokens, then stem
    tokens = [ps.stem(w) for w in tokens if w not in stop_words and len(w) > 1]
    # Return cleaned text as a single string
    return ' '.join(tokens)
    # Apply preprocessing and ensure label column is integer
df['clean_review'] = df['Review'].apply(preprocess)
df['Liked'] = df['Liked'].astype(int)


In [7]:
# Convert cleaned text to TF-IDF features.
# max_features limits vocabulary size for speed and to avoid noisy rare words.
tfv = TfidfVectorizer(max_features=5000)
X_tfidf = tfv.fit_transform(df['clean_review']).toarray()
# Labels
y = df['Liked'].values
# Split into train/test sets. random_state ensures reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [8]:
# Train a Logistic Regression classifier on the TF-IDF features.
# LogisticRegression provides predict_proba which we use for confidence scores.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predict on the test set and print standard evaluation metrics.
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
if hasattr(model,'predict_proba'):
    print('Sample predict_proba on first test row:', model.predict_proba(X_test[:1]).tolist())


Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.68      0.85      0.76        96
           1       0.82      0.63      0.72       104

    accuracy                           0.74       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.76      0.74      0.74       200

Sample predict_proba on first test row: [[0.642460775721672, 0.357539224278328]]


In [9]:
# Persist the trained model and vectorizer so they can be used in the Flask app.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfv, f)
    # Optional: download the files to your machine from Colab
from google.colab import files as gfiles
gfiles.download('model.pkl')
gfiles.download('vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Define keywords for each aspect; this is a simple rule-based aspect matcher.
aspects_keywords = {
    'food': ['food','taste','flavor','dish','meal'],
    'service': ['service','staff','waiter','waitress','server'],
    'speed': ['quick','slow','speed','time','wait'],
    'hygiene': ['hygiene','clean','dirty','sanitary','unclean'],
    'ambience': ['ambience','ambiance','atmosphere','music','decor'],
    'price': ['price','cost','expensive','cheap','value']
}

# Extract sentences that mention an aspect, then use the trained model to classify
# each sentence's sentiment. This avoids using pre-trained sentiment lexicons.
def extract_aspects_using_model(text):
    sentences = nltk.sent_tokenize(text)
    found = {}
    for sent in sentences:
        sent_lower = sent.lower()
        for asp,kws in aspects_keywords.items():
            for kw in kws:
                if kw in sent_lower:
                   # Preprocess the sentence and vectorize using the TF-IDF fitted above
                    t = preprocess(sent)
                    v = tfv.transform([t]).toarray()
                     # Use the trained model to predict label and probability
                    pred = int(model.predict(v)[0])
                    prob = model.predict_proba(v).max() if hasattr(model,'predict_proba') else None
                    if asp not in found:
                        found[asp] = []
                    found[asp].append({'sentence': sent, 'pred': int(pred), 'probability': float(prob) if prob is not None else None})
    return found
    # Quick example to verify aspect extraction works
print(extract_aspects_using_model('The food was great but the service was slow and the place was dirty.'))


{'food': [{'sentence': 'The food was great but the service was slow and the place was dirty.', 'pred': 1, 'probability': 0.5739382211772909}], 'service': [{'sentence': 'The food was great but the service was slow and the place was dirty.', 'pred': 1, 'probability': 0.5739382211772909}], 'speed': [{'sentence': 'The food was great but the service was slow and the place was dirty.', 'pred': 1, 'probability': 0.5739382211772909}], 'hygiene': [{'sentence': 'The food was great but the service was slow and the place was dirty.', 'pred': 1, 'probability': 0.5739382211772909}]}


In [11]:
# Wrapper that predicts overall review sentiment and extracts aspect-level sentiment.
def predict_review_full(text):
  # Wrapper that predicts overall review sentiment and extracts aspect-level sentiment.
    t = preprocess(text)
    v = tfv.transform([t]).toarray()
    pred = int(model.predict(v)[0])
    prob = model.predict_proba(v).max() if hasattr(model,'predict_proba') else None
 # Extract per-aspect sentence-level labels using the same trained model
    aspects = extract_aspects_using_model(text)
    return {'prediction':pred,'probability':float(prob) if prob is not None else None,'aspects':aspects}
    # Quick test
print(predict_review_full('Not tasty and the texture was just nasty'))


{'prediction': 0, 'probability': 0.5640681000503404, 'aspects': {}}
