In [1]:
import re
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Restore the trained model and its fitted vectorizer from disk, in that order.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load artifacts that come from a trusted source.
model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ('model/model.pkl', 'model/vectorizer.pkl')
)


In [5]:
# Lemmatizer used to reduce words to their dictionary form (lemma).
# NOTE(review): preprocess_text calls lemmatize() without a `pos` argument,
# and NLTK defaults that to noun, so plurals collapse ("cats" -> "cat") but
# verb forms such as "loving" or "hated" are likely left unchanged -- confirm.
lemmatizer = WordNetLemmatizer()

# English stopwords from NLTK's corpus, stored as a set for fast membership
# tests. Stopwords are very common words (e.g. "a", "the", "in") that are
# filtered out because they carry little meaning on their own.
stop_words = {word for word in stopwords.words('english')}

In [6]:
def preprocess_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, delete runs of digits, delete every
    character that is neither a word character nor whitespace, tokenize,
    discard tokens found in ``stop_words``, and lemmatize what remains.

    NOTE(review): presumably this has to mirror the preprocessing applied
    when the vectorizer was fitted -- confirm before changing any step.

    Args:
        text: The raw review string.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Lowercase first so the same word in different cases maps to one token.
    lowered = text.lower()
    # Digits and punctuation are deleted outright (not replaced by a space),
    # so "well-made" becomes "wellmade".
    without_digits = re.sub(r'\d+', '', lowered)
    cleaned = re.sub(r'[^\w\s]', '', without_digits)

    # Stopwords are filtered on the raw token, before lemmatization.
    kept_lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

In [7]:
def predict_rating(review):
    """Predict the rating for a single raw review string.

    The review is cleaned with ``preprocess_text``, converted to a one-row
    feature matrix by the module-level ``vectorizer`` (densified with
    ``toarray``), and passed to the module-level ``model``.

    Args:
        review: The raw review text.

    Returns:
        Whatever ``model.predict`` returns for that one-row input; the
        recorded notebook output shows a one-element array such as ``[5]``.
    """
    cleaned = preprocess_text(review)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features.toarray())

In [8]:
# Sample reviews used to smoke-test the prediction pipeline.
singleReview = [
    'The cloth was a great fit. I looked very good, and i felt comfortable.',
]
reviewSamples = [
    'it did not fit me at all. Absolutely hated it.',
    'it was satisfactory.',
]

In [9]:
# Print the predicted rating for each sample review, one per line.
# NOTE(review): the recorded output rates the clearly negative first sample
# as 5 -- worth checking whether stopword removal (which may drop "not") or
# the model itself is responsible.
for sample_review in reviewSamples:
    rating = predict_rating(sample_review)
    print(rating)

[5]
[3]
