In [1]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report



In [2]:
# Load the labelled reviews; 'Dataset.csv' is resolved relative to the current working directory.
df = pd.read_csv('Dataset.csv')


In [3]:
# Preview the first four rows to check the columns that were loaded.
df.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [4]:
# Count missing values per column before any further processing.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Encode the text labels as integers for the classifier: positive -> 1, negative -> 0.
sentiment_mapping = {'positive': 1, 'negative': 0}
df['sentiment_numeric'] = df['sentiment'].map(sentiment_mapping)

# Series.map() yields NaN for any label that is missing from the mapping
# (different casing, stray whitespace, a third class, ...). Fail loudly here
# instead of letting NaN targets reach the model.
unmapped_labels = df.loc[df['sentiment_numeric'].isna(), 'sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected sentiment labels: {list(unmapped_labels)}")


In [6]:
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly; a bare `df.shape` above `df.head(4)` is silently discarded.
print(df.shape)
# Preview the frame again, now including the `sentiment_numeric` column.
df.head(4)

Unnamed: 0,review,sentiment,sentiment_numeric
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0


In [7]:
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is used.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jamshaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Built once when the cell runs instead of on every call (and, for the stop-word
# set, on every single word).
_STEMMER = PorterStemmer()
# "not" is deliberately kept out of the stop-word set, presumably because
# negation changes the sentiment of a phrase ("not good").
_STOPWORDS = frozenset(stopwords.words('english')) - {'not'}


def preprocess_review(review):
    """Normalise one raw review into a string of stemmed, space-separated tokens.

    Steps, in order: lowercase the text, drop HTML line-break tags, replace
    every non-letter with a space, split on whitespace, discard English stop
    words (except "not"), and Porter-stem the remaining words.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when no
        token survives.

    NOTE(review): nothing in the visible notebook calls this function; the
    TF-IDF features are built from the raw `df['review']` text. If it is wired
    in, the same preprocessing must also be applied to text at prediction time.
    """
    review = review.lower()
    # The reviews contain "<br />" markup; without this step the letters-only
    # filter below would turn each tag into a meaningless "br" token.
    review = re.sub(r'<br\s*/?>', ' ', review)
    # Replace anything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', review)
    return ' '.join(
        _STEMMER.stem(word) for word in review.split() if word not in _STOPWORDS
    )


In [9]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['review'])
y = df['sentiment_numeric']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [11]:
# Train a logistic-regression classifier with default hyperparameters on the training features.
model = LogisticRegression()
model.fit(X_train, y_train)



In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Predict labels for the held-out test features.
y_pred = model.predict(X_test)
# Cross-tabulate the true test labels against the predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the notebook displays the accuracy as the cell output.
accuracy_score(y_test, y_pred)

[[4448  587]
 [ 498 4467]]


0.8915

In [13]:
# Per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5035
           1       0.88      0.90      0.89      4965

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [14]:
# Prediction
def predict_sentiment(review, vectorizer, model):
    """Classify a single review and report the model's confidence in that label.

    Parameters
    ----------
    review : str
        Raw review text.
    vectorizer
        Fitted object exposing `transform(list_of_documents)`.
    model
        Fitted classifier exposing `predict`, `predict_proba` and `classes_`.

    Returns
    -------
    tuple
        `(sentiment, probability)`: the predicted label and the probability
        the model assigns to *that predicted label* (not necessarily to the
        positive class).
    """
    review_tfidf = vectorizer.transform([review])
    sentiment = model.predict(review_tfidf)[0]
    # Query the probabilities once instead of re-running the model per branch.
    class_probabilities = model.predict_proba(review_tfidf)[0]
    # predict_proba columns follow the order of `model.classes_`; look the
    # predicted label up there rather than assuming label 1 sits in column 1.
    predicted_column = list(model.classes_).index(sentiment)
    probability = class_probabilities[predicted_column]
    return sentiment, probability





# Sample review used to demonstrate predict_sentiment.
# NOTE(review): this text starts with the same sentence as the row at index 1
# shown by df.head(4), so it may be data the model was trained on rather than
# an unseen review -- confirm.
new_review =   " A wonderful little production.The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only has got all the polari but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done."
sentiment, probability = predict_sentiment(new_review, tfidf_vectorizer, model)
# `probability` refers to the predicted class, so it is flipped for a negative
# prediction to keep reporting the chance that the customer is happy.
if sentiment != 1:
    print("NEGATIVE REVIEW. Probability of customer being happy:", (1 - probability) * 100, "%")
else:
    print("POSITIVE REVIEW. Probability of customer being happy:", probability * 100, "%")


POSITIVE REVIEW. Probability of customer being happy: 93.0761166069932 %
