In [1]:
import warnings
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import string
stemmer = nltk.SnowballStemmer("english")
warnings.filterwarnings('ignore')
data=pd.read_csv('Data.csv')


In [2]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Cleaning
stopword = set(stopwords.words('english'))
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text


data["review"] = data["review"].apply(clean)


In [4]:
# Preparation for model
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
y=label.fit_transform(data['sentiment'])
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(data['review']).toarray()


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

log=LogisticRegression()
ber=BernoulliNB()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# A comparison of the two models showed that the LogisticRegression(0.8714) is better than the BernoulliNB(0.8412),
# so I decided to set up hyperparameters for LogisticRegression (0.8757) 
# but it's too much for fitting and tunning and difference is small
'''param_grid = {'penalty': ['l2'],  'C': [0.01 + i * 0.01 for i in range(10)]}
grid_search = GridSearchCV(
    LogisticRegression(), param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)'''

log.fit(X_train, y_train)
ber.fit(X_train, y_train)


y_log = log.predict(X_test)
y_ber=ber.predict(X_test)

log_accur=accuracy_score(y_test,y_log)
ber_accur=accuracy_score(y_test,y_ber)

In [6]:
# My results
log_accur,ber_accur

(0.8714, 0.8412)

In [7]:
def check(text):
    for_model = clean(text)
    for_model = cv.transform([for_model])
    y_pred = log.predict(for_model)
    y_pred = label.inverse_transform(y_pred)
    return y_pred[0]

text_to_predict = input('Enter your review or sentence: ')
print(f'Your text evaluation is {check(text_to_predict)}')


Your text evaluation is positive
