In [1]:
import numpy as np 
import pandas as pd

In [2]:
imdb_dataset = pd.read_csv('IMDB Dataset.csv', nrows=1000)
imdb_dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_dataset['sentiment'].value_counts()    

sentiment
positive    501
negative    499
Name: count, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imdb_dataset['review'], imdb_dataset['sentiment'], test_size=0.2,random_state=0)

In [5]:
X_train.shape

(800,)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

#Count vectorizer for bag of words
cv=CountVectorizer()
#transformed train reviews
cv_train_reviews=cv.fit_transform(X_train)
#transformed test reviews
cv_test_reviews=cv.transform(X_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (800, 15844)
BOW_cv_test: (200, 15844)


In [7]:
cv_train_reviews[0]

<1x15844 sparse matrix of type '<class 'numpy.int64'>'
	with 106 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LogisticRegression

#training the model
lr=LogisticRegression()
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,y_train)
print(lr_bow)

LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

lr_bow_predict=lr.predict(cv_test_reviews)

#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)

print(lr_bow_score)

0.795


In [10]:
#Classification report for bag of words 
lr_bow_report=classification_report(y_test,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

    Positive       0.84      0.79      0.82       115
    Negative       0.74      0.80      0.77        85

    accuracy                           0.80       200
   macro avg       0.79      0.80      0.79       200
weighted avg       0.80      0.80      0.80       200



In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [12]:
import nltk  
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
#Tokenization of text
tokenizer=ToktokTokenizer()

#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')
print(len(stopword_list))
print(stopword_list)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [14]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
    #Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(remove_stopwords)
imdb_dataset.head()

{'s', 'do', 'are', 'should', 'them', "mustn't", 'a', 'has', "it's", 'll', 'ours', 'themselves', 'any', 'yours', 'm', 'itself', 'or', 'ourselves', 'did', 'shan', 'some', 'both', "needn't", 'here', 'this', 'was', 'through', 'more', 'further', 'he', 'theirs', 'does', 'because', 'wasn', "aren't", 'won', 'few', "mightn't", 'such', 'can', 'where', 'there', "should've", 'your', 've', 'up', 'doesn', 're', 'me', 'y', 'hadn', 'yourselves', 'weren', "won't", 'own', 'my', 'while', 'once', 'she', "you've", 'not', "isn't", 'hasn', "shan't", 'what', 'who', 'don', 'doing', 'then', "haven't", 'each', "hadn't", 'o', 'why', 'their', 'than', 'that', "shouldn't", 'is', 'after', 'am', 'mustn', "that'll", 'we', "you'll", 'will', 'you', 't', 'have', 'having', 'been', 'to', 'before', 'his', 'had', 'from', 'being', 'it', 'haven', 'into', 'above', 'as', 'be', 'ma', 'very', 'ain', 'no', 'couldn', 'against', 'if', 'whom', 'so', 'these', 'the', 'myself', "she's", 'in', 'only', 'just', 'herself', 'yourself', 'during

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production. <br / ><br / >The...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically ' family little boy ( Jake ) thinks ...,negative
4,"Petter Mattei ' "" Love Time Money "" visually s...",positive


In [15]:
from bs4 import BeautifulSoup
import re

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(denoise_text)
imdb_dataset.head()

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production. The filming techn...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically ' family little boy ( Jake ) thinks ...,negative
4,"Petter Mattei ' "" Love Time Money "" visually s...",positive


In [16]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(remove_special_characters)

In [17]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_dataset['review']=imdb_dataset['review'].apply(simple_stemmer)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imdb_dataset['review'], imdb_dataset['sentiment'], test_size=0.2,random_state=0)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Sử dụng TF-IDF Vectorizer để chuyển đổi văn bản thành ma trận TF-IDF
tfidf_vectorizer = TfidfVectorizer()
#transformed train reviews
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
#transformed test reviews
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print('TF-IDF_cv_train:',X_train_tfidf.shape)
print('TF-IDF_cv_test:',X_test_tfidf.shape)

TF-IDF_cv_train: (800, 13277)
TF-IDF_cv_test: (200, 13277)


In [20]:
#training the model
lr_preprocessed=LogisticRegression()
#Fitting the model for Bag of words
lr_bow=lr_preprocessed.fit(X_train_tfidf,y_train)
print(lr_bow)

LogisticRegression()


In [21]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

lr_bow_predict=lr_preprocessed.predict(X_test_tfidf)

#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)

print(lr_bow_score)

0.785


In [22]:
from sklearn.naive_bayes import MultinomialNB
# Mô hình Naive Bayes cho phân loại cảm xúc
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

In [23]:
nb_classifier_predict=nb_classifier.predict(X_test_tfidf)

#Accuracy score for bag of words
nb_classifier_score=accuracy_score(y_test,nb_classifier_predict)

print(nb_classifier_score)

0.765


In [24]:
from sklearn.svm import SVC
# Xây dựng mô hình SVM
svm_model = SVC(kernel='linear')  # Chọn kernel tùy thuộc vào bài toán, 'linear' là một lựa chọn phổ biến
svm_model.fit(X_train_tfidf, y_train)

In [25]:
# Dự đoán trên tập kiểm thử
y_pred = svm_model.predict(X_test_tfidf)
# Đánh giá mô hình
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.79


In [26]:
# Biến đổi đoạn văn bản thành ma trận TF-IDF sử dụng vectorizer đã được fit trước đó
review_text = ["good movie"]
review_tfidf = tfidf_vectorizer.transform(review_text)

# Dự đoán cảm xúc
predicted_sentiment = svm_model.predict(review_tfidf)

# In kết quả
print("Predicted Sentiment:", predicted_sentiment[0])

Predicted Sentiment: positive


In [34]:
from sklearn.pipeline import Pipeline
clf = Pipeline([('tfidf_vectorizer',TfidfVectorizer()), ('svm_model', SVC(kernel='linear'))])
clf.fit(X_train, y_train)                                                    

In [35]:
# Dự đoán trên tập kiểm thử
y_pred = clf.predict(X_test)
# Đánh giá mô hình
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.79


In [36]:
clf.predict(['good movie'])

array(['positive'], dtype=object)

In [37]:
clf.classes_

array(['negative', 'positive'], dtype=object)

In [38]:
import pickle 
pickle.dump(clf, open('mh.pkl','wb'))

In [39]:
model = pickle.load(open('mh.pkl','rb')) 

In [40]:
model.predict(['good movie'])

array(['positive'], dtype=object)