In [235]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [236]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [237]:
# read a dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/imdb master project/4_imdb_master.csv', encoding='ISO-8859-1')
data.shape

data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.drop(['file'], axis=1, inplace=True)
data.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


# data preprocessing

In [238]:
# check dataset
print(data.shape)
print(data.columns)
print(data['label'].value_counts())
print(data[data['type'] == 'train'].value_counts())

(100000, 3)
Index(['type', 'review', 'label'], dtype='object')
unsup    50000
neg      25000
pos      25000
Name: label, dtype: int64
type   review                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [239]:
data.info()
# check any null in dataset
data.isnull().values.any()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   type    100000 non-null  object
 1   review  100000 non-null  object
 2   label   100000 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


False

In [240]:
# cleaning the text
import re
def cleaning(sen):
  sen = re.sub('[^A-Za-z]+', ' ', sen)
  sen = re.sub(r"\s+[a-zA-Z]\s+", ' ',sen)
  sen = re.sub(r'\s+',' ',sen)
  sen = re.sub(r'<[^>]+>', ' ',sen)
  return sen

data['review'] = data['review'].apply(cleaning)
data.head()

Unnamed: 0,type,review,label
0,test,Once again Mr Costner has dragged out movie fo...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,First of all hate those moronic rappers who co...,neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures movies is not fitting word for ...,neg


In [241]:
from nltk.tokenize.toktok import ToktokTokenizer
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')


#set stopwords to english
stopword=set(stopwords.words('english'))
print(stopword)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
#Apply function on review column
data['review']=data['review'].apply(remove_stopwords)
data.head()

{'where', "you'd", 'which', 'was', 'him', 'off', 'other', 'having', 'can', 'myself', "doesn't", 'what', 'this', 'haven', 'being', 'my', 'both', 'y', 'why', 'up', 'does', 'until', 'hadn', 'shouldn', 'yours', 'll', 'then', "it's", 'is', "aren't", 'm', 'herself', 'than', 'how', 'or', 'do', 'some', 'a', 'too', 're', "needn't", 'but', 'don', 'itself', 'the', 'very', 'themselves', 'am', 'once', 'through', 'had', 'himself', 'while', 'here', 'when', 'isn', "she's", 'because', 'did', 'there', "you've", 'from', "shouldn't", 'again', 'ours', 'before', 'in', 'shan', 'same', 'wouldn', 'more', 'most', 'after', 'only', 'with', 'aren', 'over', 'yourself', 'out', 'who', "couldn't", 'doesn', 'its', 'further', 'as', 'if', 'hasn', 'no', "you'll", "that'll", 'each', 'by', 'should', 'you', 't', "you're", 'on', "don't", 'ain', 'your', 'those', 'own', "wouldn't", 'd', 'they', 'she', "haven't", 'were', 'their', 'now', 'between', 'he', 'yourselves', 'have', 'doing', 'ourselves', 'against', 'these', 'has', 'coul

Unnamed: 0,type,review,label
0,test,Mr Costner dragged movie far longer necessary ...,neg
1,test,example majority action films Generic boring r...,neg
2,test,First hate moronic rappers could nt act gun pr...,neg
3,test,even Beatles could write songs everyone liked ...,neg
4,test,Brass pictures movies fitting word really some...,neg


In [267]:
x = data['review']
y = data['label']
#y = np.array([1 if x=="pos" else (0 if x=="unsup" else -1) for x in y])

In [268]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(x,y, test_size=0.20, random_state=225)

In [273]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
tfidf = TfidfVectorizer(lowercase=True)

# Logistic Regression

In [274]:
from sklearn.linear_model import LogisticRegression
logicRe =LogisticRegression(solver='lbfgs', max_iter=3000)
lr = Pipeline([('vectorizer',tfidf),('classifier',logicRe)])
lr.fit(X_train,Y_train)
lrpred = lr.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy of Logistic Regression => ",accuracy_score(lrpred,Y_test)*100)

Accuracy of Logistic Regression =>  62.975


In [275]:
new_text = ["The comments already posted for this film do not do it justice. It is a very low budget sci-fi film made by lesser known people and should be judged that way. To say the film shouldn't be made is plain wrong. How do people learn without doing? Small pictures should be encouraged, not made fun of just because they are not of the highest quality."]
prediction = lr.predict(new_text)

# Displaying Results
for text, prediction_index in zip(new_text,prediction):
    if prediction_index == "pos":
        prediction_label = 1
    elif prediction_index == "unsup":
        prediction_label = 0
    else:
        prediction_label = -1

print("Predicted class:", prediction_label)

Predicted class: 0


In [276]:
new_text = ["This is an enjoyable movie. Its very realistic to the wonderful world of music I've been there and done that. It shows a human element in each character and the realism that nobody is perfect. These amateur musicians weren't all that bad players. Cleavon Little's character, Marshall Tucker, was played very well. Marshall was no saint himself. Here he was getting paid to do a job and he's giving these guys a hard time about everything in the van on the way up there. You don't bite the hands that feed you. I do find it hard to believe that a player with the jazz experience he has, claims he does not know any of the dixieland tunes. He has a tremendous sense of predicting chord changes to tunes he does not know. Not common, but not unheard of either. He delivers a true and harsh message at the end of the movie when he tells the clarinet player, its not a religion, devotion is not enough. On that level, he is correct, although I think the clarinet player could have handled the job. He was practicing his butt off and vocal accompaniment music is not that hard to read. Very enjoyable movie."]
prediction = lr.predict(new_text)

# Displaying Results
for text, prediction_index in zip(new_text,prediction):
    if prediction_index == "pos":
        prediction_label = 1
    elif prediction_index == "unsup":
        prediction_label = 0
    else:
        prediction_label = -1

print("Predicted class:", prediction_label)

Predicted class: 1
