# Importing dataset

In [1]:
import glob
import pandas as pd

In [2]:
def load_data(data_path):
    # Import data
    text_list = []
    rating_list = []
    for filename in data_path:
        text = open(filename, encoding="utf8").read() #Get the text
        filename = filename.strip(r'Movie_ratings/train/pos\\train').strip(".txt")
        rating = filename.split('_')[1] #Get the rating
        text_list.append(text)
        rating_list.append(rating)
    df = pd.DataFrame({"Rating":rating_list,"Review":text_list})

    return df

In [3]:
# Define data paths
pos_train = glob.glob("Movie_ratings/train/pos/*.txt")
neg_train = glob.glob("Movie_ratings/train/neg/*.txt")
pos_test = glob.glob("Movie_ratings/test/pos/*.txt")
neg_test = glob.glob("Movie_ratings/test/neg/*.txt")

pos_train_df = load_data(pos_train)
neg_train_df = load_data(neg_train)
pos_test_df = load_data(pos_test)
neg_testdf = load_data(neg_test)

In [4]:
neg_train_df.head()

Unnamed: 0,Rating,Review
0,3,Story of a man who has unnatural feelings for ...
1,4,Airport '77 starts as a brand new luxury 747 p...
2,4,This film lacked something I couldn't put my f...
3,1,"Sorry everyone,,, I know this is supposed to b..."
4,1,When I was little my parents took me along to ...


In [5]:
# Combining train data
train_df = pd.concat([pos_train_df, neg_train_df])

# Shuffling the train data
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df

Unnamed: 0,Rating,Review
0,4,This movie was disappointing. It was incomplet...
1,10,I was up late flipping cable channels one nigh...
2,9,"For starters, it's a very funny movie with a f..."
3,1,This Film was done in really poor taste. The s...
4,7,Simon Pegg plays a rude crude and often out of...
...,...,...
24995,2,Absolutely putrid slasher film has not one red...
24996,9,You'll notice by the stars I've given this GRE...
24997,1,"I saw Chan Is Missing when it first came out, ..."
24998,4,"This, the direct-to-video death rattle of the ..."


In [6]:
# Combining test data
test_df = pd.concat([pos_test_df, neg_testdf])

# Shuffling the test data
test_df = test_df.sample(frac=1).reset_index(drop=True)
test_df

Unnamed: 0,Rating,Review
0,10,The thing with Ali G is that he takes the mick...
1,1,It was inferred by a previous poster that the ...
2,4,This is defiantly a DVD rental movie. I'm a bi...
3,8,"This is a beautiful, funny, vivid film. It's e..."
4,3,The first twenty-five minutes stand out as pos...
...,...,...
24995,8,"Last night, I saw A PECK ON THE CHEEK (KANNATH..."
24996,10,"""Müllers Büro"" is a movie which many will watc..."
24997,2,I've heard people compare this movie to Sidewa...
24998,8,I'm not sure what intrigues me about this movi...


# Data Cleaning

### Creating a function to clean the text
1. Lowers the text
2. Removes punctuations and alphanumeric words
3. Removes stopwords eg: the, is, a etc
4. Lematises the text
5. remove one letter words`

In [8]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def clean_text(text):
    text = text.lower() # lower text

    text = [word.strip(string.punctuation) for word in text.split(" ")]     # tokenize text and remove puncutation

    text = [word for word in text if not any(c.isdigit() for c in word)]     # remove words that contain numbers

    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]     # remove stop words

    text = [t for t in text if len(t) > 0]     # remove empty tokens

    pos_tags = pos_tag(text)     # pos tag text

    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]     # lemmatise text

    text = [t for t in text if len(t) > 1]     # remove words with only one letter

    text = " ".join(text)     # join all
    return(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Testing cleaning data

In [9]:
text = train_df.Review[3]
print("ORIGINAL TEXT: ",text)
print()
print("CLEANED TEXT: ",clean_text(text))

ORIGINAL TEXT:  This Film was done in really poor taste. The script was really bad. I feel really sad for the late Gregory Peck who took on the title role of this B-movie adaptation of one of history's greatest generals. The movie was politically incorrect and downright insensitive to the others who fought the Japanese in World War 2. There was a scene where I almost vomited, it showed Macarthur in a bunker in Corregidor island talking to the troops like a seasoned politician when he comes across a wounded, one legged Filipino soldier. The soldier bleeding and dying manages to sit up straight upon seeing the general and says : `no papa, no mama, no uncle sam' and Macarthur gives his little pep talk that Americans `would never abandon' the Philippines. The scene ends with the soldier being invigorated by Macarthur's words and gives him a smart salute. I mean if there was a more condescending scene portraying the U.S. as the great white savior of the world please tell me because this one

In [10]:
text = train_df.Review[15]
print("ORIGINAL TEXT: ",text)
print()
print("CLEANED TEXT: ",clean_text(text))

ORIGINAL TEXT:  I like movies about morally corrupt characters, but this was too much. The acting wasn't great, but that wasn't the real problem. The issue was the sinking feeling I got in the pit of my stomach about 20 minutes into the film. These characters were hollow. They had almost no depth, and what little they did have was devoted to the cruelty they displayed to each other in the guise of friendship. Exploring the darker sides of a set of characters can be fascinating, but you have to give those characters actual personalities or they are just cardboard cutouts. These characters were cardboard and the picture they gave was just ugly.

CLEANED TEXT:  like movie morally corrupt character much act great real problem issue sink feel get pit stomach minute film character hollow almost depth little devote cruelty display guise friendship explore darker side set character fascinate give character actual personality cardboard cutouts character cardboard picture give ugly


### Cleaning all the train and test text

In [11]:
train_df["Review"] = train_df["Review"].apply(clean_text)

In [12]:
train_df

Unnamed: 0,Rating,Review
0,4,movie disappointing incomplete dull alec baldw...
1,10,late flip cable channel one night run movie mi...
2,9,starter funny movie crazy character run around...
3,1,film do really poor taste script really bad fe...
4,7,simon pegg play rude crude often control celeb...
...,...,...
24995,2,absolutely putrid slasher film one redeeming q...
24996,9,notice star i've give great film see first tim...
24997,1,saw chan miss first come four year move san fr...
24998,4,direct-to-video death rattle tremor series fea...


In [13]:
test_df["Review"] = test_df["Review"].apply(clean_text)

In [14]:
test_df

Unnamed: 0,Rating,Review
0,10,thing ali take mick character br br humour muc...
1,1,infer previous poster military would subordina...
2,4,defiantly dvd rental movie i'm big fan cast me...
3,8,beautiful funny vivid film even well nuovo cin...
4,3,first twenty-five minute stand possibly worst ...
...,...,...
24995,8,last night saw peck cheek kannathil muthamitta...
24996,10,müllers büro movie many watch enjoy end others...
24997,2,i've heard people compare movie sideways compa...
24998,8,i'm sure intrigue movie grainy poorly write bl...


### Combining data

In [15]:
reviews_df = pd.concat([train_df, test_df])

In [16]:
reviews_df

Unnamed: 0,Rating,Review
0,4,movie disappointing incomplete dull alec baldw...
1,10,late flip cable channel one night run movie mi...
2,9,starter funny movie crazy character run around...
3,1,film do really poor taste script really bad fe...
4,7,simon pegg play rude crude often control celeb...
...,...,...
24995,8,last night saw peck cheek kannathil muthamitta...
24996,10,müllers büro movie many watch enjoy end others...
24997,2,i've heard people compare movie sideways compa...
24998,8,i'm sure intrigue movie grainy poorly write bl...


Converting object to integer data type

In [17]:
reviews_df["Rating"] = reviews_df["Rating"].astype(int)

Lets use the extreme ends (Rating of 1 and 10)

In [18]:
reviews_df = reviews_df[(reviews_df['Rating'] == 1) | (reviews_df['Rating'] == 10)]

In [19]:
reviews_df.head()

Unnamed: 0,Rating,Review
1,10,late flip cable channel one night run movie mi...
3,1,film do really poor taste script really bad fe...
5,10,one would see rené clair film kind distract se...
11,10,movie awesome sort dosent really say much much...
13,10,watched movie showtime quite accident actually...


### Getting our data ready

In [20]:
# Shuffling the data
reviews_df = reviews_df.sample(frac=1).reset_index(drop=True)

In [21]:
X = reviews_df["Review"]
y = reviews_df["Rating"]

### Vectorize the text

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
x_train_matrix = CountVectorizer().fit(X)
X = x_train_matrix.fit_transform(X)

In [23]:
# Splitting our data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
X_train

<15882x57098 sparse matrix of type '<class 'numpy.int64'>'
	with 1345361 stored elements in Compressed Sparse Row format>

### Using the naive bayes classification to train the data

In [25]:
import time
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
start = time.time()
nb.fit(X_train, y_train)
end = time.time()
print("Time taken to train the model: {:.3g} seconds".format( (end-start)* 100) )

Time taken to train the model: 1.3 seconds


### Evaluate our data

In [26]:
nb.score(X_test,y_test)

0.9088390833543188

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, nb.predict(X_test)))

              precision    recall  f1-score   support

           1       0.89      0.94      0.91      2059
          10       0.93      0.88      0.90      1912

    accuracy                           0.91      3971
   macro avg       0.91      0.91      0.91      3971
weighted avg       0.91      0.91      0.91      3971



### Saving our model

In [29]:
# 6. Save a model and load it
import pickle
pickle.dump(nb, open("models/rating_predictor.pkl", "wb"))

In [30]:
import pickle
def rating_predictor(datapath):
    loaded_model = pickle.load(open(datapath, "rb"))
    text = input("Enter your review: ")
    start = time.time()
    predicted_rating = loaded_model.predict(x_train_matrix.transform([text]))[0]
    end = time.time()
    if predicted_rating == 1:
        print("\nYour review is : Negative")
    else:
        print("\nYour review is : Positive")
    print("\nTime taken to predict: {:.3g}".format( (end-start)*100 ) )

### Testing our model

In [31]:
rating_predictor("models/rating_predictor.pkl")

Enter your review: This movie is quite long

Your review is : Negative

Time taken to predict: 0.199


In [32]:
rating_predictor("models/rating_predictor.pkl")

Enter your review: Sputters to the finish line with its disjointed pacing, it's a damn good time with some dazzling fight scenes, dark humor, and a banner turn by Robbie.

Your review is : Negative

Time taken to predict: 0


In [None]:
rating_predictor("models/rating_predictor.pkl")