# _*Exploration and Preparation*_

In [3]:
import textblob as tb
from wordcloud import WordCloud
import re 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

### Description

##### The core dataset contains 50,000 reviews of movies from IMDB split evenly into 25k train and 25k test sets. The overall distribution of labels is balanced (25k pos and 25k neg)
##### In the entire collection, no more than 30 reviews are allowed for any given movie because reviews for the same movie tend to have correlated ratings. Further, the train and test sets contain a disjoint set of movies, so no significant performance is obtained by memorizing movie-unique terms and their associated with observed labels.  In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10. Thus reviews with more neutral ratings are not included in the train/test sets.

### Exploration 

In [4]:
train_data = pd.read_csv('Train_reviews.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24984 entries, 0 to 24983
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   24984 non-null  int64 
 1   Text_Review  24984 non-null  object
 2   Sentiment    24984 non-null  object
dtypes: int64(1), object(2)
memory usage: 585.7+ KB


In [5]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text_Review,Sentiment
0,0,['Bromwell High is a cartoon comedy. It ran at...,positive
1,1,['Homelessness (or Houselessness as George Car...,positive
2,2,['Brilliant over-acting by Lesley Ann Warren. ...,positive
3,3,['This is easily the most underrated film inn ...,positive
4,4,['This is not the typical Mel Brooks film. It ...,positive


In [6]:
train_data = train_data.drop('Unnamed: 0',axis=1)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24984 entries, 0 to 24983
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text_Review  24984 non-null  object
 1   Sentiment    24984 non-null  object
dtypes: object(2)
memory usage: 390.5+ KB


In [7]:
a = train_data.sample(10)
a

Unnamed: 0,Text_Review,Sentiment
17368,['The tragedy of the doomed ship Titanic has i...,negative
10031,"[""I love Ashley Judd and think all of her movi...",positive
4355,"[""This film takes you on one family's impossib...",positive
2251,['I like the movie. Twisted Desire had Jeremy ...,positive
18369,"['From a plot and movement standpoint, this mo...",negative
12314,['There are very few movies that are so funny ...,positive
3133,['Although the beginning of the movie in New Y...,positive
23198,['Ostensibly this is a Z-grade DTV horror film...,negative
13632,"[""Anyone who visited drive-ins in the 1950s, 6...",negative
22722,['CREEPSHOW 2 is the ill-fated sequel to the G...,negative


In [8]:
j=1
for i,k in zip(a.Text_Review,a.Sentiment):
    print(f'{j}) {(i)} - {k}')
    j+=1

2) ["I love Ashley Judd and think all of her movies are great. Ruby<br /><br />in Paradise is one of her best. It is a very understated movie that you really have to watch close to appreciate it. A story of a woman trying to make it on her own and refusing to give in to temptations that would make her life easy. Some of her movies such as Kiss The Girls and Time to Kill probably did better at the box office and video rentals. They were very good movies<br /><br />also, but take the time to really look at Ruby and I think you will agree it is one of Ashley's Best.<br /><br />"] - positive
3) ["This film takes you on one family's impossible journey, and makes you feel every step of their odyssey. Beautifully acted and photographed, heartbreakingly real. Its last line, with its wistful hope, is one of the more powerful in memory."] - positive
4) ['I like the movie. Twisted Desire had Jeremy Jordan,one of my favorite and one of the cutest actors ever. Melissa Joan Hart is a good actress. I

###  Observation for Cleaning 
##### The only thing seen from the ten random sqmples that had to be cleaned is the \<br \/> and also the [ ] and " " 

## Cleaning 

In [9]:
def clean_txt(text):
    text = re.sub('<br />','',text)
    text = text.replace('["',"")
    text = text.replace('"]',"")
    text = text.replace("['","")
    text = text.replace("']","")
    text = text.lower()
    
    
    # text = re.sub(r'"','',text)
    # text = re.sub(r'"','',text)
    # text = re.sub(r'[','',text)
    # text = re.sub(r']','',text)
    return text

In [10]:
train_data.Text_Review = train_data['Text_Review'].apply(clean_txt)
a = train_data.sample(10)

In [11]:
a

Unnamed: 0,Text_Review,Sentiment
17492,i saw this film in its premier week in 1975. i...,negative
13923,sheesh! what a dreadful movie. dodgy camera wo...,negative
9831,"it's not well shot, well written or well acted...",positive
7474,"yes, the cameras were in the right place at th...",positive
1074,"i knew about but had never seen grey gardens, ...",positive
18946,"that\'s right! under 9 on average, but maybe u...",negative
24921,after watching about half of this movie i noti...,negative
18126,"damn, i thought i'd seen some bad westerns. ca...",negative
15699,this film is not funny. it is not entertaining...,negative
4209,the invisible ray is an excellent display of b...,positive


### Feature Extraction 

In [12]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\HRITHIK
[nltk_data]     REDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
eng_stopwords = set(stopwords.words('english'))

In [17]:
def feature_extract(text):
    # tokenize 
    tokens = tokenizer.tokenize(text)
    # Stop word Removal
    new_tokens = [token for token in tokens if token not in eng_stopwords]
    # Stemming 
    stemmed_tokens =[stemmer.stem(i) for i in new_tokens]
    clean_sen = " ".join(stemmed_tokens)
    return clean_sen 

In [18]:
a = train_data['Text_Review'].apply(feature_extract)
a

0        bromwel  high  cartoon  comedi  ran  time  pro...
1        homeless  houseless  georg  carlin  state  iss...
2        brilliant  act  lesley  ann  warren  best  dra...
3        easili  underr  film  inn  brook  cannon  sure...
4        typic  mel  brook  film  much  less  slapstick...
                               ...                        
24979    toward  end  movi  felt  technic  felt  like  ...
24980    kind  movi  enemi  content  watch  time  blood...
24981    saw  descent  last  night  stockholm  film  fe...
24982    film  pick  pound  turn  rather  good  23rd  c...
24983    one  dumbest  film  ever  seen  rip  nearli  e...
Name: Text_Review, Length: 24984, dtype: object

#### Feature Engineering

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,6))

In [24]:
b = cv.fit_transform(a)

In [49]:
train_data['vector_text'] = cv.transform(a).todense

In [50]:
train_data['vector_text'][1]

<bound method spmatrix.todense of <24984x1591107 sparse matrix of type '<class 'numpy.int64'>'
	with 5283118 stored elements in Compressed Sparse Row format>>

In [44]:
class_encode = {'positive':1,'negative':0}
X_train = train_data['vector_text']
y_train = train_data['Sentiment'].apply(lambda x: class_encode[x])

In [47]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()

In [48]:
MNB.fit(X_train,y_train)

TypeError: float() argument must be a string or a number, not 'method'