# _*Exploration and Preparation*_

In [1]:
import textblob as tb
from wordcloud import WordCloud
import re 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 


### Description

##### The core dataset contains 50,000 reviews of movies from IMDB split evenly into 25k train and 25k test sets. The overall distribution of labels is balanced (25k pos and 25k neg)
##### In the entire collection, no more than 30 reviews are allowed for any given movie because reviews for the same movie tend to have correlated ratings. Further, the train and test sets contain a disjoint set of movies, so no significant performance is obtained by memorizing movie-unique terms and their associated with observed labels.  In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10. Thus reviews with more neutral ratings are not included in the train/test sets.

### Exploration 

In [2]:
train_data = pd.read_csv('Train_reviews.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24984 entries, 0 to 24983
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   24984 non-null  int64 
 1   Text_Review  24984 non-null  object
 2   Sentiment    24984 non-null  object
dtypes: int64(1), object(2)
memory usage: 585.7+ KB


In [3]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text_Review,Sentiment
0,0,['Bromwell High is a cartoon comedy. It ran at...,positive
1,1,['Homelessness (or Houselessness as George Car...,positive
2,2,['Brilliant over-acting by Lesley Ann Warren. ...,positive
3,3,['This is easily the most underrated film inn ...,positive
4,4,['This is not the typical Mel Brooks film. It ...,positive


In [4]:
train_data = train_data.drop('Unnamed: 0',axis=1)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24984 entries, 0 to 24983
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text_Review  24984 non-null  object
 1   Sentiment    24984 non-null  object
dtypes: object(2)
memory usage: 390.5+ KB


In [5]:
a = train_data.sample(10)
a

Unnamed: 0,Text_Review,Sentiment
24870,"[""Okay so i found out about this movie and I w...",negative
18766,"[""We went to the movie with a group because th...",negative
17569,"[""I saw this by accident one lazy summer after...",negative
20586,"[""Where to start, this movie started badly and...",negative
18103,"[""Err...this movie sucked. A LOT.<br /><br />I...",negative
2382,"[""Return To The 3th Chamber is the comedic seq...",positive
3341,"[""Only once in a while do we get an R-rated co...",positive
22101,"[""I think they really let the quality of the D...",negative
20370,"[""I enjoy quality crapness, and this ranks up ...",negative
5427,['This movie has several things going for it. ...,positive


In [6]:
j=1
for i,k in zip(a.Text_Review,a.Sentiment):
    print(f'{j}) {(i)} - {k}')
    j+=1

1) ["Okay so i found out about this movie and I watched the preview read almost all the reviews and was having a hard time debating whether I should watch it or not. Before i even watched the movie i was emotionally weird on it. i was so unsure if i was going to watch this and be disturbed for like a long time. So i choose to risk it and watched it and heres what i thought: The beginning started off fine for me. It seemed to be heading in a decent direction. Got past the rape scene and i couldn't figure why people were so disturbed or bored by the movie. Don't get me wrong the rape scene was just as sad and scary but it didn't really bother me to a dramatic point. Then as the middle came in i understood the boring stuff that was going on. There was like 5 minutes shots of nothing but people walking around saying or showing nothing! its one thing to have a shot where a person is showing some kind of emotion but this movie didn't have that. It had about 3 of these pointless scenes, where

###  Observation for Cleaning 
##### The only thing seen from the ten random sqmples that had to be cleaned is the \<br \/> and also the [ ] and " " 

## Cleaning 

In [7]:
def cln_txt(text):
    text = text[2:-2]
    text = re.sub('<br />','',text)
    text = text.lower()
    return text
train_data.Text_Review = train_data['Text_Review'].apply(cln_txt)
a = train_data.sample(10)
a

Unnamed: 0,Text_Review,Sentiment
10220,i really liked the film.at ending i was in tea...,positive
18306,who in their right mind plays a lyrical song a...,negative
21082,what seemed as a good premise for a movie...un...,negative
12013,wow. if you think that a film can't fatigue in...,positive
1785,i saw this when it premiered and just re-watch...,positive
21565,absolutely dreadful mexican film supposedly ba...,negative
13242,"i didn\'t expect a movie as good as ""in the li...",negative
4793,"""panic"" is a captivating, blurred-genre film a...",positive
3105,"there are movies, and there are films. movies ...",positive
16126,my wife and i are semi amused by howie mandel\...,negative


In [None]:
train_data.to_csv('Train_reviews.csv',index=False)

### Feature Extraction 

In [12]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\HRITHIK
[nltk_data]     REDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
eng_stopwords = set(stopwords.words('english'))

In [17]:
def feature_extract(text):
    # tokenize 
    tokens = tokenizer.tokenize(text)
    # Stop word Removal
    new_tokens = [token for token in tokens if token not in eng_stopwords]
    # Stemming 
    stemmed_tokens =[stemmer.stem(i) for i in new_tokens]
    clean_sen = " ".join(stemmed_tokens)
    return clean_sen 

In [18]:
a = train_data['Text_Review'].apply(feature_extract)
a

0        bromwel  high  cartoon  comedi  ran  time  pro...
1        homeless  houseless  georg  carlin  state  iss...
2        brilliant  act  lesley  ann  warren  best  dra...
3        easili  underr  film  inn  brook  cannon  sure...
4        typic  mel  brook  film  much  less  slapstick...
                               ...                        
24979    toward  end  movi  felt  technic  felt  like  ...
24980    kind  movi  enemi  content  watch  time  blood...
24981    saw  descent  last  night  stockholm  film  fe...
24982    film  pick  pound  turn  rather  good  23rd  c...
24983    one  dumbest  film  ever  seen  rip  nearli  e...
Name: Text_Review, Length: 24984, dtype: object

#### Feature Engineering

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,6))

In [24]:
b = cv.fit_transform(a)

In [49]:
train_data['vector_text'] = cv.transform(a).todense

In [50]:
train_data['vector_text'][1]

<bound method spmatrix.todense of <24984x1591107 sparse matrix of type '<class 'numpy.int64'>'
	with 5283118 stored elements in Compressed Sparse Row format>>

In [44]:
class_encode = {'positive':1,'negative':0}
X_train = train_data['vector_text']
y_train = train_data['Sentiment'].apply(lambda x: class_encode[x])