#Build a logistic regression model to classify movie reviews as either positive or negative.
# Task 1 LOAD THE DATASET 


In [4]:
import pandas as pd 

# Read the labelled movie reviews (columns: review, sentiment).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the same behaviour:
# malformed rows are skipped and reported instead of aborting the load.
df = pd.read_csv(
    '/content/data/moviedata.csv',
    engine='python',
    encoding='utf-8',
    on_bad_lines='warn',
)
print(df.head(10))


                                              review  sentiment
0  In 1974, the teenager Martha Moxley (Maggie Gr...          1
1  OK... so... I really like Kris Kristofferson a...          0
2  ***SPOILER*** Do not read this, if you think a...          0
3  hi for all the people who have seen this wonde...          1
4  I recently bought the DVD, forgetting just how...          0
5  Leave it to Braik to put on a good show. Final...          1
6  Nathan Detroit (Frank Sinatra) is the manager ...          1
7  To understand "Crash Course" in the right cont...          1
8  I've been impressed with Chavez's stance again...          1
9  This movie is directed by Renny Harlin the fin...          1


Skipping line 26458: unexpected end of data


# Task 2 transforming documents to feature vectors

In [5]:
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining,The weather is sweet,and one and one is two',
])

# Learn the vocabulary and turn every document into a vector of term counts.
count = CountVectorizer()
bag = count.fit_transform(docs)

# vocabulary_ maps each token to its column index in the count matrix.
print(count.vocabulary_)
# One row per document, one column per vocabulary entry.
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


# Task 3 TF-IDF (term frequency-inverse document frequency)

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts in `bag` with TF-IDF; every row is
# L2-normalised and the IDF term is smoothed.
tfidf = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
print(tfidf.fit_transform(bag).toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


# Task 4 Data preparation 

In [7]:
# Show the last 50 characters of the review stored under index label 0,
# before any cleaning has been applied to the text.
print(df.loc[0,'review'][-50:])

import  re
def preprocessor(text):
    """Clean a raw review: lowercase words first, emoticons appended last.

    Processing steps:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons (eyes ``:``, ``;`` or ``=``, an optional ``-``
         nose, and a mouth ``)``, ``(``, ``D`` or ``P``) before they are
         destroyed by the punctuation clean-up.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, with the ``-`` nose removed,
         separated by single spaces.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in the input.
    """
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character the emoticons used to be
    # glued onto the final word (e.g. "loved it:)"); insert the missing
    # separator so a whitespace tokenizer sees them as separate tokens.
    if faces and not words.endswith(' '):
        words += ' '
    return words + faces

# Sanity-check the cleaner on a real review tail and on a synthetic string
# that mixes HTML markup with emoticons.
for sample in (df.loc[0, 'review'][-50:], "</a>This :) is a :( test :-)!"):
    print(preprocessor(sample))

# Clean every review; the 'review' column is overwritten with the result.
df['review'] = df['review'].apply(preprocessor)

is seven.<br /><br />Title (Brazil): Not Available
is seven title brazil not available
this is a test :) :( :)


#  Task 5 tokenization of documents 

In [8]:

from nltk.stem.porter import PorterStemmer

# Single module-level Porter stemmer instance; tokenizer_porter reads it.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each word.

    Uses the module-level ``porter`` stemmer instance.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

# Run the same sentence through the plain and the stemming tokenizer to
# compare their output.
for tokenize in (tokenizer, tokenizer_porter):
    print(tokenize('runners like running and thus they run'))


import nltk

# Download NLTK's 'stopwords' corpus so that stopwords.words('english')
# can be read afterwards.
nltk.download('stopwords')


from nltk.corpus import stopwords

# English stop-word list from NLTK.
stop = stopwords.words('english')

# Stem a sample sentence and keep only the stems that are not stop words.
# The [-10:] slice keeps at most the last ten tokens; this sentence has
# fewer, so all of them are considered.
[
    token
    for token in tokenizer_porter('a runner likes running and runs a lot ')[-10:]
    if token not in stop
]



['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['runner', 'like', 'run', 'run', 'lot']

# Task 6 Transform text data into TF-IDF vectors

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer for the cleaned reviews. Lower-casing and extra
# preprocessing are switched off because the 'review' column has already been
# passed through preprocessor(); tokens are Porter stems.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    preprocessor=None, strip_accents=None, lowercase=False,
    norm='l2', use_idf=True, smooth_idf=True,
)

# Labels and feature matrix.
# NOTE(review): the vectorizer is fitted on every review before the data is
# split into train and test parts, so the IDF weights also see the held-out
# reviews -- consider fitting on the training portion only.
y = df['sentiment'].values
X = tfidf.fit_transform(df['review'])

# Task 7 Document classification using logistic regression

In [10]:
from sklearn.model_selection import train_test_split
# 50/50 split without shuffling: rows keep their original order.
# NOTE(review): random_state presumably has no effect while shuffle=False --
# confirm against the train_test_split documentation.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

import pickle
from sklearn.linear_model import LogisticRegressionCV

# Fit a logistic-regression classifier with 5-fold cross-validation, scored
# by accuracy, using all available cores.
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
).fit(x_train, y_train)

# Persist the fitted model. The context manager guarantees the file handle is
# closed even if pickling raises, which the manual open()/close() pair did not.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


# Task 8 Model Evaluation 

In [11]:
filename = 'saved_model.sav'

# Reload the classifier written by the training cell. The file is opened in a
# context manager so the handle is closed after loading (the bare
# pickle.load(open(...)) left it open).
# NOTE: unpickling executes arbitrary code -- only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

# Accuracy of the reloaded model on the held-out half of the data.
saved_clf.score(x_test, y_test)




0.8858482007862111