In [12]:
import os
import random
import joblib
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

### Data Class

In [13]:
class Review:
    def __init__(self, text, sentiment):
        self.text = text
        self.sentiment = sentiment
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def equal_split(self):
        negatives = list(filter(lambda x: x.sentiment == 'positive', self.reviews))
        positives = list(filter(lambda x: x.sentiment == 'negative', self.reviews))
        positives_shrunk = positives[:len(negatives)]
        self.reviews = negatives + positives_shrunk
        random.shuffle(self.reviews)

### Loading Data

In [14]:
df = pd.read_csv(os.path.join('Data', 'data.csv'))

In [15]:
reviews = []

for idx, review in enumerate(df['review']):
    soup = BeautifulSoup(review, 'html.parser')
    text = soup.get_text()
    reviews.append(Review(text, df['sentiment'][idx]))

### Split Dataset

In [16]:
train, test = tts(reviews, test_size=0.4, random_state=42, shuffle=True)

train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [17]:
train_container.equal_split()
X_train = train_container.get_text()
y_train = train_container.get_sentiment()

test_container.equal_split()
X_test = test_container.get_text()
y_test = test_container.get_sentiment()

In [18]:
y_train.count('negative'), y_train.count('positive')

(14989, 14989)

### Vecorizing Reviews

In [25]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

### Training Model

In [26]:
clf = LinearSVC(max_iter=3500)
clf.fit(X_train_vectors, y_train)

LinearSVC(max_iter=3500)

In [27]:
clf_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_test, clf_pred)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 0.85975


### Saving the Model

In [28]:
joblib.dump(clf, os.path.join('models', 'review_classifier.pkl'))

['models\\review_classifier.pkl']

### Predictions

In [33]:
loaded_clf = joblib.load(os.path.join('models', 'review_classifier.pkl'))

text = 'the movie is awesome!'
test_vector = vectorizer.transform([text])
loaded_clf.predict(test_vector)

array(['positive'], dtype='<U8')