References: 
https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1

In [1]:
#Load the train and test splits as DataFrames
#Source: Sentiment Polarity Datasets 2.0
import pandas as pd

#Remote CSV locations, kept as named constants so they are easy to spot and swap
TRAIN_URL = "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
TEST_URL = "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

trainData = pd.read_csv(TRAIN_URL)
testData = pd.read_csv(TEST_URL)

In [2]:
#Peek at the training data: shuffle every row, then display the first five
trainData.sample(frac=1).iloc[:5]

Unnamed: 0,Content,Label
1556,"synopsis : melissa , a mentally-disturbed woma...",neg
1056,i heard actor skeet ulrich discussing this fil...,neg
556,"to me , nicolas cage sounds like an ideal choi...",pos
469,"when quentin tarantino made "" pulp fiction "" ,...",pos
997,well arnold has completed the seemingly imposs...,neg


In [3]:
#Turn the review text into TF-IDF feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

#min_df / max_df bound the document frequency of the terms kept in the
#vocabulary; sublinear_tf and use_idf select the term weighting
#(see the scikit-learn TfidfVectorizer documentation for exact semantics)
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)

#Fit on the training reviews only, then reuse the fitted vectorizer
#to encode the test reviews
train_vectors = vectorizer.fit_transform(trainData['Content'])
test_vectors = vectorizer.transform(testData['Content'])

In [4]:
#Linear SVM model: train, predict on the test set and report per-class metrics
import time
from sklearn import svm
from sklearn.metrics import classification_report

#Perform classification with SVM, linear kernel
classifier_linear = svm.SVC(kernel='linear')

#Use time.perf_counter() for the stopwatch: it is monotonic and high-resolution,
#whereas time.time() is wall-clock time and can jump if the system clock changes
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, trainData['Label'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1

#Results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

#output_dict=True returns the report as a dict, indexed below by class label
report = classification_report(testData['Label'], prediction_linear, output_dict=True)

print('Positive:', report['pos'])
print('Negative:', report['neg'])

Training time: 7.029638s; Prediction time: 0.670842s
Positive: {'precision': 0.9191919191919192, 'recall': 0.91, 'f1-score': 0.9145728643216081, 'support': 100}
Negative: {'precision': 0.9108910891089109, 'recall': 0.92, 'f1-score': 0.9154228855721394, 'support': 100}


In [1]:
#Persistence setup: pickle module plus the output locations used to
#dump and reload the model and the vectorizer
import pickle

#Where the trained classifier is stored
MODEL_PATH = 'data/output/model.sav'
#Where the fitted vectorizer (vocabulary) is stored
VOCABULARY_PATH = 'data/output/vectorizer.sav'

In [None]:
#Pickling the vectorizer
#The with-block closes the file even if pickle.dump raises,
#so no handle is leaked and the data is flushed to disk
with open(VOCABULARY_PATH, 'wb') as pickle_out:
    pickle.dump(vectorizer, pickle_out)

#Pickling the model
with open(MODEL_PATH, 'wb') as pickle_out:
    pickle.dump(classifier_linear, pickle_out)

In [3]:
#Extract the model and vocabulary
#NOTE: pickle.load can execute arbitrary code; only load files you produced
#yourself or otherwise trust.
#The with-blocks make sure both file handles are closed after loading
with open(MODEL_PATH, 'rb') as model_in:
    classifier_linear = pickle.load(model_in)

with open(VOCABULARY_PATH, 'rb') as vectorizer_in:
    vectorizer = pickle.load(vectorizer_in)

In [8]:
#Classify two example sentences with the loaded model
review = [
    "TOTAL is the best entrerprise ever!",
    'SOGE is a corrupted entreprise.',
]

#Encode the sentences with the vectorizer, then print the predicted labels
review_vector = vectorizer.transform(review)
print(classifier_linear.predict(review_vector))

['pos' 'neg']
