References:   
https://www.kaggle.com/alokmalik/text-classification-using-svm    
https://github.com/YangLinyi/SVM-CNN-RNN-HAN-Popular-NLP-Models    

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split #split data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC# Support Vector Machine
from sklearn.pipeline import Pipeline #pipeline to implement steps in series
from gensim import parsing # To stem data
from sklearn import metrics 
from copy import deepcopy

**Pre-defined Evaluation Functions**

In [2]:
def Get_Accuracy(y_true, y_pred):
    """Return the accuracy of a set of predictions.

    Accuracy is the ratio of correctly classified samples to the total
    number of samples. Delegates to ``metrics.accuracy_score`` with its
    default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Whatever ``metrics.accuracy_score(y_true, y_pred)`` returns.
    """
    return metrics.accuracy_score(y_true, y_pred)

def Get_Precision_score(y_true, y_pred):
    """Return the precision of a set of predictions.

    Precision is the share of correctly predicted positive samples (TP)
    among all samples predicted as positive (TP + FP). Delegates to
    ``metrics.precision_score`` with its default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Whatever ``metrics.precision_score(y_true, y_pred)`` returns.
    """
    return metrics.precision_score(y_true, y_pred)

def Get_Recall(y_true, y_pred):
    """Return the recall of a set of predictions.

    Recall is the share of correctly predicted positive samples (TP)
    among all truly positive samples (TP + FN). Delegates to
    ``metrics.recall_score`` with its default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Whatever ``metrics.recall_score(y_true, y_pred)`` returns.
    """
    return metrics.recall_score(y_true, y_pred)
 
def Get_f1_score(y_true, y_pred):
    """Return the F1 score of a set of predictions.

    The F1 score is the harmonic mean of precision and recall. Delegates
    to ``metrics.f1_score`` with its default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Whatever ``metrics.f1_score(y_true, y_pred)`` returns.
    """
    return metrics.f1_score(y_true, y_pred)

**Read Data**

In [4]:
# Tab-separated input files, resolved relative to the notebook's working directory.
# NOTE(review): original_train_path and train_path are the same file as written
# here -- confirm that this is intended.
original_train_path = 'train.tsv'
train_path = 'train.tsv'
test_path = 'test.tsv'

# Read each file into its own DataFrame (same read order as the paths in the tuple).
train_df, test_df, original_train_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_path, test_path, original_train_path)
)

In [5]:
# Shuffle the rows of every frame (frac=1 keeps all rows) and rebuild a fresh
# 0..n-1 index. A fixed random_state makes the row order identical on every
# Restart & Run All instead of changing from run to run.
SHUFFLE_SEED = 42
original_train_df = original_train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

**Print dataframe to check the shape**  
The later cells of this notebook rely on column names that are specific to this dataset (`Sentiment` and `Text`), so check that your own dataset has the same columns and shape before running them.

In [6]:
# Stack the original training rows underneath the current training rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and, like append, keeps each frame's
# own index labels (so index values repeat in the combined frame).
# NOTE(review): as the paths are currently written, train_path and
# original_train_path are both 'train.tsv', so this duplicates every training
# row -- confirm that this is intended.
train_df = pd.concat([train_df, original_train_df])

In [7]:
# Display the combined training frame to check its columns and shape.
train_df

Unnamed: 0.1,Unnamed: 0,Sentiment,Text
0,1551.0,Positive,"""""""""""""""A Mouse in the House"""""""" is a very clas..."
1,3104.0,Negative,this is actually called the strange world...to...
2,1290.0,Positive,"""""""""""""""The Man in the Moon"""""""" is a beautifull..."
3,280.0,Negative,"""""""Fairly interesting exploitation flick in bl..."
4,3373.0,Positive,not a bad idea.somehow i was supposed to reall...
...,...,...,...
1702,,Positive,"This was an excellent 2-part episode, although..."
1703,,Positive,"To many people, Beat Street has inspired their..."
1704,,Negative,"When I had first heard of ""Solar Crisis"" then ..."
1705,,Positive,The final chapter in the Hanzo the Razor trilo...


In [8]:
# Display the test frame to check its columns and shape.
test_df

Unnamed: 0,Sentiment,Text
0,Positive,I think this movie is well done and realistic....
1,Negative,"how can you take her hard-living, glamorously ..."
2,Positive,"The whole point of making this film, one of th..."
3,Positive,"This film is not morbid, nor is it depressing...."
4,Negative,It seems that Salvatores couldn't decide what ...
...,...,...
483,Positive,"This cartoon is short on plot, but is a visual..."
484,Positive,I have had the pleasure of reading Martin Torg...
485,Negative,"Roy Rogers and company try to bring ""Sintown"" ..."
486,Positive,Dead To Rights is about a Police Officer named...


**Convert sentiment labels to numeric values**

In [9]:
def clean_target(target_col):
    """Convert sentiment labels to integers: 'Negative' -> 0, 'Positive' -> 1.

    Args:
        target_col: Iterable of labels (for example a DataFrame column).

    Returns:
        A list of ints with exactly one entry per input label, in input order.

    Raises:
        ValueError: If a label is neither 'Negative' nor 'Positive'. Silently
            skipping such a label would return a list shorter than the input,
            which cannot be assigned back to the column it came from.
    """
    label_to_int = {'Negative': 0, 'Positive': 1}
    new_labels = []
    for position, each in enumerate(target_col):
        try:
            new_labels.append(label_to_int[each])
        except (KeyError, TypeError):
            # KeyError: unknown label; TypeError: unhashable value used as a label.
            raise ValueError(
                "Unexpected sentiment label {!r} at position {}; "
                "expected 'Negative' or 'Positive'".format(each, position)
            ) from None
    return new_labels

# Replace the text labels in the 'Sentiment' column of both frames with the
# numeric codes produced by clean_target.
for sentiment_df in (train_df, test_df):
    sentiment_df['Sentiment'] = clean_target(sentiment_df['Sentiment'])

**Define feature and target**

In [12]:
# Features are the values of the 'Text' column and targets the values of the
# 'Sentiment' column, each pulled out of its frame as a plain Python list.
X_train, y_train = train_df['Text'].to_list(), train_df['Sentiment'].to_list()
X_test, y_test = test_df['Text'].to_list(), test_df['Sentiment'].to_list()

**Build NB and predict**    
pipeline version

In [13]:
# Chain the three steps in one Pipeline object so a single fit/predict call
# runs them in sequence:
#   vect  -> CountVectorizer turns each text into a vector of token counts
#   tfidf -> TfidfTransformer re-weights those counts with TF-IDF
#   clf   -> MultinomialNB (Naive Bayes) is the classifier trained on the result
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Train the whole pipeline on the training texts and labels.
text_clf.fit(X_train, y_train)

# Predict the class of every test text.
predicted = text_clf.predict(X_test)

**Evaluation**

In [15]:
# Score the Naive Bayes predictions against the true test labels with the four
# evaluation helpers, then print them as percentages with two decimals.
# (Results are only printed; nothing is written to a file.)
accuracy, precision, recall, f1_score = (
    Get_Accuracy(y_test, predicted),
    Get_Precision_score(y_test, predicted),
    Get_Recall(y_test, predicted),
    Get_f1_score(y_test, predicted),
)
print(
    "NB evaluation Result: Accuracy {:.2%}  ".format(accuracy),
    "Precision {:.2%}  ".format(precision),
    "Recall {:.2%}  ".format(recall),
    "F1-Score {:.2%}  ".format(f1_score),
)

NB evaluation Result: Accuracy 80.53%   Precision 75.17%   Recall 91.43%   F1-Score 82.50%  
