# Import Library

In [1]:
import pandas as pd
import numpy as np
np.random.seed(9)

# Data Loading

In [2]:
data = pd.read_csv('sentiments.tsv', sep='\t', header=None)
data.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# 1. Data Split

In [3]:
def data_split(data):

    split_len = np.random.rand(len(data)) < .8

    train = data[split_len]
    test = data[~split_len]

    return train, test


train, test = data_split(data)
print('Train Data Shape = {}\nTest  Data Shape = {}'.format(train.shape, test.shape))

Train Data Shape = (654, 2)
Test  Data Shape = (155, 2)


# Data Preprocess
## 1. Same Class Data Merge

In [4]:
def merge_data(train):

    train_class = train[1].unique()

    process_data = pd.DataFrame(
        index=train_class, columns=['string', 'y_count'])

    for i in train_class:
        process_data.iloc[i][0] = ' '.join(train[train[1] == i][0])
        process_data.iloc[i][1] = train[train[1] == i].shape[0]
    return process_data


train_merge = merge_data(train)
print(train_merge)

                                              string y_count
0  A very, very, very slow-moving, aimless movie ...     321
1  The best scene in the movie was when Gerardo i...     333


# Data Preprocess
## 2. Remove Special Character from Sentence

In [5]:
def preprocess(data):

    new_data = pd.DataFrame(data)
    new_data.iloc[:, 0] = new_data.iloc[:, 0].str.lower()

    for i in [',', '-', '.', '?', '(', ')', '"']:
        new_data.iloc[:, 0] = new_data.iloc[:, 0].str.replace(i, ' ')

    new_data.iloc[:, 0] = new_data.iloc[:, 0].map( lambda x: ' '.join(x.split()))

    return new_data


train_pre = preprocess(train_merge)
train_pre.head()

Unnamed: 0,string,y_count
0,a very very very slow moving aimless movie abo...,321
1,the best scene in the movie was when gerardo i...,333


# 2. Vocabulary on Training Data Set
## 2.1 Create Vocabulary Dictionary

In [6]:
def vocabulary(train_pre):

    data_vocab = {}

    for i in train_pre.iloc[:, 0]:

        for j in i.split():
            try:
                data_vocab[j] += 1
            except:
                data_vocab[j] = 1

    return data_vocab


data_vocab = vocabulary(train_pre)
feature_name = list(data_vocab)
len(data_vocab)

2701

# 2. Vocabulary on Training Data Set
## 2.2 Create Vocabulary Feature Matrix

In [7]:
def feature_mat(feature_name, train_pre):

    x = np.zeros((train_pre.shape[0], len(feature_name)))
    try:
        y = np.array([train_pre.iloc[:, 1].values]).T
        zero_data = np.concatenate([x, y], axis=1)
        train_mat = pd.DataFrame(zero_data, columns=feature_name+['Y'])

    except:
        x = np.zeros((train_pre.shape[0], len(feature_name)))
        train_mat = pd.DataFrame(x, columns=feature_name)

    k = 0
    for i in train_pre.iloc[:, 0]:
        for j in i.split():
            try:
                train_mat[j][k] += 1
            except:
                train_mat[j][k] = 1
        k += 1

    return train_mat


train_mat = feature_mat(feature_name, train_pre)
print(train_mat)

     a very slow moving aimless movie about distressed drifting young  ...  :  \
0  118   25    5      3       1    59    17          1        1     1  ...  0   
1  207   24    1      1       0    73    17          0        0     2  ...  1   

  anyway flowed smoothly bonding hoot cat  n delight    Y  
0      0      0        0       0    0   0  0       0  321  
1      1      1        1       1    1   1  1       1  333  

[2 rows x 2702 columns]


# (3,4). Probability Calculate
## 3. Prior Probability for Class
## 4. Conditional Probability of Vocabulary for Each Class

In [8]:
def prior_calculate(train_mat, feature_name):

    V = len(feature_name)
    total_class = train_mat.index.values

    row = range(total_class.shape[0])
    column = len(feature_name)

    prior_mat = np.zeros((len(row), column+1))

    prior_mat = pd.DataFrame(
        prior_mat, index=total_class, columns=feature_name+['Y'])
    total_output = train_mat.iloc[:, -1].sum()

    for k, i in enumerate(total_class):

        sample = train_mat.iloc[i, :-1]
        total_word = sample.sum()

        for j in feature_name:
            prior_mat.iloc[k][j] = (sample[j]+1)/(total_word+V)

        prior_mat.iloc[k]['Y'] = train_mat.iloc[i, -1]/total_output

    return prior_mat, total_class


prior_prob, total_class = prior_calculate(train_mat, feature_name)
prior_prob

Unnamed: 0,a,very,slow,moving,aimless,movie,about,distressed,drifting,young,...,:,anyway,flowed,smoothly,bonding,hoot,cat,n,delight,Y
0,0.016221,0.003544,0.000818,0.000545,0.000273,0.008179,0.002454,0.000273,0.000273,0.000273,...,0.000136,0.000136,0.000136,0.000136,0.000136,0.000136,0.000136,0.000136,0.000136,0.490826
1,0.023194,0.002788,0.000223,0.000223,0.000112,0.008252,0.002007,0.000112,0.000112,0.000335,...,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.509174


# Test data preprocess

In [9]:
test_pre = preprocess(test)
test_pre.head()

Unnamed: 0,0,1
11,the movie showed a lot of florida at it's best...,1
12,the songs were the best and the muppets were s...,1
16,this review is long overdue since i consider a...,1
22,how can anyone in their right mind ask for any...,1
24,yes this film does require a rather significan...,1


# 5. Posterior Probability for Each Class for given Test Instance

## Calculate TP,TN,FP,FN for All Test Instance

In [10]:
def posterior(feature_row, prior_prob, train_mat):

    data_vocab = vocabulary(feature_row)
    feature_name = list(data_vocab)
    length = prior_prob.shape[1]

    test_mat = feature_mat(feature_name, feature_row)

    res = {}
    for i in prior_prob.index.values:

        total_y = train_mat['Y'][i]

        res[i] = prior_prob.iloc[i, -1]

        for j in feature_name:
            try:
                res[i] *= prior_prob.loc[i][j]**test_mat[j][0]
            except:
                res[i] *= (1/(total_y+length))**test_mat[j][0]

    return sorted(res, key=lambda x: res[x], reverse=True)[0]


TP, TN, FP, FN = 0, 0, 0, 0

for i in range(test.shape[0]):

    x = (posterior(test.iloc[i:i+1, :-1],
                   prior_prob, train_mat), test.iloc[i, -1])
    if x[0] == x[1]:
        TP, TN = (TP+1, TN+0) if x[0] == 1 else (TP+0, TN+1)
    else:
        FP, FN = (FP+1, FN+0) if x[0] == 1 else (FP+0, FN+1)

print('TP = {}\nTN = {}\nFP = {}\nFN = {}'.format(TP, TN, FP, FN))

TP = 73
TN = 48
FP = 19
FN = 15


# Measure Performance of This Model

In [11]:
def performance(TP, TN, FP, FN):
    precision = 100*TP/(TP+FP)
    recall = 100*TP/(TP+FN)
    f1_score = (2*precision*recall)/(precision+recall)
    accuracy = 100*(TP+TN)/(TP+TN+FP+FN)

    return precision, recall, f1_score, accuracy


print('Precision = {:.2f}%\nRecall    = {:.2f}%\nF1 Score  = {:.2f}%\nAccuracy  = {:.2f}%'.format(*performance(TP, TN, FP, FN)))

Precision = 79.35%
Recall    = 82.95%
F1 Score  = 81.11%
Accuracy  = 78.06%
