# Sentiment Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read data

In [None]:
# Load the training set and the test set from CSV files.
data_train = pd.read_csv('../input/train.csv')
data_test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Summarise the training frame: column dtypes, summary statistics and the
# per-column overview.
print(data_train.dtypes)
print(data_train.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line.
data_train.info()

In [None]:
# Class balance: number of training rows for each label value.
data_train.label.value_counts()

In [None]:
# (rows, columns) of the training frame and of the test frame.
print(data_train.shape, data_test.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training sentences only, then encode the
# test sentences with that same vocabulary.
train_vector = vectorizer.fit_transform(data_train.sentence)
test_vector = vectorizer.transform(data_test.sentence)

### Visualize Word Frequency

In [None]:
# Total number of occurrences of every vocabulary term in the training corpus.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = vectorizer.get_feature_names_out()
else:
    vocabulary_terms = vectorizer.get_feature_names()

# Sum the columns on the sparse matrix directly: calling .toarray() first
# would materialise a dense (documents x vocabulary) array just to add it up.
WordFrequency = pd.DataFrame({
    'Word': vocabulary_terms,
    'Count': np.asarray(train_vector.sum(axis=0)).ravel(),
})

In [None]:
# Relative frequency: each word's share of all word occurrences counted above.
WordFrequency['Frequency'] = WordFrequency['Count'] / WordFrequency['Count'].sum()

In [None]:
# Line plot of the relative frequency of every word, ordered by row index.
plt.plot(WordFrequency['Frequency'])
plt.ylabel('Word Frequency')
plt.xlabel('Word Index')
plt.show()

### Sort WordFrequency in descending order

In [None]:
# Rank the words from most to least frequent and show the top rows.
WordFrequency_sort = WordFrequency.sort_values(by='Frequency', ascending=False)
WordFrequency_sort.head()

## Model 1: Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# cross_val_score lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import cross_val_score

In [None]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf1 = MultinomialNB()

In [None]:
# 10-fold cross-validated accuracy of the Naive Bayes model on the training
# data: per-fold scores first, then their mean.
cross_val_acc = cross_val_score(
    clf1,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(cross_val_acc)
print(cross_val_acc.mean())

In [None]:
# Train on the whole training set and label the test sentences.
# fit() returns the estimator itself, so the two calls can be chained.
predictions = clf1.fit(train_vector, data_train['label'].values).predict(test_vector)

In [None]:
# Pair every test sentence with its predicted label.
solution1 = pd.DataFrame({
    'sentence': data_test['sentence'].values,
    'label': predictions,
})

In [None]:
# Write the Naive Bayes predictions to CSV, without the index column.
solution1.to_csv('./solution1_naive_bayes.csv', index=False)
# Accuracy in testing data: 0.97461

## Model 2: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters; n_jobs=-1 lets scikit-learn
# use all available processors.
clf2 = RandomForestClassifier(n_jobs=-1)

In [None]:
# 10-fold cross-validated accuracy of the default random forest:
# per-fold scores first, then their mean.
cross_val_acc2 = cross_val_score(
    clf2,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(cross_val_acc2)
print(cross_val_acc2.mean())

In [None]:
# Train the forest on the whole training set and label the test sentences.
# fit() returns the estimator itself, so the two calls can be chained.
prediction2 = clf2.fit(train_vector, data_train['label'].values).predict(test_vector)

In [None]:
# Pair every test sentence with the label predicted by the random forest.
solution2 = pd.DataFrame({
    'sentence': data_test['sentence'].values,
    'label': prediction2,
})

In [None]:
# Write the random-forest predictions to CSV, without the index column.
solution2.to_csv('./solution2_random_forest.csv', index=False)
# Accuracy in testing data: 0.97884

### Use GridSearchCV

In [None]:
from pprint import pprint
from sklearn.model_selection import GridSearchCV

In [None]:
# Pretty-print the forest's hyper-parameters and their current values.
pprint(clf2.get_params())

In [None]:
# Exhaustive grid search over random-forest hyper-parameters, scored by
# cross-validated accuracy.
param_grid = {
    'class_weight': ['balanced', None],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 1, 5, 10],
    # 'sqrt' replaces 'auto': for classifiers the two were equivalent, and
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': ['sqrt', 'log2', None],
    'n_estimators': [5, 10, 20],
}
cv_clf2 = GridSearchCV(
    estimator=clf2,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=0,
    n_jobs=-1,
)
cv_clf2.fit(train_vector, data_train.label.values)
best_parameters = cv_clf2.best_params_
print('The best parameters for using RF model is: ', best_parameters)

In [None]:
# Three forest variants with 20 trees each: balanced class weights, the
# entropy split criterion, and otherwise-default settings.
clf2_balanced_gini = RandomForestClassifier(class_weight='balanced', n_estimators=20)
clf2_entropy = RandomForestClassifier(criterion='entropy', n_estimators=20)
clf2_gini = RandomForestClassifier(n_estimators=20)

In [None]:
# 10-fold cross-validated accuracy of the class-weight-balanced forest.
RF_score1 = cross_val_score(
    clf2_balanced_gini,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(RF_score1)
print(RF_score1.mean())

In [None]:
# 10-fold cross-validated accuracy of the entropy-criterion forest.
RF_score2 = cross_val_score(
    clf2_entropy,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(RF_score2)
print(RF_score2.mean())

In [None]:
# 10-fold cross-validated accuracy of the 20-tree forest with default settings.
RF_score3 = cross_val_score(
    clf2_gini,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(RF_score3)
print(RF_score3.mean())

In [None]:
# Train the class-weight-balanced forest on the whole training set, label the
# test sentences and pair each sentence with its prediction.
prediction2_tuned = clf2_balanced_gini.fit(
    train_vector, data_train['label'].values
).predict(test_vector)
solution2_tuned = pd.DataFrame({
    'sentence': data_test['sentence'].values,
    'label': prediction2_tuned,
})

In [None]:
# Write the tuned random-forest predictions to CSV, without the index column.
solution2_tuned.to_csv('./solution2_RF_tuned.csv', index=False)

## Model 3: Logistic Regression (Use GridSearchCV to tune hyper-parameters)

In [None]:
# Baseline: logistic regression with scikit-learn's default hyper-parameters.
from sklearn.linear_model import LogisticRegression
clf3_1 = LogisticRegression()

In [None]:
# 10-fold cross-validated accuracy of the default logistic regression.
cross_val_acc3_1 = cross_val_score(
    clf3_1,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(cross_val_acc3_1)
print(cross_val_acc3_1.mean())

### Use GridSearchCV

In [None]:
# Pretty-print the logistic regression's hyper-parameters and current values.
pprint(clf3_1.get_params())

In [None]:
# Grid search over penalty type, class weighting and regularisation strength
# for logistic regression, scored by cross-validated accuracy.
param_grid = {
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None],
    'C': [0.1, 1, 10],
    # The grid includes the L1 penalty, which the default 'lbfgs' solver
    # (scikit-learn >= 0.22) does not support. 'liblinear' handles both
    # penalties and was the default solver in earlier releases.
    'solver': ['liblinear'],
}
clf3_2 = GridSearchCV(
    estimator=clf3_1,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
clf3_2.fit(train_vector, data_train.label.values)
best_param = clf3_2.best_params_
print('The best parameters for using LR model is: ', best_param)

In [None]:
# Logistic regression with a hand-set regularisation strength, evaluated with
# 10-fold cross-validated accuracy.
# NOTE(review): C=9.4 is not one of the values in the grid searched earlier,
# and this assignment rebinds clf3_2, discarding the fitted GridSearchCV
# object - confirm where the value comes from.
clf3_2 = LogisticRegression(C=9.4)
cross_val_acc3_2 = cross_val_score(
    clf3_2,
    train_vector,
    data_train['label'].values,
    scoring='accuracy',
    cv=10,
)
print(cross_val_acc3_2)
print(cross_val_acc3_2.mean())

In [None]:
# Train both logistic-regression models on the whole training set and label
# the test sentences. fit() returns the estimator, so the calls are chained.
prediction3_1 = clf3_1.fit(train_vector, data_train['label'].values).predict(test_vector)
prediction3_2 = clf3_2.fit(train_vector, data_train['label'].values).predict(test_vector)

In [None]:
# Pair every test sentence with the label predicted by each of the two
# logistic-regression models.
solution3_origin_LR = pd.DataFrame({
    'sentence': data_test['sentence'].values,
    'label': prediction3_1,
})
solution3_CV_LR = pd.DataFrame({
    'sentence': data_test['sentence'].values,
    'label': prediction3_2,
})

In [None]:
# Write the default logistic-regression predictions to CSV, without the index.
solution3_origin_LR.to_csv('./solution3_origin_LR.csv', index=False)
# Accuracy in testing data: 0.99083

In [None]:
# Write the C=9.4 logistic-regression predictions to CSV, without the index.
solution3_CV_LR.to_csv('./solution3_CV_LR.csv', index=False)
# Accuracy in testing data: 0.99083

## Model 4: RNN

In [None]:
import os
import sys
import time

class Dataset():
    """In-memory (data, label) pairs served as mini-batches.

    ``data`` and ``label`` must expose ``.shape`` and accept both slicing and
    indexing with an integer index array (e.g. NumPy arrays); their first
    dimensions must have the same length.

    Raises:
        ValueError: if ``data`` and ``label`` hold a different number of
            examples.
    """

    def __init__(self, data, label):
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if data.shape[0] != label.shape[0]:
            raise ValueError(
                'data and label must contain the same number of examples '
                '(got %d and %d)' % (data.shape[0], label.shape[0]))
        self._index_in_epoch = 0
        self._epochs_completed = 0
        self._data = data
        self._label = label
        self._num_examples = data.shape[0]

    @property
    def data(self):
        """The examples, in their current (possibly shuffled) order."""
        return self._data

    @property
    def label(self):
        """The labels, kept in the same order as ``data``."""
        return self._label

    def _reshuffle(self):
        """Apply one random permutation to data and label together."""
        order = np.arange(0, self._num_examples)
        np.random.shuffle(order)
        self._data = self.data[order]
        self._label = self.label[order]

    def next_batch(self, batch_size, shuffle=True):
        """Return the next ``batch_size`` examples as ``(data, label)``.

        When the current epoch has fewer than ``batch_size`` examples left,
        the batch is completed with examples from the start of the next
        epoch, so the number of examples need not be a multiple of
        ``batch_size``.

        Args:
            batch_size: number of examples to return.
            shuffle: when true (the default), the examples are shuffled
                before the very first batch and again at every epoch
                boundary; when false, they are served in their stored order.

        NOTE(review): ``batch_size`` larger than the number of examples is
        not handled specially and does not yield well-defined batches.
        """
        start = self._index_in_epoch

        # Shuffle once before the very first batch is served.
        if shuffle and start == 0 and self._epochs_completed == 0:
            self._reshuffle()

        end = start + batch_size
        if end <= self._num_examples:
            # The whole batch fits inside the current epoch.
            self._index_in_epoch = end
            return self._data[start:end], self._label[start:end]

        # The batch straddles an epoch boundary: take what is left of the
        # current epoch first ...
        self._epochs_completed += 1
        rest_num_examples = self._num_examples - start
        data_rest_part = self._data[start:self._num_examples]
        label_rest_part = self._label[start:self._num_examples]

        # ... then start a new epoch and take the missing examples from its
        # beginning.
        if shuffle:
            self._reshuffle()
        self._index_in_epoch = batch_size - rest_num_examples
        end = self._index_in_epoch
        data_new_part = self._data[0:end]
        label_new_part = self._label[0:end]
        return (np.concatenate((data_rest_part, data_new_part), axis=0),
                np.concatenate((label_rest_part, label_new_part), axis=0))

class ProgressBar():
    """Single-line textual progress bar written to ``sys.stdout``.

    Args:
        worksum: total amount of work to be done.
        info: text printed in front of the bar.
        auto_display: when true, the bar is redrawn on every ``complete()``.
    """

    def __init__(self, worksum, info="", auto_display=True):
        self.worksum = worksum
        self.info = info
        self.finishsum = 0
        self.auto_display = auto_display
        # Timing state is defined up front so that complete() and
        # display_progress_bar() are safe to call before startjob().
        self.begin_time = None
        self.gaptime = 0.0

    def startjob(self):
        """Start (or restart) the clock used for elapsed time and ETA."""
        self.begin_time = time.time()

    def complete(self, num):
        """Record ``num`` more finished units and redraw if auto_display."""
        if self.begin_time is None:
            # startjob() was never called: start timing from now.
            self.startjob()
        self.gaptime = time.time() - self.begin_time
        self.finishsum += num
        if self.auto_display:
            self.display_progress_bar()

    def display_progress_bar(self):
        """Overwrite the current terminal line with the progress report."""
        if self.worksum:
            percent = self.finishsum * 100 / self.worksum
        else:
            # No work to do at all: report the job as finished instead of
            # dividing by zero.
            percent = 100.0
        # The small constant keeps the estimate finite at 0 %.
        eta_time = max(0.0, self.gaptime * 100 / (percent + 0.001) - self.gaptime)
        # One '=' per 2 %; clamped so the bar keeps its width even when more
        # work than worksum has been reported.
        filled = min(50, max(0, int(percent // 2)))
        strprogress = "[" + "=" * filled + ">" + "-" * (50 - filled) + "]"
        str_log = ("%s %.2f %% %s %s/%s \t used:%ds eta:%d s" % (self.info,percent,strprogress,self.finishsum,self.worksum,self.gaptime,eta_time))
        sys.stdout.write('\r' + str_log)
        # Without a flush the line may sit in the buffer until a newline.
        sys.stdout.flush()

def get_dataset(paths):
    """Collect one ImageClass per sub-directory of the given root folders.

    Args:
        paths: one or more root directories joined with ':'; a leading '~'
            is expanded.

    Returns:
        A list of ImageClass objects, one for each sub-directory of each
        root (visited in sorted name order), holding the paths of all
        entries found inside that sub-directory. Entries of a root that are
        not directories are ignored.
    """
    dataset = []
    for raw_root in paths.split(':'):
        root = os.path.expanduser(raw_root)
        for class_name in sorted(os.listdir(root)):
            class_dir = os.path.join(root, class_name)
            if not os.path.isdir(class_dir):
                continue
            image_paths = [
                os.path.join(class_dir, entry)
                for entry in os.listdir(class_dir)
            ]
            dataset.append(ImageClass(class_name, image_paths))
    return dataset

class ImageClass():
    """A class name together with the paths of the images belonging to it."""

    def __init__(self, name, image_paths):
        self.name = name
        self.image_paths = image_paths

    def __len__(self):
        """Number of image paths stored for this class."""
        return len(self.image_paths)

    def __str__(self):
        """Render as '<name>, <count> images'."""
        count_text = str(len(self.image_paths)) + ' images'
        return ', '.join((self.name, count_text))

def split_dataset(dataset, split_ratio, mode):
    """Randomly split ``dataset`` into a training part and a test part.

    Args:
        dataset: sequence of class entries. In 'SPLIT_IMAGES' mode every
            entry must provide ``name`` and ``image_paths``.
        split_ratio: fraction of the classes (or of each class's images)
            that goes to the training part.
        mode: 'SPLIT_CLASSES' assigns whole classes to one part or the
            other; 'SPLIT_IMAGES' splits the images of every class, skipping
            classes whose training share would be fewer than 2 images.

    Returns:
        A ``(train_set, test_set)`` tuple of lists.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values.
    """
    if mode == 'SPLIT_CLASSES':
        nrof_classes = len(dataset)
        class_indices = np.arange(nrof_classes)
        np.random.shuffle(class_indices)
        split = int(round(nrof_classes * split_ratio))
        train_set = [dataset[i] for i in class_indices[:split]]
        # Slice to the end: the former ``[split:-1]`` silently dropped the
        # last class from the test set.
        test_set = [dataset[i] for i in class_indices[split:]]
    elif mode == 'SPLIT_IMAGES':
        train_set = []
        test_set = []
        min_nrof_images = 2
        for cls in dataset:
            # Shuffle a copy so the caller's image list is left untouched.
            paths = list(cls.image_paths)
            np.random.shuffle(paths)
            split = int(round(len(paths) * split_ratio))
            if split < min_nrof_images:
                continue  # Training share too small. Skip class...
            train_set.append(ImageClass(cls.name, paths[:split]))
            # Same off-by-one as above: keep the last image as well.
            test_set.append(ImageClass(cls.name, paths[split:]))
    else:
        raise ValueError('Invalid train/test split mode "%s"' % mode)
    return train_set, test_set

In [None]:
import collections
import tensorflow as tf
%matplotlib inline

In [None]:
# Raw text of every training sentence whose label is 1.
data_train[data_train['label'] == 1].sentence.values

In [None]:
# Tokenise the training sentences into lower-cased, whitespace-separated
# words, grouped by label: label 1 -> pos_lines, label 0 -> pess_lines.

# NOTE(review): this progress bar is started but never advanced in the
# visible code; it is kept so that the `pb` name stays defined.
pb = ProgressBar(worksum=10000000)
pb.startjob()

# str.split() without arguments already ignores leading/trailing whitespace
# (newlines included), so no separate strip("\n") is needed.
pos_lines = [
    [token.lower() for token in sentence.split()]
    for sentence in data_train[data_train['label'] == 1].sentence.values
]
pess_lines = [
    [token.lower() for token in sentence.split()]
    for sentence in data_train[data_train['label'] == 0].sentence.values
]

In [None]:
# Number of tokenised sentences with label 1 and with label 0.
len(pos_lines), len(pess_lines)

In [None]:
# Flatten the tokenised sentences into one list of words: everything from
# pos_lines first, then everything from pess_lines.
words = []
for tokenised_sentences in (pos_lines, pess_lines):
    for sent in tokenised_sentences:
        words.extend(sent)

In [None]:
from collections import Counter

In [None]:
# The 15 most frequent words together with their counts.
Counter(words).most_common(15)

In [None]:
def addchar(what2ind,ind2what,char):
    """Register ``char`` in a pair of item<->index lookup tables.

    A new item gets the next free index, i.e. the current size of
    ``what2ind``. It is stored as ``what2ind[char] = index`` and
    ``ind2what[index] = char``. Items already present in ``what2ind`` are
    left untouched. Both mappings are modified in place; nothing is returned.
    """
    if char not in what2ind:
        next_index = len(what2ind)
        ind2what[next_index] = char
        what2ind[char] = next_index

In [None]:
# TODO: the RNN model is still to be done.