In [13]:
import random
import pandas as pd
import numpy as np

from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
import nltk.sentiment.util as sentimentutils
from sklearn import svm
from sklearn import linear_model

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

import sklearn.metrics as metrics

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import shuffle

import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

In [2]:
read_file = lambda x : [tuple(line.strip().split("\t")) for line in open(x).readlines() if len(line.strip()) > 0]
assign_class = lambda x, y: [l + (y, ) for l in x]

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

# Reading Data for Sentiment Analysis

In [3]:
pos_class, neg_class = 'P', 'N'
pos_lines = assign_class(read_file("hotelPosT-train.txt"), pos_class)
neg_lines = assign_class(read_file("hotelNegT-train.txt"), neg_class)

sentiment_lines = pos_lines + neg_lines
raw_sentiment_pd = pd.DataFrame(sentiment_lines, columns = ["id", "sentence", "class"])
raw_sentiment_pd = shuffle(raw_sentiment_pd)

# Pipeline for Sentiment Analysis

In [4]:
class SentimentTokenizer(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, sentiments):
        features = []
        stemmer = PorterStemmer()
        for sentiment in sentiments:

            feature = []
            sentences = nltk.sent_tokenize(sentiment)

            for sentence in sentences:
                tokens = [
                    stemmer.stem(token)
                    for token in nltk.word_tokenize(sentence)
                ]
                feature.extend([
                    token.lower()
                    for token in sentimentutils.mark_negation(
                        tokens, double_neg_flip=True) if token.isalpha()
                ])

            features.append(" ".join(feature))
        return features

In [5]:
sentiment_pipeline = Pipeline([
    ('tokenizer', SentimentTokenizer()),
    ('bag_of_words', TfidfVectorizer(min_df = 0.0, max_df = 1.0)),
    ("clf", BernoulliNB())
])

sentiments = raw_sentiment_pd["sentence"].tolist()
sentiment_labels = raw_sentiment_pd["class"].tolist()

kf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 5)

accuracy = []
for train_idx, test_idx in kf.split(sentiments, sentiment_labels):
    X_train, X_test = [sentiments[i]
                       for i in train_idx], [sentiments[i] for i in test_idx]
    y_train, y_test = [sentiment_labels[i]
                       for i in train_idx], [sentiment_labels[i] for i in test_idx]
    sentiment_pipeline.fit(X_train, y_train)
    prediction = sentiment_pipeline.predict(X_test)
    accuracy.append(np.sum(y_test == prediction) * 1.0 / len(y_test))
print np.mean(accuracy)

sentiment_classifier = sentiment_pipeline.fit(sentiments, sentiment_labels)

0.936699857752


# Reading Data for Deception Detection

In [6]:
true_class, false_class = 'T', 'F'

#  Reading the data and converting into DataFrame

true_lines = assign_class(read_file("./hotelT-train.txt"), true_class)
false_lines = assign_class(read_file("./hotelF-train.txt"), false_class)

all_lines = true_lines + false_lines
raw_data_pd = pd.DataFrame(all_lines, columns=["id", "sentence", "class"])
raw_data_pd = shuffle(raw_data_pd)

# Pipeline for Deception Detection

In [43]:
class OpinionExtractor(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, opinions):
        features = np.recarray(
            shape=(len(opinions), ),
            dtype=[('opinion', object), ('tokens_per_sent', object), ('pos_per_sent', object),
                   ('tokens', object), ('pos', object), ('sent_seg_opinion', object),
                   ('sent_seg_opinion_token', object), ('sent_seg_opinion_token_pos', object)])
        for i, opinion in enumerate(opinions):
            features['opinion'][i] = opinion
            features['sent_seg_opinion'][i] = nltk.sent_tokenize(opinion)
            features['sent_seg_opinion_token'][i] = [
                map(str.lower, nltk.word_tokenize(sent))
                for sent in features['sent_seg_opinion'][i]
            ]
            features['sent_seg_opinion_token_pos'][i] = nltk.pos_tag_sents(
                features['sent_seg_opinion_token'][i])
            features['tokens_per_sent'][i], features['pos_per_sent'][i] = [], []

            for sent in features['sent_seg_opinion_token_pos'][i]:
                features['tokens_per_sent'][i].append(" ".join(
                    [token for token, pos in sent if token.isalpha()]))
                features['pos_per_sent'][i].append(" ".join(
                    [pos for token, pos in sent]))

            features['tokens'][i] = " ".join(features['tokens_per_sent'][i])
            features['pos'][i] = " ".join(features['pos_per_sent'][i])
        return features


class OpinionStats(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, opinions_sents):
        stats = [{
            'length': sum(map(len, opinion_sents)),
            'num_sentences': len(opinion_sents)
        } for opinion_sents in opinions_sents]
        return stats


class SentimentExtractor(BaseEstimator, TransformerMixin):
        
    def fit(self, sentiments,labels):
        self.classifier = sentiment_pipeline.fit(sentiments, labels)
        return self

    def transform(self, opinions):
        return [{"sentiment": sentiment} for sentiment in self.classifier.predict(opinions)]

cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose = 0)
pipeline = Pipeline([
    ('opinions', OpinionExtractor()),
    ('union',
     FeatureUnion(
         transformer_list=[
#              ('word_tokens',
#               Pipeline([('selector', ItemSelector(key='tokens')),
#                         ('tfidf', TfidfVectorizer(stop_words="english")),
#                         ('best', SelectKBest(chi2))])),
             ('pos_tokens',
              Pipeline([
                  ('selector', ItemSelector(key='pos')), 
                  ('tfidf', TfidfVectorizer())
              ])),
#              ('opinion_stats',
#               Pipeline([('selector',ItemSelector(key='sent_seg_opinion')),
#                         ('stat', OpinionStats()), ('vect', DictVectorizer())])),
             ('sentiments',
              Pipeline([('selection', ItemSelector(key='opinion')),
                  ('sentiment', SentimentExtractor()), ('vect', DictVectorizer())
              ]))
         ],
         transformer_weights={
#              'word_tokens': 0.0,
             'pos_tokens': 1.0,
#              'opinion_stats': 1.0,
             'sentiments' : 1.0
         },
     )),
    ('clf', svm.SVC()),
], memory = memory)

In [50]:
opinions = raw_data_pd["sentence"].tolist()
labels = raw_data_pd["class"].tolist()

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state = 5)
accuracy = []
models = []

for train_idx, test_idx in kf.split(opinions, labels):
    X_train, X_test = [opinions[i]
                       for i in train_idx], [opinions[i] for i in test_idx]
    y_train, y_test = [labels[i]
                       for i in train_idx], [labels[i] for i in test_idx]
    parameters = {
#         'union__word_tokens__tfidf__ngram_range': ((2,2),),
#         'union__word_tokens__tfidf__max_df' : (1.0,),
#         'union__word_tokens__tfidf__min_df' : (0.0,),
#         'union__word_tokens__tfidf__use_idf': (True,),
#         'union__word_tokens__tfidf__sublinear_tf': (True,),
#         'union__word_tokens__best__k': (5, 10),
        'union__pos_tokens__tfidf__ngram_range': ((1,3),),
        'union__pos_tokens__tfidf__max_df' : (1.0,),
        'union__pos_tokens__tfidf__min_df' : (0.0,),
        'union__pos_tokens__tfidf__use_idf': (True,),
        'union__pos_tokens__tfidf__sublinear_tf': (True,)
    }
    gs_classifier = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring = 'accuracy')
    gs_classifier = gs_classifier.fit(X_train, y_train)
    prediction = gs_classifier.predict(X_test)
    models.append(gs_classifier)
    accuracy.append(np.sum(y_test == prediction) * 1.0 / len(y_test))
print np.mean(accuracy)
rmtree(cachedir)

0.52342192691


In [45]:
accuracy, np.mean(accuracy)

([0.58139534883720934,
  0.44186046511627908,
  0.53488372093023251,
  0.52325581395348841,
  0.5357142857142857],
 0.52342192691029898)

In [49]:
max_df = []
min_df = []
for i, model in enumerate(models):
    print model.best_score_, accuracy[i]
    for param_name in sorted(parameters.keys()):
        print "%s: %r" % (param_name, model.best_params_[param_name])
print np.mean(max_df)
print np.mean(min_df)

0.479532163743 0.581395348837
union__pos_tokens__tfidf__max_df: 1.0
union__pos_tokens__tfidf__min_df: 0.0
union__pos_tokens__tfidf__ngram_range: (1, 2)
union__pos_tokens__tfidf__sublinear_tf: True
union__pos_tokens__tfidf__use_idf: True
0.520467836257 0.441860465116
union__pos_tokens__tfidf__max_df: 1.0
union__pos_tokens__tfidf__min_df: 0.0
union__pos_tokens__tfidf__ngram_range: (1, 2)
union__pos_tokens__tfidf__sublinear_tf: True
union__pos_tokens__tfidf__use_idf: True
0.570175438596 0.53488372093
union__pos_tokens__tfidf__max_df: 1.0
union__pos_tokens__tfidf__min_df: 0.0
union__pos_tokens__tfidf__ngram_range: (1, 2)
union__pos_tokens__tfidf__sublinear_tf: True
union__pos_tokens__tfidf__use_idf: True
0.53216374269 0.523255813953
union__pos_tokens__tfidf__max_df: 1.0
union__pos_tokens__tfidf__min_df: 0.0
union__pos_tokens__tfidf__ngram_range: (1, 2)
union__pos_tokens__tfidf__sublinear_tf: True
union__pos_tokens__tfidf__use_idf: True
0.517441860465 0.535714285714
union__pos_tokens__tfidf