In [1]:
import pandas as pd
import numpy as np
import spacy
import codecs
import io
import csv
import scattertext as st
import requests
import tarfile
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` width to 98%.
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
def _version_tuple(version):
    """Turn a dotted version string into a tuple of ints for numeric comparison.

    Only the leading decimal digits of each dot-separated piece are used, and
    parsing stops at the first piece with no leading digits, so a string such
    as '0.1.0rc1' becomes (0, 1, 0).
    """
    numbers = []
    for piece in version.split('.'):
        digits = ''
        for ch in piece:
            if not ch.isdecimal():
                break
            digits += ch
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


# Compare numerically: plain string comparison is lexicographic, so a release
# like '0.0.10' would wrongly sort below '0.0.2.19'.
assert _version_tuple(st.__version__) >= (0, 0, 2, 19), (
    'scattertext >= 0.0.2.19 is required, found ' + st.__version__
)

From: http://web.eecs.umich.edu/~mihalcea/downloads.html

Open-Domain Deception
This is a crowdsourced deception dataset consisting of short open-domain truths and lies from 512 users. Seven lies and seven truths are provided for each user. The dataset also includes users' demographic information, such as gender, age, country of origin, and education level. download (August 27, 2015)

Veronica Perez-Rosas and Rada Mihalcea, Experiments in Open Domain Deception Detection, in Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2015), Lisbon, Portugal, September 2015.


In [3]:
# Download the Open-Domain Deception archive and open it as an in-memory tarball.
dataset_url = 'http://web.eecs.umich.edu/~mihalcea/downloads/openDeception.2015.tar.gz'
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(dataset_url, timeout=60)
# Fail here on an HTTP error instead of handing an error page to tarfile.
response.raise_for_status()
tarball = tarfile.open(mode="r:gz", fileobj=io.BytesIO(response.content))

In [42]:
# work with oddly formatted csv
def _parse_deception_row(row):
    """Convert one csv-split line of the dataset into a record dict.

    Field positions as read by this code: id, gender, age, education,
    country, then one or more text fields, and finally the class label.
    An education value that starts with a single quote may contain commas,
    in which case csv has split it across several fields; those fields are
    re-joined up to the one ending with a single quote, and the surrounding
    quotes are stripped.

    Parameters
    ----------
    row : list of str
        The fields of a single line as produced by ``csv.reader``.

    Returns
    -------
    dict
        Keys: 'id', '_gender', 'age', 'education', 'country', 'text', 'class'.

    Raises
    ------
    IndexError
        If the row has fewer fields than the layout above requires.
    """
    record = {'id': row[0], '_gender': row[1], 'age': row[2]}
    country_idx = 4
    # Slices ([:1], [-1:]) instead of indexes so empty fields do not raise.
    if row[3][:1] == "'":
        pieces = []
        for piece in row[3:]:
            pieces.append(piece)
            if piece[-1:] == "'":
                break
            country_idx += 1
        record['education'] = ','.join(pieces)[1:-1]
    else:
        # row[3] is already the full string; joining it would interleave
        # commas between its characters.
        record['education'] = row[3]
    record['country'] = row[country_idx]
    # Everything between the country and the final label is document text.
    record['text'] = '\n'.join(row[country_idx + 1:-1])
    record['class'] = row[-1]
    return record


raw = tarball.extractfile('OpenDeception/7Truth7LiesDataset.csv')
data = []
# [1:] drops the first line (presumably the header row -- confirm against the file).
for line in raw.read().decode('utf8').split('\n')[1:]:
    parsed = list(csv.reader(io.StringIO(line)))
    # Skip blank lines, including ones csv reports as a single empty row.
    if not parsed or not parsed[0]:
        continue
    data.append(_parse_deception_row(parsed[0]))
df = pd.DataFrame(data)

In [43]:
def _unescape_and_parse(text):
    """Decode backslash escape sequences in ``text`` and parse the result.

    The text is encoded to UTF-8 bytes, run through ``codecs.escape_decode``,
    decoded back to ``str`` and handed to scattertext's whitespace parser.
    """
    unescaped_bytes = codecs.escape_decode(bytes(text, "utf-8"))[0]
    return st.whitespace_nlp_with_sentences(unescaped_bytes.decode("utf-8"))


df['parse'] = df.text.apply(_unescape_and_parse)

  if __name__ == '__main__':


In [6]:
# Build a corpus from the parsed documents, categorised by the 'class' column.
corpus_builder = st.CorpusFromParsedDocuments(
    df,
    category_col='class',
    parsed_col='parse',
)
corpus = corpus_builder.build()

In [23]:
# Render the frequency explorer for the 'lie' category, scoring terms with
# log-odds ratio under an uninformative Dirichlet prior.
log_odds_scorer = st.LogOddsRatioUninformativeDirichletPrior()
html = st.produce_frequency_explorer(
    corpus,
    'lie',
    term_scorer=log_odds_scorer,
    grey_threshold=10,
    censor_points=False,
)

In [24]:
# Write the explorer HTML to disk and embed it in the notebook.
file_name = 'lie_vs_truth.html'
# The context manager closes (and flushes) the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1600, height=700)

In [67]:
# Normalise the free-text country field: lower-case it and delete periods,
# spaces and apostrophes so spelling variants collapse to a single token.
_country_noise = str.maketrans('', '', ". '")
df['countryfmt'] = df.country.apply(lambda name: name.lower().translate(_country_noise))

# Keep only rows whose normalised country matches one of these spellings.
accepted_countries = [
    'usa', 'unitedstates', 'unitedstatesofamerica', 'us', 'america',
    'canada', 'uk', 'unitedkingdom', 'england',
]
us_df = df[df.countryfmt.isin(accepted_countries)]

# Show total rows next to rows with a non-null parse.
selected_parses = us_df.parse
len(selected_parses), len(selected_parses.dropna())

(5306, 5306)

In [68]:
# Combine gender and class into one category label, e.g. 'Female truth'.
# assign() returns a new frame, so this does not write into a filtered slice
# of df (which raises SettingWithCopyWarning) and the cell can be re-run.
us_df = us_df.assign(category=us_df['_gender'] + ' ' + us_df['class'])

# Build the corpus over the combined categories and reduce it to unigrams.
four_square_corpus = (
    st.CorpusFromParsedDocuments(us_df, category_col='category', parsed_col='parse')
    .build()
    .get_unigram_corpus()
)

In [69]:
# Labels for the four-square plot; the keys are the ones passed to
# st.FourSquare, and empty strings leave a position unlabelled.
quadrant_labels = {
    'a': '',
    'b': '',
    'not_a_and_not_b': 'Male',
    'a_and_b': 'Female',
    'a_and_not_b': 'Truth',
    'b_and_not_a': 'Lie',
    'not_a': '',
    'not_b': '',
}

# Contrast the gender/class categories using rank-difference scoring.
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['Female truth'],
    category_b_list=['Female lie'],
    not_category_a_list=['Male lie'],
    not_category_b_list=['Male truth'],
    scorer=st.RankDifference(),
    labels=quadrant_labels,
)

In [80]:
# Per-document metadata line: "<id>, <country>, Age: <age>".
display_df = four_square_corpus.get_df()
id_and_country = display_df['id'] + ', ' + display_df['country']
meta = id_and_country + ', Age: ' + display_df['age']

# Render the four-square explorer with the metadata attached to each document.
explorer_options = dict(
    x_label='Truth-Lie',
    y_label='Male-Female',
    use_full_doc=True,
    minimum_term_frequency=2,
    metadata=meta,
)
html = st.produce_four_square_explorer(four_square=four_square, **explorer_options)

In [81]:
# Write the four-square explorer HTML to disk and embed it in the notebook.
file_name = 'lie_vs_truth_four_square.html'
# The context manager closes (and flushes) the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1600, height=700)