# Using Scattertext to Examine Deceptive Writing
## Jason Kessler
## PSL Talk, Dec 14 2018

Link to Scattertext documentation: https://github.com/jasonkessler/scattertext

### Data

From: http://web.eecs.umich.edu/~mihalcea/downloads.html

*Open-Domain Deception*

This is a crowdsourced deception dataset consisting of short open domain truths and lies from 512 users. Seven lies and seven truths are provided for each user. The dataset also includes user's demographic information, such as gender, age, country of origin, and education level. download (August 27, 2015)

Veronica Perez-Rosas and Rada Mihalcea, Experiments in Open Domain Deception Detection, in Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2015), Lisbon, Portugal, September 2015.

*Real-life Deception*

A multimodal dataset consisting of real-life deception: deceptive and truthful trial testimonies, manually transcribed and annotated. The dataset includes 121 short videos, along with their transcriptions and gesture annotations. download (June 15, 2016)

In [1]:
import pandas as pd
import numpy as np
import spacy
import codecs
import io
import csv
import scattertext as st
import requests
import zipfile
import tarfile
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline 

In [3]:
# Fetch the Open-Domain Deception archive and open it straight from memory.
r = requests.get('http://web.eecs.umich.edu/~mihalcea/downloads/openDeception.2015.tar.gz')
# Surface a failed download as an HTTP error instead of a confusing tarfile error on an error page.
r.raise_for_status()
tarball = tarfile.open(mode="r:gz", fileobj=io.BytesIO(r.content))

In [4]:
# work with oddly formatted csv
def _parse_open_deception_row(row):
    """Turn the csv-split fields of one data line into a record dict.

    The number of fields varies per line: an education value that contains
    commas is wrapped in single quotes and arrives split over several fields,
    and every field between the country and the final label is treated as text.

    Parameters
    ----------
    row : list of str
        Fields of one line as produced by ``csv.reader``.

    Returns
    -------
    dict
        Keys ``id``, ``gender``, ``age``, ``education``, ``country``, ``text``
        and ``class``, all holding strings.
    """
    record = {'id': row[0], 'gender': row[1], 'age': row[2]}
    country_idx = 4
    if row[3].startswith("'"):
        # Re-assemble a single-quoted education value that the csv reader split on commas.
        pieces = []
        for field in row[3:]:
            pieces.append(field)
            if field.endswith("'"):
                break
            country_idx += 1
        # Strip the surrounding single quotes.
        record['education'] = ','.join(pieces)[1:-1]
    else:
        # An unquoted value is a single field; joining the string itself would
        # insert a comma between every character.
        record['education'] = row[3]
    record['country'] = row[country_idx]
    record['text'] = '\n'.join(row[country_idx + 1:-1])
    record['class'] = row[-1]
    return record


raw = tarball.extractfile('OpenDeception/7Truth7LiesDataset.csv')
data = []
# The first line is skipped (presumably the header row).
for line in raw.read().decode('utf8').split('\n')[1:]:
    parsed = list(csv.reader(io.StringIO(line)))
    if not parsed or not parsed[0]:
        # Blank line: nothing to record.
        continue
    data.append(_parse_open_deception_row(parsed[0]))
df = pd.DataFrame(data)

In [5]:
# Run each statement through codecs.escape_decode (presumably to resolve literal
# backslash escapes in the raw text -- confirm) and then tokenize it with
# scattertext's whitespace-based parser.
df['parse'] = df['text'].apply(
    lambda statement: st.whitespace_nlp_with_sentences(
        codecs.escape_decode(statement.encode("utf-8"))[0].decode("utf-8")
    )
)

  """Entry point for launching an IPython kernel.


In [6]:
# Build the corpus from the pre-parsed documents, using the `class` column
# ('lie' / 'truth') as the category of each document.
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='class',
    parsed_col='parse',
).build()

In [7]:
def get_metadata(corpus):
    """Return one label per document of the form 'gender, age, country; education'.

    Parameters
    ----------
    corpus : object exposing ``get_df()``
        The data frame it returns must have ``gender``, ``age``, ``country``
        and ``education`` columns.

    Returns
    -------
    The result of a row-wise ``DataFrame.apply`` over ``corpus.get_df()``.
    """
    def describe(doc):
        # Only age is passed through str(); the other fields must already be strings.
        demographics = ', '.join([doc['gender'], str(doc['age']), doc['country']])
        return demographics + '; ' + doc['education']

    return corpus.get_df().apply(describe, axis=1)

In [8]:
# Show the categories obtained when re-labelling by gender next to the corpus's own categories.
(
    corpus.recategorize(corpus.get_df()['gender']).get_categories(),
    corpus.get_categories(),
)

(['Male', 'Female'], ['lie', 'truth'])

In [9]:
# Lie-vs-truth explorer: terms are scored with RankDifference for the 'lie'
# category, and each document is labelled with its author's demographics.
html = st.produce_scattertext_explorer(
    corpus,
    category='lie',
    term_scorer=st.RankDifference(),
    transform=st.Scalers.percentile_dense,
    metadata=get_metadata(corpus),
)
file_name = 'lie_vs_truth.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)

In [10]:
# Same documents, re-labelled by the author's gender instead of lie/truth.
gender_corpus = corpus.recategorize(df['gender'])
html = st.produce_scattertext_explorer(
    gender_corpus,
    category='Female',
    not_categories=['Male'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.percentile_dense,
    metadata=get_metadata(gender_corpus),
)
file_name = 'female_vs_male.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)

In [11]:
# Combine gender and veracity into one label per document, e.g. 'Female truth'.
df['category'] = df['gender'] + ' ' + df['class']
# Re-label the corpus with the combined category and reduce it to unigrams.
four_square_corpus = (
    corpus
    .recategorize(df['category'])
    .get_unigram_corpus()
)

In [12]:
# Four-way contrast over gender x veracity: the a/b lists hold the female
# categories, the not_a/not_b lists the male ones; `labels` names each
# quadrant and axis combination.
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['Female truth'],
    category_b_list=['Female lie'],
    not_category_a_list=['Male lie'],
    not_category_b_list=['Male truth'],
    scorer=st.RankDifference(),
    labels=dict(
        a='Female-Specific Truth',
        b='Female-Specific Lie',
        not_a_and_not_b='Male',
        a_and_b='Female',
        a_and_not_b='Truth',
        b_and_not_a='Lie',
        not_a='Male-Specific Lie',
        not_b='Male-Specific Truth',
    ),
)

In [13]:
# Render the gender-by-veracity four-square explorer; every document is
# labelled with its author's demographics via get_metadata.
html = st.produce_four_square_explorer(
    four_square=four_square,
    x_label='Truth-Lie',
    y_label='Male-Female',
    use_full_doc=True,
    minimum_term_frequency=2,
    metadata=get_metadata(four_square_corpus),
)

In [14]:
file_name = 'lie_vs_truth_four_square.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)

In [15]:
# Fetch the Real-life Deception archive; the next cell opens it from memory.
r = requests.get('http://web.eecs.umich.edu/~mihalcea/downloads/RealLifeDeceptionDetection.2016.zip')
# Surface a failed download as an HTTP error instead of a confusing zipfile error on an error page.
r.raise_for_status()

In [16]:
zf = zipfile.ZipFile(io.BytesIO(r.content))

# Gesture annotations; the archive member is closed once pandas has read it.
with zf.open('Real-life_Deception_Detection_2016/Annotation/All_Gestures_Deceptive and Truthful.csv') as annotations_file:
    annotations_df = pd.read_csv(annotations_file)
# Drop the file extension so ids match the transcript ids built below.
annotations_df['id'] = annotations_df['id'].apply(lambda x: x.split('.')[0])

# Transcripts are the .txt members, excluding README files and members whose
# path starts with '__'.
transcript_names = [
    fn for fn in zf.namelist()
    if fn.endswith('.txt') and not fn.startswith('__') and not fn.endswith('README.txt')
]
# ZipFile.read returns the member's bytes without leaving an open handle behind.
court_df = pd.DataFrame([
    {'filename': fn,
     'id': fn.split('/')[-1].split('.')[0],
     'text': zf.read(fn)}
    for fn in transcript_names
])
# `text` holds raw bytes, so decode before tokenizing.
court_df['parse'] = court_df.text.apply(lambda x: st.whitespace_nlp_with_sentences(x.decode('utf8')))
# The label is taken from the file name: ids containing '_lie_' are lies, all others truths.
court_df['class'] = court_df['id'].apply(lambda x: 'lie' if '_lie_' in x else 'truth')

In [19]:
# Unigram corpus over the trial transcripts, categorised as 'lie' / 'truth'.
court_corpus = st.CorpusFromParsedDocuments(
    court_df, category_col='class', parsed_col='parse'
).build().get_unigram_corpus()
html = st.produce_scattertext_explorer(
    court_corpus,
    category='truth',
    not_categories=['lie'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.percentile_dense,
)
file_name = 'court_scattertplot.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)

In [21]:
# Feature extractor based on the General Inquirer lexicon (going by the scattertext class name).
# Later cells pass it as `feats_from_spacy_doc` and read its term lists and definitions.
general_inquirer = st.FeatsFromGeneralInquirer()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [25]:
# Court transcripts again, this time with features produced by `general_inquirer`.
gi_court_corpus = st.CorpusFromParsedDocuments(
    court_df,
    category_col='class',
    parsed_col='parse',
    feats_from_spacy_doc=general_inquirer,
).build().get_unigram_corpus()
html = st.produce_frequency_explorer(
    gi_court_corpus,
    category='truth',
    not_categories=['lie'],
    term_scorer=st.RankDifference(),
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=general_inquirer.get_top_model_term_lists(),
    metadata_descriptions=general_inquirer.get_definitions(),
    grey_threshold=0,
)
file_name = 'gi_court_scattertplot.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)

In [129]:
# NOTE(review): this cell used to pass `feat_builder`, a name that is not defined
# anywhere in this notebook, so it failed with NameError on a fresh kernel.
# `general_inquirer` is the only feature extractor the notebook creates and is
# substituted here -- confirm that this was the intended extractor.
four_square_corpus = st.CorpusFromParsedDocuments(
    court_df,
    category_col='class',
    parsed_col='parse',
    feats_from_spacy_doc=general_inquirer,
).build().get_unigram_corpus()


In [117]:
# Keep only the crowdsourced (mturk) statements whose author gave a US country.
# No cell in this notebook creates `countryfmt`, so derive it when it is missing.
# NOTE(review): the normalisation (lower-case, letters only) is inferred from the
# spellings matched below -- confirm against the original preprocessing.
if 'countryfmt' in df.columns:
    country_key = df['countryfmt']
else:
    country_key = df['country'].str.lower().str.replace(r'[^a-z]', '', regex=True)
mturk_df = df[country_key.isin(['usa', 'unitedstates', 'unitedstatesofamerica', 'us', 'america'])]

# ignore_index gives every row a unique label; the two frames may share integer
# labels, and label-based selection on a duplicated index returns each matching
# row once per duplicate.
joint_df = pd.concat(
    [court_df.assign(real='courtroom'), mturk_df.assign(real='mturk')],
    sort=True,
    ignore_index=True,
)
joint_df['class'] = joint_df['class'].replace({'lie': 'deceptive', 'truth': 'truthful'})
# Drop documents without a parse; a boolean mask cannot duplicate rows.
joint_df = joint_df[joint_df['parse'].notna()].copy()
# e.g. 'deceptive courtroom', 'truthful mturk'
joint_df['category'] = joint_df['class'] + ' ' + joint_df['real']

In [119]:
# Unigram corpus over the combined courtroom + mturk documents, categorised by
# veracity and source (e.g. 'deceptive mturk').
four_square_corpus = st.CorpusFromParsedDocuments(
    joint_df, category_col='category', parsed_col='parse'
).build().get_unigram_corpus()
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['deceptive mturk'],
    category_b_list=['deceptive courtroom'],
    not_category_a_list=['truthful courtroom'],
    not_category_b_list=['truthful mturk'],
    scorer=st.RankDifference(),
    # The single-category corners are left unlabelled; only the axis
    # combinations get names.
    labels={'a': '',
            'b': '',
            'not_a_and_not_b': 'Truth',
            'a_and_b': 'Lie',
            'a_and_not_b': 'MTurk',
            'b_and_not_a': 'Court Room',
            'not_a': '',
            'not_b': '',
            }
)
display_df = four_square_corpus.get_df()
# Label each document with its combined category. This series was computed but
# never handed to the explorer before -- NOTE(review): confirm it was meant as metadata.
meta = display_df['category']
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='Mturk-Court',
                                       y_label='Truth-Lie',
                                       use_full_doc=True,
                                       minimum_term_frequency=2,
                                       censor_points=False,
                                       metadata=meta)

In [120]:
file_name = 'mturk_vs_courth_four_square.html'
# The context manager flushes and closes the file before the IFrame loads it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=700)