In [7]:
import pandas as pd
import numpy as np
import spacy
import codecs
import io
import csv
import scattertext as st
import requests
import tarfile
import zipfile
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline 

From: http://web.eecs.umich.edu/~mihalcea/downloads.html

Real-life Deception

A multimodal dataset consisting of real-life deception: deceptive and truthful trial testimonies, manually transcribed and annotated. The dataset includes 121 short videos, along with their transcriptions and gesture annotations. download (June 15, 2016)

Veronica Perez-Rosas, Mohamed Abouelenien, Rada Mihalcea, Mihai Burzo, Deception Detection using Real-life Trial Data, in Proceedings of the ACM International Conference on Multimodal Interaction (ICMI 2015), Seattle, November 2015.

In [2]:
assert st.__version__ >= '0.0.2.19'

In [4]:
r = requests.get('http://web.eecs.umich.edu/~mihalcea/downloads/RealLifeDeceptionDetection.2016.zip')

In [11]:
zf = zipfile.ZipFile(io.BytesIO(r.content))

In [67]:
annotations_df = pd.read_csv(zf.open('Real-life_Deception_Detection_2016/Annotation/All_Gestures_Deceptive and Truthful.csv'))
annotations_df['id'] = annotations_df['id'].apply(lambda x: x.split('.')[0])

In [68]:
df = pd.DataFrame([{'filename': fn, 
                    'id': fn.split('/')[-1].split('.')[0], 
                    'text': zf.open(fn).read()} 
                   for fn in zf.namelist() if fn.endswith('.txt') and not fn.startswith('__') and not fn.endswith('README.txt')])

In [70]:
df['category'] = df.filename.apply(lambda x: x.split('/')[-1].split('_')[1])

In [71]:
df['parse'] = df.text.apply(lambda x: st.whitespace_nlp_with_sentences(x.decode('utf8')))

In [44]:
corpus = st.CorpusFromParsedDocuments(df, category_col='category', parsed_col='parse').build().get_unigram_corpus()

In [97]:
priors = st.PriorFactory(corpus).use_general_term_frequencies().get_priors()
scorer = st.LogOddsRatioInformativeDirichletPrior(priors, 
                                                  df.parse.apply(lambda x: sum([len(s) for s in x.sents])).mean(), 
                                                  'word')

In [98]:
html = st.produce_frequency_explorer(corpus, 'truth', term_scorer = scorer, grey_threshold=1.96, censor_points=True)

In [99]:
file_name = 'real_deception.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1600, height=700)

In [72]:
full_df = pd.merge(df, annotations_df, on='id')

In [75]:
gestures = [c for c in annotations_df.columns if c not in('id', 'class')]

In [79]:
full_df

Unnamed: 0,filename,id,text,category,parse,OtherGestures,Smile,Laugh,Scowl,otherEyebrowMovement,...,forwardHead,downRHead,singleHand,bothHands,otherHandM,complexHandM,sidewaysHand,downHands,upHands,class
0,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_001,b'No sir I did not. I absolutely did not. No s...,lie,No sir I did not. I absolutely did not. No sir...,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,deceptive
1,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_002,"b""... and she approached me, and at that time ...",lie,"... and she approached me, and at that time th...",1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,deceptive
2,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_003,"b'No sir I was not, not at all.'",lie,"No sir I was not, not at all.",1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,deceptive
3,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_004,"b""He had told me that he had had a dream that,...",lie,"He had told me that he had had a dream that, a...",1,0,0,0,1,...,0,1,0,0,1,0,0,0,0,deceptive
4,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_005,"b""And he told me that, ammm \xe2\x80\xa6 he wa...",lie,"And he told me that, ammm … he was trying to f...",1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,deceptive
5,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_006,"b""No, I didn't. The last time I saw her was wh...",lie,"No, I didn't. The last time I saw her was when...",1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,deceptive
6,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_007,"b""We had some drinks at the bar, maybe one ......",lie,"We had some drinks at the bar, maybe one ... t...",1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,deceptive
7,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_008,"b""Ahhh ... I didn't go to the dance club as a ...",lie,Ahhh ... I didn't go to the dance club as a dr...,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,deceptive
8,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_009,"b""Ahhh ... When we got to the hotel, I remembe...",lie,"Ahhh ... When we got to the hotel, I remember,...",1,0,0,0,1,...,1,0,1,0,0,1,0,0,0,deceptive
9,Real-life_Deception_Detection_2016/Transcripti...,trial_lie_010,"b""I have no idea. Uh, I was at work so I was p...",lie,"I have no idea. Uh, I was at work so I was pre...",0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,deceptive


In [83]:
gesture_d = {}
for gesture in gestures:
    full_df[gesture] = full_df[gesture].astype(str)
    if len(full_df[gesture].value_counts()) < 2: continue
    gesture_corpus = st.CorpusFromParsedDocuments(full_df, category_col=gesture, parsed_col='parse').build()
    gdf = gesture_corpus.get_term_freq_df()
    gdf['score'] = gesture_corpus.get_scaled_f_scores('1')
    gesture_d[gesture] = gdf.sort_values(by='score').index[:10]

In [86]:
pd.DataFrame(gesture_d).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Close-BE,you,so,they,what,have,at,were,don,don t,didn
Close-R,and he,that s,but,said,if,it s,are,i had,i didn,on the
Frown,then,so i,and then,it was,didn,didn t,so,we,really,in the
Laugh,that,of,t,in,um,me,you,so,just,had
OtherEyeMovements,yes,two,she was,years,through,um i,um,from,i did,she
OtherGestures,you,work,you know,like that,going to,i did,would,had a,they were,who
Raise,me,to the,his,even,get,them,never,it s,you,they were
Scowl,her,it was,i was,i didn,time,she,and he,his,in,i don
SideTurn,he,we,not,him,as,don t,don,he was,did,her
Smile,out,to the,for,uh,were,they,because,been,get,back


In [111]:
r = requests.get('http://web.eecs.umich.edu/~mihalcea/downloads/openDeception.2015.tar.gz')
tarball = tarfile.open(mode="r:gz", fileobj = io.BytesIO(r.content))
# work with oddly formatted csv
raw = tarball.extractfile('OpenDeception/7Truth7LiesDataset.csv')
data = []
for i, line in enumerate(raw.read().decode('utf8').split('\n')[1:]):
    row = list(csv.reader(io.StringIO(line)))
    if(len(row)) == 0: continue
    row = row[0]
    d = {}
    d['id'] = row[0]
    d['_gender'] = row[1]
    d['age'] = row[2]
    nexti = 4
    if row[3][0] == "'":
        edu = []
        for r in row[3:]:
            edu.append(r)
            if r[-1] == "'": break
            nexti += 1
        d['education'] = ','.join(edu)[1:-1]
    else:
        d['education'] = ','.join(row[3])
    d['country'] = row[nexti]
    d['text'] = '\n'.join(row[nexti+1:-1])
    d['class'] = row[-1]
    data.append(d)
artificial_df = pd.DataFrame(data)
artificial_df['countryfmt'] = artificial_df.country.apply(lambda x: x.lower().replace('.','').replace(' ','').replace("'",''))
us_artificial_df=artificial_df[artificial_df.countryfmt.isin(['usa', 'unitedstates', 'unitedstatesofamerica', 'us', 'america'])].dropna().assign(real='artificial')
us_artificial_df['parse'] = us_artificial_df.text.apply(lambda x: st.whitespace_nlp_with_sentences(codecs.escape_decode(bytes(x, "utf-8"))[0].decode("utf-8")))



In [143]:
joint_df = pd.concat([full_df.assign(real='courtroom'),  us_artificial_df.assign(real='mturk')])

In [144]:
joint_df.to_csv('joint_deception.csv.gz', compression='gzip', index=False)

In [145]:
joint_df['class'] = joint_df['class'].replace({'lie': 'deceptive', 'truth': 'truthful'})
joint_df['category'] = joint_df['class'] + ' ' + joint_df['real']

In [146]:
joint_df.category.value_counts()

deceptive mturk        2569
truthful mturk         2569
deceptive courtroom      61
truthful courtroom       60
Name: category, dtype: int64

In [148]:
four_square_corpus = st.CorpusFromParsedDocuments(joint_df, category_col='category', parsed_col='parse').build().get_unigram_corpus()

In [154]:
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['deceptive mturk'],
    category_b_list=['deceptive courtroom'],
    not_category_a_list=['truthful courtroom'],
    not_category_b_list=['truthful mturk'],
    scorer=st.RankDifference(),
    labels={'a': '',
            'b': '',
            'not_a_and_not_b': 'Truth',
            'a_and_b': 'Lie',
            'a_and_not_b': 'MTurk',
            'b_and_not_a': 'Court Room',
            'not_a': '',
            'not_b': '',
            }
)

In [155]:
display_df = four_square_corpus.get_df()
meta = display_df['category']
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='Artificial-Real',
                                       y_label='Truth-Lie',
                                       use_full_doc=True,
                                       minimum_term_frequency=2,
                                       metadata=meta, censor_points=False)

In [156]:
file_name = 'real_vs_fake_lie_vs_truth_four_square.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1600, height=700)