# Anaphora resolution and Negation Handling

In this notebook, we keep pre-processing the data. The main steps are: 

- further __normalization__: assign a unique identifier to each of the main characters of the trial, prefix included (e.g. `MR. GOLDBERG` becomes `MRGOLDBERG`)
- __negation handling__: try to identify negations with a basic tagging method
- __anaphora resolution__: manually resolve anaphora and other references in the text

For more details about the functions used, refer to the [py_functions.py](https://github.com/MiriamGiuliani/OJSimpson-text-mining-trail-transcripts/blob/27c02196bc9ab702c2eac013132ff3b976689beb/py_scripts/py_functions.py) file.

In [1]:
import pandas as pd
import os
import glob
import numpy as np
import nltk
import re
import matplotlib
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
nltk.download('stopwords')
from py_functions import normalize_person_col, person_prefix_normalize, negation_handling, manual_anaphora_norm_dialogs, anaphora_norm_family_members

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\miria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the transcript table (presumably the output of the earlier normalization
# step — the path is relative, so the file must be in the working directory).
df = pd.read_csv('df_normalized.csv')
# Count the missing values in each column
df.isna().sum()

person                0
speech                8
date               1263
time                  0
number_of_words       0
dtype: int64

In [5]:
# Discard the rows that have no speech text and renumber the index from 0,
# then re-count the missing values to confirm that 'speech' is now complete.
df = df.dropna(subset=['speech']).reset_index(drop=True)
df.isna().sum()

person                0
speech                0
date               1263
time                  0
number_of_words       0
dtype: int64

In [6]:
# Fill the missing dates: according to the NA count above, 'date' is the only
# column that still has missing values, and they all belong to one hearing day.
# A boolean mask touches exactly the rows whose date is missing (never a valid
# date lying between two gaps) and is a harmless no-op when the cell is re-run
# after the gaps have already been filled.
missing_date = df['date'].isna()
df.loc[missing_date, 'date'] = 'MARCH 15, 1995'
assert df['date'].notna().all()

In [7]:
# Eliminate non relevant and short strings.
# Each rule is a row-wise mask, so the three rules can be evaluated on the same
# frame and applied in a single, re-runnable filtering step.

# Rule 1: stage directions such as "(WITNESS COMPLIES.)".
# The pattern is a raw string because '\(' is not a valid escape sequence in a
# normal string literal (SyntaxWarning on recent Python versions).
is_witness_complies = df['speech'].str.contains(r'\(WITNESS COMPLIES.\)', na=False)

# Rule 2: utterances shorter than two words, unless they are an objection
# (na=True keeps a missing speech out of this rule, as the original comparison did).
is_objection = df['speech'].str.contains('HEARSAY|OBJECTION', na=True)
is_short_non_objection = ~is_objection & (df['number_of_words'] < 2)

# Rule 3: short (fewer than six words) utterances mentioning a recess.
is_short_recess = df['speech'].str.contains('RECESS', na=False) & (df['number_of_words'] < 6)

rows_to_drop = is_witness_complies | is_short_non_objection | is_short_recess
df = df[~rows_to_drop].reset_index(drop=True)

In [8]:
# Preview the frame after the filtering above (the display is truncated to the first and last rows)
df

Unnamed: 0,person,speech,date,time,number_of_words
0,#THE COURT:,BACK ON THE RECORD IN THE SIMPSON MATTER. MR. ...,"APRIL 13, 1995\n",9:20 A.M.,28
1,#THE COURT:,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MR. G...,"APRIL 13, 1995\n",9:20 A.M.,48
2,#MR. GOLDBERG:,"YOUR HONOR, MY UNDERSTANDING OF THE COURT'S OR...","APRIL 13, 1995\n",9:20 A.M.,65
3,#MR. SCHECK:,"WELL, YOUR HONOR, BEFORE WE LEFT WE GAVE MR. G...","APRIL 13, 1995\n",9:20 A.M.,69
4,#THE COURT:,"WELL, AS I RECALL, AS WE ENDED THE COURT DAY I...","APRIL 13, 1995\n",9:20 A.M.,18
...,...,...,...,...,...
234695,#MR. COCHRAN:,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,"SEPTEMBER 29, 1995",9:04 A.M.,24
234696,#THE COURT:,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","SEPTEMBER 29, 1995",9:04 A.M.,70
234697,#THE COURT:,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,"SEPTEMBER 29, 1995",9:04 A.M.,13
234698,#MS. CLARK:,CAN WE AT SIDEBAR WITHOUT THE COURT REPORTER?,"SEPTEMBER 29, 1995",9:04 A.M.,8


In [9]:
# Speaker groups, identified by the name fragment that appears in the transcript.
# `dream_team`, `prosecution` and `court` are passed to
# manual_anaphora_norm_dialogs further down; `experts` and `witness_match` are
# not referenced by any other cell of this notebook.

# presumably the surnames of the defence lawyers — verify against py_functions.py
dream_team = [
    'SHAPIRO', 'COCHRAN', 'BAILEY', 'DERSHOWITZ', 'KARDASHIAN',
    'HOLLEY', 'DOUGLAS', 'UELMEN', 'SCHECK', 'NEUFELD',
    'BLASIER', 'THOMPSON', 'CHAPMAN', 'CAPLAN',
]

# presumably the surnames of the prosecutors — verify against py_functions.py
prosecution = [
    'CLARK', 'HODGMAN', 'DARDEN', 'KELBERG', 'HARMON', 'LEWIS',
    'GORDON', 'BODIN', 'GOLDBERG', 'YOCHELSON', 'DARREL', 'LYNCH',
]

# Expert witnesses; some entries carry their title prefix, others do not
experts = [
    'CLARKE', 'DR. LAKSHMANAN', 'MR. SIMS', 'MATHESON', 'MAZZOLA',
    'DR. GERDES', 'DR. COTTON', 'DEEDRICK', 'BROCKBANK', 'LEE',
    'DR. WEIR', 'RUBIN', 'BODZIAK', 'RIEDERS', 'SPEED',
    'MARTZ', 'MACDONELL', 'BADEN',
]

# NOTE(review): looks like a marker character used to recognise witness rows — confirm
witness_match = ['[']

court = ['COURT']

# Normalization

In [10]:
# Run normalize_person_col over every utterance and store the result in a new
# column, leaving the original 'speech' text untouched.
df['norm_speech'] = df['speech'].apply(normalize_person_col)

In [11]:
# Rewrite the speaker labels with person_prefix_normalize.
# The 'person' column is overwritten, so this cell works on its own output if
# it is executed a second time without reloading the data.
df['person'] = df['person'].apply(person_prefix_normalize)

In [12]:
# Preview the frame with the normalized 'person' labels and the new 'norm_speech' column
df

Unnamed: 0,person,speech,date,time,number_of_words,norm_speech
0,THECOURT,BACK ON THE RECORD IN THE SIMPSON MATTER. MR. ...,"APRIL 13, 1995\n",9:20 A.M.,28,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...
1,THECOURT,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MR. G...,"APRIL 13, 1995\n",9:20 A.M.,48,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...
2,MRGOLDBERG,"YOUR HONOR, MY UNDERSTANDING OF THE COURT'S OR...","APRIL 13, 1995\n",9:20 A.M.,65,"THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER..."
3,MRSCHECK,"WELL, YOUR HONOR, BEFORE WE LEFT WE GAVE MR. G...","APRIL 13, 1995\n",9:20 A.M.,69,"WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB..."
4,THECOURT,"WELL, AS I RECALL, AS WE ENDED THE COURT DAY I...","APRIL 13, 1995\n",9:20 A.M.,18,"WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ..."
...,...,...,...,...,...,...
234695,MRCOCHRAN,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,"SEPTEMBER 29, 1995",9:04 A.M.,24,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...
234696,THECOURT,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","SEPTEMBER 29, 1995",9:04 A.M.,70,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP..."
234697,THECOURT,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,"SEPTEMBER 29, 1995",9:04 A.M.,13,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...
234698,MSCLARK,CAN WE AT SIDEBAR WITHOUT THE COURT REPORTER?,"SEPTEMBER 29, 1995",9:04 A.M.,8,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?


# Negation Handling

In [13]:
# Run negation_handling over the normalized text and keep the result in its own column.
df['negation_handling'] = df['norm_speech'].apply(negation_handling)

In [14]:
# Preview the frame with the new 'negation_handling' column
df

Unnamed: 0,person,speech,date,time,number_of_words,norm_speech,negation_handling
0,THECOURT,BACK ON THE RECORD IN THE SIMPSON MATTER. MR. ...,"APRIL 13, 1995\n",9:20 A.M.,28,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...
1,THECOURT,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MR. G...,"APRIL 13, 1995\n",9:20 A.M.,48,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...
2,MRGOLDBERG,"YOUR HONOR, MY UNDERSTANDING OF THE COURT'S OR...","APRIL 13, 1995\n",9:20 A.M.,65,"THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER...","THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER..."
3,MRSCHECK,"WELL, YOUR HONOR, BEFORE WE LEFT WE GAVE MR. G...","APRIL 13, 1995\n",9:20 A.M.,69,"WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB...","WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB..."
4,THECOURT,"WELL, AS I RECALL, AS WE ENDED THE COURT DAY I...","APRIL 13, 1995\n",9:20 A.M.,18,"WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ...","WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ..."
...,...,...,...,...,...,...,...
234695,MRCOCHRAN,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,"SEPTEMBER 29, 1995",9:04 A.M.,24,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...
234696,THECOURT,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","SEPTEMBER 29, 1995",9:04 A.M.,70,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP..."
234697,THECOURT,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,"SEPTEMBER 29, 1995",9:04 A.M.,13,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...
234698,MSCLARK,CAN WE AT SIDEBAR WITHOUT THE COURT REPORTER?,"SEPTEMBER 29, 1995",9:04 A.M.,8,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?


# Manual Anaphora Resolution

In [15]:
# Create new column to be filled.
# The integer 0 acts as a "not resolved yet" marker: a later cell looks for the
# rows that still hold 0 and copies the 'negation_handling' text into them.
df['anaphora_solved'] = 0

In [16]:
# Manually normalize text based on sequence of dialogs (for more details see py_functions.py).
# The helper receives the defence, prosecution and court speaker lists and returns
# the frame with 'anaphora_solved' filled in; in the output below, for example,
# 'WE' spoken by MRSCHECK has been replaced by 'MRSCHECK'.
# NOTE(review): the exact replacement rules live in py_functions.py — confirm there.
# This step might take a while to complete
df = manual_anaphora_norm_dialogs(df, dream_team, prosecution, court)
df

Unnamed: 0,person,speech,date,time,number_of_words,norm_speech,negation_handling,anaphora_solved
0,THECOURT,BACK ON THE RECORD IN THE SIMPSON MATTER. MR. ...,"APRIL 13, 1995\n",9:20 A.M.,28,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...
1,THECOURT,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MR. G...,"APRIL 13, 1995\n",9:20 A.M.,48,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...
2,MRGOLDBERG,"YOUR HONOR, MY UNDERSTANDING OF THE COURT'S OR...","APRIL 13, 1995\n",9:20 A.M.,65,"THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER...","THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER...","THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER..."
3,MRSCHECK,"WELL, YOUR HONOR, BEFORE WE LEFT WE GAVE MR. G...","APRIL 13, 1995\n",9:20 A.M.,69,"WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB...","WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB...","WELL, THECOURT, BEFORE MRSCHECK LEFT MRSCHECK ..."
4,THECOURT,"WELL, AS I RECALL, AS WE ENDED THE COURT DAY I...","APRIL 13, 1995\n",9:20 A.M.,18,"WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ...","WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ...","WELL, AS THECOURT RECALL, AS THECOURT ENDED TH..."
...,...,...,...,...,...,...,...,...
234695,MRCOCHRAN,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,"SEPTEMBER 29, 1995",9:04 A.M.,24,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...
234696,THECOURT,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","SEPTEMBER 29, 1995",9:04 A.M.,70,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","THECOURT'LL ISSUE A DELIBERATION SCHEDULE, AN ..."
234697,THECOURT,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,"SEPTEMBER 29, 1995",9:04 A.M.,13,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,SO MSCLARK SHOULD ANTICIPATE THAT. ANYTHING EL...
234698,MSCLARK,CAN WE AT SIDEBAR WITHOUT THE COURT REPORTER?,"SEPTEMBER 29, 1995",9:04 A.M.,8,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?,CAN MSCLARK AT SIDEBAR WITHOUT THECOURT REPORTER?


In [17]:
# Manually normalize text based on family relationships, according to the time at
# which the different witnesses were questioned (for more details see py_functions.py).
# NOTE(review): which columns the helper reads and writes is not visible here — confirm
# in py_functions.py.
df = anaphora_norm_family_members(df)

In [21]:
# If there are still cases where the column "anaphora_solved" is not filled
# (it still holds the initial 0), fill it with the "negation_handling" column.
# Rows are selected with a boolean mask and columns by name, so the step does
# not depend on the index being 0..n-1 nor on the position of the columns.
unresolved = df['anaphora_solved'] == 0
df.loc[unresolved, 'anaphora_solved'] = df.loc[unresolved, 'negation_handling']
df

Unnamed: 0,person,speech,date,time,number_of_words,norm_speech,negation_handling,anaphora_solved
0,THECOURT,BACK ON THE RECORD IN THE SIMPSON MATTER. MR. ...,"APRIL 13, 1995\n",9:20 A.M.,28,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...,BACK ON THE RECORD IN THE OJSIMPSON MATTER. OJ...
1,THECOURT,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MR. G...,"APRIL 13, 1995\n",9:20 A.M.,48,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...,ALL RIGHT. THE PEOPLE ARE REPRESENTED BY MRGOL...
2,MRGOLDBERG,"YOUR HONOR, MY UNDERSTANDING OF THE COURT'S OR...","APRIL 13, 1995\n",9:20 A.M.,65,"THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER...","THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER...","THECOURT, MY UNDERSTANDING OF THECOURT'S ORDER..."
3,MRSCHECK,"WELL, YOUR HONOR, BEFORE WE LEFT WE GAVE MR. G...","APRIL 13, 1995\n",9:20 A.M.,69,"WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB...","WELL, THECOURT, BEFORE WE LEFT WE GAVE MRGOLDB...","WELL, THECOURT, BEFORE MRSCHECK LEFT MRSCHECK ..."
4,THECOURT,"WELL, AS I RECALL, AS WE ENDED THE COURT DAY I...","APRIL 13, 1995\n",9:20 A.M.,18,"WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ...","WELL, AS I RECALL, AS WE ENDED THECOURT DAY I ...","WELL, AS THECOURT RECALL, AS THECOURT ENDED TH..."
...,...,...,...,...,...,...,...,...
234695,MRCOCHRAN,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,"SEPTEMBER 29, 1995",9:04 A.M.,24,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...,ONE QUESTION. IF THE VERDICT IS AT 5:00 O'CLOC...
234696,THECOURT,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","SEPTEMBER 29, 1995",9:04 A.M.,70,"I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","I'LL ISSUE A DELIBERATION SCHEDULE, AN ANTICIP...","THECOURT'LL ISSUE A DELIBERATION SCHEDULE, AN ..."
234697,THECOURT,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,"SEPTEMBER 29, 1995",9:04 A.M.,13,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,SO YOU SHOULD ANTICIPATE THAT. ANYTHING ELSE? ...,SO MSCLARK SHOULD ANTICIPATE THAT. ANYTHING EL...
234698,MSCLARK,CAN WE AT SIDEBAR WITHOUT THE COURT REPORTER?,"SEPTEMBER 29, 1995",9:04 A.M.,8,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?,CAN WE AT SIDEBAR WITHOUT THECOURT REPORTER?,CAN MSCLARK AT SIDEBAR WITHOUT THECOURT REPORTER?


In [22]:
# Write CSV.
# to_csv returns None when it is given a path, so its result must not be
# assigned back to `df` (that would discard the frame and make the cell fail
# when executed again). The index is still written as the first, unnamed column.
df.to_csv('final_data_set.csv')