## EUR-LEX Case Law + Plotly

This notebook explores descriptive statistics of the EUR-LEX case law database.

In [None]:
#!pip install plotly

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
import plotly.tools as pt
%matplotlib inline

In [None]:
# Plotly credentials are read from the environment so that no secret lives in the notebook.
# The API key that used to be hard-coded here should be treated as compromised and revoked.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    pt.set_credentials_file(username=plotly_username, api_key=plotly_api_key)
# When the variables are unset, whatever plotly credentials are already configured are left untouched.

# All four tables are published as CSV files under the same bucket prefix.
DATA_URL = 'https://s3.eu-central-1.amazonaws.com/maastrichtuniversity-ids-open'
metadata = pd.read_csv(DATA_URL + '/cases_metadata.csv')
countries = pd.read_csv(DATA_URL + '/countries.csv')
subjects = pd.read_csv(DATA_URL + '/subjects.csv')
citations = pd.read_csv(DATA_URL + '/citations.csv')

### What does the metadata from EUR-LEX judgements and orders look like?

In [None]:
#

Metadata description:
    - Source: CELEX Identifier (unique)
    - Case Label: Case identifier, may contain joint cases
    - ECLI: Unique identifier
    - Case type: Specifies whether the case is a Judgement or an Order
    - Judge: Name of the judge related with the case
    - Advocate: Name of the advocate related with the case
    - Country: Country or countries related with the case
    - Country-Chamber: Concatenation of Country and Chamber
    - Chamber: Chamber related with the case
    - Main Subject: Number of subjects this case is about (detail on subject table)
    - Lodge Date: Registered date of the lodge
    - Document Date: Registered date of the document
    - Case Time: Time elapsed between the lodge date and the document date
    - N Countries: Number of countries registered in the case
    - Joined Cases: Indicates whether the case is joined or not

### How many cases are in the database? 

In [None]:
# Count the distinct values of each of the three case identifier columns.
unique_counts = [
    len(metadata[column].unique())
    for column in ('source', 'ecli', 'case_label')
]
print('Unique CELEX number {}\nUnique ECLI ID {}\nUnique Case Label {}'.format(*unique_counts))


### How do the judgements look over time?

In [None]:
# Number of non-null `source` entries per case type.
metadata.groupby('case_type')['source'].count()

In [None]:
# Yearly number of judgements: keep only the 'Judgement' rows, then count
# the non-null `source` entries per document year.
judgements = (
    metadata.loc[metadata['case_type'] == 'Judgement']
    .groupby('year_document')['source']
    .count()
)
# Years that have at least one judgement, used as the x axis of the trace.
full_years = judgements.index.tolist()

In [None]:
# Line-and-marker trace of the yearly judgement counts.
trace = go.Scatter(
    x=full_years,
    y=judgements,
    mode='lines+markers',
    name='Judgements',
)

In [None]:
# Plot the single judgements trace with plotly under the given filename.
py.iplot(
    [trace],
    filename='Time Series Relative Cases',
)

## How about the orders?

## How many different case topics are there?

In [None]:
#count subjects

In [None]:
# Preview the first rows only instead of dumping the whole subject table.
subjects.head(10)

In [None]:
# Attach the case metadata to every (case, subject) row.
df_sm = subjects.merge(metadata, on='source', how='left')

# The ten subjects with the most cases overall.
top_list = list(
    subjects.groupby('subject')['source']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Restrict the plot to documents dated 1990-2017.
in_window = (df_sm['year_document'] >= 1990) & (df_sm['year_document'] < 2018)
df_sm_filter = df_sm[in_window]
years = sorted(int(year) for year in df_sm_filter['year_document'].unique())

plot_traces = []
for subject in top_list:
    subject_rows = df_sm_filter[df_sm_filter['subject'] == subject]
    # A subject may have no cases in some years. Reindexing onto the full `years`
    # axis (0 for missing years) keeps x and y the same length and aligned;
    # without it the counts are plotted against the wrong years.
    # The year key is cast to int so it matches the integer labels in `years`.
    series = (
        subject_rows.groupby(subject_rows['year_document'].astype(int))['source']
        .count()
        .reindex(years, fill_value=0)
    )
    trace = go.Scatter(x=years, y=series, mode='lines+markers', name=subject)
    plot_traces.append(trace)

In [None]:
# Plot the per-subject traces together with plotly under the given filename.
py.iplot(
    plot_traces,
    filename='Time Series of Number of Cases by Subject',
)

## Case duration over time

In [None]:
# Parse both date columns so the case duration can be computed as a timedelta.
metadata['lodge_date'] = pd.to_datetime(metadata['lodge_date'], format='%Y/%m/%d')
metadata['document_date'] = pd.to_datetime(metadata['document_date'], format='%Y/%m/%d')
metadata['case_time'] = metadata['document_date'] - metadata['lodge_date']

# `df_cm` is not defined in any earlier cell, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumed to be the country table joined to the case metadata on `source`,
# mirroring how `df_sm` is built from `subjects` - confirm.
# The join happens after `case_time` is computed so the merged frame carries the timedelta column.
df_cm = countries.merge(metadata, on='source', how='left')

# The ten countries with the most cases.
top_countries = list(
    df_cm.groupby('country')['source']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

In [None]:
plot_countries = []
for country in top_countries:
    country_rows = df_cm[df_cm['country'] == country]
    # Mean case duration in days per document year.
    # Assumes `case_time` is a timedelta column - TODO confirm.
    # The durations are converted to day counts before averaging, which avoids running
    # `describe()` over every column and the `astype('timedelta64[D]')` cast that
    # newer pandas versions reject. The mean is therefore fractional, not truncated.
    series = (
        country_rows['case_time'].dt.days
        .groupby(country_rows['year_document'])
        .mean()
    )
    # Use the series' own years as x so every value is plotted against its year,
    # instead of a year list of a different length.
    trace = go.Scatter(x=list(series.index), y=series, mode='lines', name=country)
    plot_countries.append(trace)

In [None]:
# Plot the per-country duration traces together with plotly under the given filename.
py.iplot(
    plot_countries,
    filename='Time Series Average Case Duration by Country',
)

### Case citations

In [None]:
# Build three dash-joined key columns from the citation table:
#   relation = source-target
#   unique   = source-target-paragraph
#   citation = target-paragraph
source_id = citations['source']
target_id = citations['target']
paragraph = citations['paragraph']

citations['relation'] = source_id + '-' + target_id
citations['unique'] = citations['relation'] + '-' + paragraph
citations['citation'] = target_id + '-' + paragraph

In [None]:
# One row per (target, source) pair with its number of non-null `citation` entries.
df_ = (
    citations.groupby(['target', 'source'])['citation']
    .count()
    .reset_index()
)

# Per target: how many (target, source) pairs exist and the total of their citation counts.
df_target = (
    df_.groupby('target')['citation']
    .agg(['count', 'sum'])
    .reset_index()
    .rename(columns={'count': 'sources', 'sum': 'target_paragraphs'})
)

### What is the most cited case?

In [None]:
# Ten targets with the highest `sources` value.
df_target.sort_values(by='sources', ascending=False).head(10)

### The case that cites the most paragraphs

In [None]:
# Relation-level table: one row per (source, relation) pair with its
# number of non-null `paragraph` entries.
df_relations = (
    citations.groupby(['source', 'relation'])['paragraph']
    .count()
    .reset_index()
)

In [None]:
# Per source: how many relations it has and the total of their paragraph counts.
df_source = (
    df_relations.groupby('source')['paragraph']
    .agg(['count', 'sum'])
    .reset_index()
    .rename(columns={'count': 'targets', 'sum': 'source_paragraphs'})
)

In [None]:
# Ten sources with the highest `targets` value.
df_source.sort_values(by=['targets'], ascending=False).head(10)