In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import dask.dataframe as dd
import gc
from datetime import date
from IPython.display import Image

In [None]:
# Open the judge/act merge table as a Dask dataframe and preview its first rows.
ja_merge_df = dd.read_csv(
    "../input/precog-assignment/Judge_Act_merge.csv"
)
ja_merge_df.head()

In [None]:
# Load only the columns used later: the join key, the two dates and the state code.
case_2018 = dd.read_csv(
    "../input/precog-assignment/cases/cases_2018.csv",
    usecols=[
        "ddl_case_id",
        "date_of_filing",
        "date_of_decision",
        "state_code",
    ],
)
case_2018.head()

In [None]:
# Inner-join the judge/act rows with the 2018 cases on the shared case id.
merged_dfd = ja_merge_df.merge(
    case_2018,
    how="inner",
    on="ddl_case_id",
)
merged_dfd.head()

In [None]:
import dask

# Both results derive from the same lazy join. Calling .compute() on each one
# separately would run the CSV reads and the merge twice; dask.compute()
# evaluates the two collections in a single pass and shares the common work.
#   merged_df  -> rows that have a decision date (used for time-taken stats)
#   merged_dfd -> all joined rows (used for per-state case counts)
# Note: merged_dfd is rebound from a Dask dataframe to the computed result here,
# so this cell must be run after the merge cell and only once per kernel.
merged_df, merged_dfd = dask.compute(
    merged_dfd.dropna(subset=["date_of_decision"]),
    merged_dfd,
)

In [None]:
from datetime import date
def days_elapsed(date1,date2):
    """Return the signed number of days from ``date1`` to ``date2``.

    Both arguments are strings of the form ``"year-month-day"``; each is split
    on ``"-"`` and the first three pieces are converted to integers. The result
    is negative when ``date2`` falls before ``date1``.
    """
    start_fields, end_fields = date1.split("-"), date2.split("-")
    start_numbers = list(map(int, start_fields))
    end_numbers = list(map(int, end_fields))
    start_day = date(start_numbers[0], start_numbers[1], start_numbers[2])
    end_day = date(end_numbers[0], end_numbers[1], end_numbers[2])
    gap = end_day - start_day
    return gap.days

In [None]:
# The case and judge identifiers are not used after this point.
merged_df = merged_df.drop(columns=["ddl_case_id","ddl_judge_id"])

# Days between filing and decision for every row. Zipping the two columns
# avoids DataFrame.apply(axis=1), which constructs a Series object per row
# only to read these two fields back out of it.
merged_df["time_taken"] = [
    days_elapsed(filed_on, decided_on)
    for filed_on, decided_on in zip(
        merged_df["date_of_filing"], merged_df["date_of_decision"]
    )
]

# The raw date strings are no longer needed once the duration is stored.
merged_df = merged_df.drop(columns=["date_of_filing","date_of_decision"])

In [None]:
# Average filing-to-decision duration per state, as a two-column frame.
mean_days_by_state = merged_df.groupby("state_code")["time_taken"].mean()
state_grouped = mean_days_by_state.reset_index()
state_grouped.head()

In [None]:
# Order states from the longest to the shortest average duration.
state_grouped = state_grouped.sort_values("time_taken", ascending=False)


In [None]:
# Read the state-code lookup table, then keep only the 2018 rows.
state_key = pd.read_csv(
    "../input/precog-assignment/keys/cases_state_key.csv",
    usecols=["year","state_code","state_name"],
)
# The year column is constant after filtering, so it is dropped.
state_2018 = state_key.loc[state_key["year"].eq(2018)].drop(columns="year")

In [None]:
# Attach the human-readable state name to each state's average duration.
state_grouped = pd.merge(
    state_grouped,
    state_2018,
    how="inner",
    on="state_code",
)
state_grouped.head(10)

In [None]:
# Display a PNG loaded from the input dataset directory; no visible cell in this
# notebook writes this file.
# NOTE(review): presumably a chart of average time taken per state — confirm against the file.
Image(filename='../input/precog-assignment/state_time.png') 

In [None]:
# Count joined rows per state, add the state name, and rank by volume.
state_count = (
    merged_dfd.groupby("state_code")
    .size()
    .reset_index(name="no_of_cases")
    .merge(state_2018, on="state_code", how="inner")
    .sort_values("no_of_cases", ascending=False)
)
state_count.head(10)

In [None]:
# Display a PNG loaded from the input dataset directory; no visible cell in this
# notebook writes this file.
# NOTE(review): presumably a chart of case counts per state — confirm against the file.
Image(filename='../input/precog-assignment/state_cases.png') 

In [None]:
# One row per state carrying both the case count and the average duration.
double_df = pd.merge(
    state_count,
    state_grouped,
    how="inner",
    on=["state_code","state_name"],
)
double_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter of per-state case volume (x) against average days to decision (y).
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(data=double_df, x='no_of_cases', y='time_taken', ax=ax)
ax.set_title("No of Cases vs Average Time taken per case")
ax.set_xlabel("No of Cases")
ax.set_ylabel("Time Taken per case")

# Annotate every point with its state name. Iterating over the column values
# (rather than looking rows up with series[i]) does not depend on the frame
# having a default 0..n-1 index.
for cases, days, state in zip(
    double_df["no_of_cases"], double_df["time_taken"], double_df["state_name"]
):
    ax.text(cases, days, state, alpha=0.8)

# Solid black lines mark the mean of each axis; dashed green lines mark 95% of
# that mean.
mean_days = double_df["time_taken"].mean()
mean_cases = double_df["no_of_cases"].mean()
ax.axhline(y=mean_days, color='k', linestyle='-', linewidth=1)
ax.axvline(x=mean_cases, color='k', linestyle='-', linewidth=1)
ax.axhline(y=0.95 * mean_days, color='g', linestyle='--', linewidth=1)
ax.axvline(x=0.95 * mean_cases, color='g', linestyle='--', linewidth=1)

plt.show()

In [None]:
# Load the judge table, keeping only the gender flag and the state code.
jf = pd.read_csv(
    "../input/precog-assignment/judges_clean/judges_clean.csv",
    usecols=["female_judge","state_code"],
)
jf.head()

In [None]:
# Total number of judge rows per state.
rows_per_state = jf.groupby("state_code").size()
jf_count = rows_per_state.reset_index(name="total")
jf_count.head()

In [None]:
# Number of judge rows per state whose gender flag is exactly "1 female".
jf_fem = (
    jf.loc[jf["female_judge"].eq("1 female")]
    .groupby("state_code")
    .size()
    .reset_index(name="female")
)
jf_fem.head()

In [None]:
# Combine female and total judge counts per state. A right join on the totals
# keeps states that have no "1 female" rows at all (absent from jf_fem); an
# inner join would silently drop them instead of reporting them as 0%.
jf_comb = jf_fem.merge(jf_count, on="state_code", how="right")
# States missing from jf_fem come through as NaN; they have zero female judges.
jf_comb["female"] = jf_comb["female"].fillna(0).astype("int64")

# Share of female judges in percent, computed column-wise rather than with a
# row-by-row apply.
jf_comb["fem_percentage"] = 100 * (jf_comb["female"] / jf_comb["total"])

# Attach state names and rank from the highest to the lowest share.
jf_comb = jf_comb.merge(state_2018, on="state_code", how="inner")
jf_comb = jf_comb.sort_values(by="fem_percentage", ascending=False)

In [None]:
# One row per state carrying both the female-judge share and the average duration.
fem_comb = pd.merge(
    jf_comb,
    state_grouped,
    how="inner",
    on=["state_name","state_code"],
)
fem_comb.head()

In [None]:
# Scatter of per-state share of female judges (x) against average days to
# decision (y).
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(data=fem_comb, x='fem_percentage', y='time_taken', ax=ax)
ax.set_title("Percentage of Female Judges vs Average Time taken per case")
ax.set_xlabel("Percentage of Female Judges")
ax.set_ylabel("Time Taken per case")

# Annotate every point with its state name. Iterating over the column values
# (rather than looking rows up with series[i]) does not depend on the frame
# having a default 0..n-1 index.
for share, days, state in zip(
    fem_comb["fem_percentage"], fem_comb["time_taken"], fem_comb["state_name"]
):
    ax.text(share, days, state, alpha=0.8)

# Horizontal line: mean of the plotted durations.
ax.axhline(y=fem_comb["time_taken"].mean(), color='k', linestyle='-', linewidth=1)

# Vertical reference line at a fixed 40% female share.
# NOTE(review): the notebook does not state why 40 was chosen — confirm.
female_share_reference_pct = 40
ax.axvline(x=female_share_reference_pct, color='k', linestyle='-', linewidth=1)

plt.show()