In [1]:
import pandas as pd
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
import geopandas as gpd

In [2]:
# Root directory of the project data. Defaults to the original Dropbox
# location; set the LEGAL_CENSORSHIP_DIR environment variable to run the
# notebook on another machine. os.path.join(..., '') guarantees the trailing
# separator that the `wd + 'Data/...'` concatenations in later cells rely on.
wd = os.path.join(os.environ.get('LEGAL_CENSORSHIP_DIR', '/Users/kevin/Dropbox/legal_censorship/'), '')

In [3]:
# Load the uncensored documents.
df_uncensor = pd.read_csv(filepath_or_buffer=wd + 'Data/master_uncensor.csv')

# Research question -- do censored documents differ from the rest in that they ...
#   (0) deal with firms?
#   (1) deal with cases involving local litigants?
#   (2) deal with cross-regional lawsuits that a local firm defendant is winning?
#   (3) come from higher-level courts?
#
# We start with the random sample, which serves as the control group, and for
# now keep only lawsuits in which both litigating parties can be properly
# identified.

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Display the column labels of the control frame.
df_uncensor.columns

Index(['docid', '案件名称', '案号', '审判程序', '裁判日期', '发布日期', '法院名称', '案件类型', '公开类型',
       '文书类型', '案由', '原告loc', '被告loc', 'type_原被告', '上诉', '原告pay', '被告pay',
       '类型_pay', '案件受理费', '受理费合计', '受理费减半', '受理费免收', '起诉', '申请', '立案', '受理',
       '撤诉', '开庭', '原告firm', '被告firm', 'court_code', '原告loc_mode',
       '被告loc_mode', 'county_level', 'plaintiff_win'],
      dtype='object')

In [7]:
# Year label: the first four characters of the 裁判日期 (judgment date) string
# (presumably the dates are YYYY-prefixed -- confirm against the data).
# The vectorised .str accessor leaves a missing date as NaN, whereas slicing a
# float NaN inside apply() would raise TypeError.
df_uncensor['year'] = df_uncensor['裁判日期'].str[:4]

In [8]:
# Keep only the rows whose type_原被告 value is '正常' (normal). .copy() detaches
# the subset so that the column assignments below write to an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
df_uncensor = df_uncensor[df_uncensor.type_原被告 == '正常'].copy()

# The party columns arrive from the CSV as stringified Python literals; parse
# them back into Python objects. Values that are no longer strings are passed
# through unchanged so that this cell can be re-run without literal_eval
# failing on already-parsed data.
# NOTE(review): the previous comment here mentioned exploding and merging with
# SOE data; no such step is performed in this cell.
for loc_col in ['原告loc', '被告loc']:
    df_uncensor[loc_col] = df_uncensor[loc_col].apply(
        lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
    )

In [9]:
def _any_party_is_company(parties):
    """Return True when the first field of any party contains '公司' (company)."""
    return any(['公司' in party[0] for party in parties])


# Overwrite the firm indicators: a side counts as a firm when at least one of
# its listed parties matches. An empty party list yields False.
df_uncensor['原告firm'] = df_uncensor['原告loc'].apply(_any_party_is_company)
df_uncensor['被告firm'] = df_uncensor['被告loc'].apply(_any_party_is_company)

In [10]:
# Only 10% are firm-to-firm lawsuits
# Keep the rows where both the plaintiff flag and the defendant flag are set.
df_uncensor = df_uncensor.loc[df_uncensor['原告firm'] & df_uncensor['被告firm']]

In [9]:
# First, let's take a few random samples to see the nature of firm-to-firm, individual-to-firm, and firm-to-individual cases
# 1.1 firm-to-firm lawsuits are most directly reasonable
# df_uncensor[(df_uncensor.原告firm == True) & (df_uncensor.被告firm == True)].案件名称.sample(10)
# 1.2 individual-to-firm lawsuits are most directly reasonable
# df_uncensor[(df_uncensor.原告firm == False) & (df_uncensor.被告firm == True)].案件名称.sample(10)

In [12]:
def _cases_per_county_year(cases, key_col, count_col):
    """Count documents in `cases` per (key_col, year).

    Parameters
    ----------
    cases : DataFrame with columns `key_col`, 'year' and 'docid'.
    key_col : name of the column holding the location code to group on.
    count_col : name given to the resulting count column.

    Returns
    -------
    DataFrame with columns 'county' (the renamed `key_col`), 'year' and
    `count_col` (number of non-null docid values in the group). Rows whose
    county equals 0 are dropped.
    """
    counts = (
        cases.groupby([key_col, 'year'])['docid'].count().reset_index()
        .rename(columns={key_col: 'county', 'docid': count_col})
    )
    return counts[counts['county'] != 0]


# Control-group case counts per county-year, tabulated three ways: by the
# plaintiff location, by the defendant location and by the court code.
# validate='1:1' makes each merge fail loudly if a (county, year) key repeats.
df_county = _cases_per_county_year(df_uncensor, '原告loc_mode', 'plaintiff_counts')

df_county2 = _cases_per_county_year(df_uncensor, '被告loc_mode', 'defendant_counts')
df_county = pd.merge(df_county, df_county2, on=['county', 'year'], how='outer', validate='1:1')

df_county3 = _cases_per_county_year(df_uncensor, 'court_code', 'court_counts')
df_county = pd.merge(df_county, df_county3, on=['county', 'year'], how='outer', validate='1:1')

# A county-year absent from one of the three tables gets a count of 0.
# The filled columns are assigned back explicitly: calling
# fillna(inplace=True) on df_county[col] is chained assignment, which warns on
# recent pandas and does not modify the frame under Copy-on-Write.
count_cols = ['plaintiff_counts', 'defendant_counts', 'court_counts']
df_county[count_cols] = df_county[count_cols].fillna(0)
df_county.to_excel(wd+'Data/control_county.xlsx', index=False)

In [3]:
# Articles being censored -- takes 9 minutes to load
df_censor = pd.read_csv(wd+'Data/master.csv')
# The party columns are stored as stringified Python literals; parse them.
df_censor['原告loc'] = df_censor['原告loc'].apply(literal_eval)
df_censor['被告loc'] = df_censor['被告loc'].apply(literal_eval)

# Re-scraping results; the file has no header row, so columns are addressed by
# position. The path is built from `wd` like every other data path in the
# notebook instead of repeating the absolute directory.
df_censor_rescrape = pd.read_csv(wd+'Data/rescraping/output/rescraping_cases_100k-result.csv', header=None)
# 8.6 % of the master.csv is censored
# A document is kept as censored when its 案号 (case number) appears in column 2
# of a re-scrape row whose column 4 reads '未查询到' ("not found").
censored_case_numbers = df_censor_rescrape.loc[df_censor_rescrape[4] == '未查询到', 2]
# .copy() so that later cells can add columns without writing to a slice.
df_censor = df_censor[df_censor['案号'].isin(censored_case_numbers)].copy()
df_censor.columns

  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['文书ID', '案件名称', '案号', '审判程序', '裁判日期', '发布日期', '法院名称', '案件类型', '公开类型',
       '文书类型', '案由', 'docid', 'fname', 'county', 'pref', 'code', 'prefcode',
       '原告loc', '被告loc', 'type_原被告', '上诉', '原告pay', '被告pay', '类型_pay', '案件受理费',
       '受理费合计', '受理费减半', '受理费免收', '起诉', '申请', '立案', '受理', '撤诉', '开庭',
       'court_date', 'plantifflose', 'winner', 'loser', 'local_winner',
       'local_loser'],
      dtype='object')

In [5]:
# Fill a missing court_date with 发布日期 (publication date), then take the first
# four characters as the year label. fillna replaces the former row-wise
# apply() that compared str(value) against 'nan'; .str[:4] leaves a date that
# is missing in both columns as NaN instead of raising.
court_date = df_censor['court_date'].fillna(df_censor['发布日期'])

# assign()/drop_duplicates() return new frames, so nothing is written to a
# slice of the frame produced by the earlier censorship filter.
df_censor = (
    df_censor
    .assign(court_date=court_date, year=court_date.str[:4])
    .drop_duplicates('docid')
)
# Keep only the rows whose type_原被告 value is '正常' (normal).
df_censor = df_censor[df_censor['type_原被告'] == '正常'].copy()

# A side counts as a firm when the first field of any of its parties contains
# '公司' (company); an empty party list yields False.
df_censor['原告firm'] = df_censor['原告loc'].apply(
    lambda parties: any('公司' in party[0] for party in parties)
)
df_censor['被告firm'] = df_censor['被告loc'].apply(
    lambda parties: any('公司' in party[0] for party in parties)
)

# Restrict to firm-to-firm cases; .copy() because later cells add columns.
df_censor = df_censor[df_censor['原告firm'] & df_censor['被告firm']].copy()

In [6]:
def _most_common_location(parties):
    """Return the most frequent third field among `parties`, or NaN if empty."""
    if len(parties) == 0:
        return np.nan
    codes = [party[2] for party in parties]
    # Ties are resolved by max() over the set of distinct codes.
    return max(set(codes), key=codes.count)


# Codes below 1e5 are multiplied by 10; every other value is left unchanged.
df_censor['code'] = df_censor['code'].mask(df_censor['code'] < 1e5, df_censor['code'] * 10)

# Modal location code among the plaintiffs and among the defendants of a case.
df_censor['原告loc_mode'] = df_censor['原告loc'].apply(_most_common_location)
df_censor['被告loc_mode'] = df_censor['被告loc'].apply(_most_common_location)

In [7]:
# Censored-document counts per county-year, tabulated three ways: by plaintiff
# location, by defendant location and by court code. Every 'county' key is cast
# to float so the three tables merge on one dtype (the court code was
# previously left uncast). Rows whose county equals 0 are dropped.
df_county = (
    df_censor.groupby(['原告loc_mode', 'year'])['docid'].count().reset_index()
    .rename(columns={'原告loc_mode': 'county', 'docid': 'plaintiff_counts'})
    .astype({'county': float})
)
df_county = df_county[df_county['county'] != 0]

df_county2 = (
    df_censor.groupby(['被告loc_mode', 'year'])['docid'].count().reset_index()
    .rename(columns={'被告loc_mode': 'county', 'docid': 'defendant_counts'})
    .astype({'county': float})
)
df_county2 = df_county2[df_county2['county'] != 0]
# validate='1:1' makes the merge fail loudly if a (county, year) key repeats.
df_county = pd.merge(df_county, df_county2, on=['county', 'year'], how='outer', validate='1:1')

df_county3 = (
    df_censor.groupby(['code', 'year'])['docid'].count().reset_index()
    .rename(columns={'code': 'county', 'docid': 'court_counts'})
    .astype({'county': float})
)
df_county3 = df_county3[df_county3['county'] != 0]
df_county = pd.merge(df_county, df_county3, on=['county', 'year'], how='outer', validate='1:1')

# A county-year absent from one of the three tables gets a count of 0.
# The filled columns are assigned back explicitly: calling
# fillna(inplace=True) on df_county[col] is chained assignment, which warns on
# recent pandas and does not modify the frame under Copy-on-Write.
count_cols = ['plaintiff_counts', 'defendant_counts', 'court_counts']
df_county[count_cols] = df_county[count_cols].fillna(0)
df_county.to_excel(wd+'Data/treat_county.xlsx', index=False)

In [14]:
# Transform the dataset to a balanced panel
count_cols = ['plaintiff_counts', 'defendant_counts', 'court_counts']

# Index by county once. The guard keeps the cell re-runnable: an unconditional
# set_index('county') fails on the second run because the column has already
# been moved into the index.
if 'county' in df_county.columns:
    df_county = df_county.set_index('county')

# Pivot year into the columns and back: unstack() creates a slot for every
# county x year combination and stack(dropna=False) keeps the empty ones, so
# each county ends up with one row per year.
# NOTE(review): newer pandas releases deprecate the legacy stack
# implementation that accepts dropna -- confirm against the installed version.
df = df_county.set_index('year', append=True).unstack().stack(dropna=False)

# County-years created by the pivot have no cases: set their counts to 0.
# Assigned back explicitly because fillna(inplace=True) on df[col] is chained
# assignment and does not modify the frame under Copy-on-Write.
df[count_cols] = df[count_cols].fillna(0)

# Writes to the same file as the unbalanced county table, replacing it.
df.reset_index().to_excel(wd+'Data/treat_county.xlsx', index=False)