In [1]:
import pandas as pd
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
import geopandas as gpd

In [2]:
# Project root directory. Set the LEGAL_CENSORSHIP_DIR environment variable to run the
# notebook on another machine; when it is unset the original hard-coded path is used.
# os.path.join(..., '') guarantees the trailing separator that the `wd + 'Data/...'`
# string concatenations in the cells below rely on.
wd = os.path.join(os.environ.get('LEGAL_CENSORSHIP_DIR', '/Users/kevin/Dropbox/legal_censorship/'), '')

In [3]:
# Convert the xlsx files to csv once, so that later reads of the documents are faster
# for i in range(1,5):
#     df = pd.read_excel(wd+'Data/civil_random_sample/civil_random_sample_p'+str(i)+'.xlsx')
#     print(i, 'load complete')
#     df = df[['案件名称', '法院名称', '审理法院', '文书ID', '文书类型', '案号', '案件类型', '审判程序', '裁判日期', '发布日期', '全文', '公开类型']]
#     df.to_csv(wd+'Data/civil_random_sample/civil_random_sample_p'+str(i)+'.csv', index=False)
#     del df

In [4]:
# Load the master table of the random sample (used as the control group below).
df_uncensor = pd.read_csv(wd+'Data/master_uncensor.csv')

# Research question: are censored documents differentially ...
    # (0) dealing with firms?
    # (1) dealing with cases where local litigants are involved?
    # (2) dealing with cases where local firm defendants are winning in a cross-regional lawsuit?
    # (3) coming from higher-level courts?

# Let's first focus on the random sample (as our control group), and for now we're focusing on the lawsuits where we can properly define both parties of the litigants.
# .copy() makes df an independent frame, so that later column assignments on df
# (e.g. df['案由'] = ...) do not raise SettingWithCopyWarning.
df = df_uncensor[df_uncensor.type_原被告 == '正常'].copy()

# Build every comparison once instead of re-casting the location columns for each statistic.
# A litigant side is "local" when its modal location code equals the court's code.
plaintiff_code = df.原告loc_mode.astype(float)
defendant_code = df.被告loc_mode.astype(float)
plaintiff_local = plaintiff_code == df.court_code
defendant_local = defendant_code == df.court_code
plaintiff_away = plaintiff_code != df.court_code   # NaN codes compare as "not equal", as before
defendant_away = defendant_code != df.court_code
both_firms = (df.原告firm == True) & (df.被告firm == True)

# The trailing numbers are the values printed in the recorded run of this cell.
## Q0: share of cases with a firm plaintiff / a firm defendant
print(df.原告firm.mean()) # 0.3337
print(df.被告firm.mean()) # 0.3298
## Q1: share of cases with a local plaintiff / local defendant / both local
print(plaintiff_local.mean()) # 0.3436
print(defendant_local.mean()) # 0.2968
print((defendant_local & plaintiff_local).mean()) # 0.1985
## Q2: plaintiff win rate in firm-vs-firm cases: local plaintiff vs. non-local defendant, then the reverse
print(df.loc[plaintiff_local & defendant_away & both_firms, 'plaintiff_win'].mean()) # 0.5133
print(df.loc[plaintiff_away & defendant_local & both_firms, 'plaintiff_win'].mean()) # 0.4293
## Q3: share of documents with county_level == True
print(df.county_level.mean()) # 0.8607

  exec(code_obj, self.user_global_ns, self.user_ns)


0.33371490009098287
0.32978787592648967
0.34355939061584395
0.29681416755780343
0.1985352745286463
0.5132606697678355
0.4292914052229772
0.8606918857522313


In [15]:
# Q2 once more: plaintiff win rate in firm-vs-firm lawsuits, split by which side is local to the court.
# NOTE(review): the same expressions printed 0.5133 / 0.4293 in the earlier random-sample cell;
# presumably df or plaintiff_win changed between the two runs -- confirm which state this reflects.
firm_vs_firm = (df.原告firm==True) & (df.被告firm==True)
plaintiff_seat = df.原告loc_mode.astype(float)
defendant_seat = df.被告loc_mode.astype(float)
home_plaintiff_rows = (plaintiff_seat == df.court_code) & (defendant_seat != df.court_code) & firm_vs_firm
home_defendant_rows = (plaintiff_seat != df.court_code) & (defendant_seat == df.court_code) & firm_vs_firm
print(df.loc[home_plaintiff_rows, 'plaintiff_win'].mean()) # 0.4417
print(df.loc[home_defendant_rows, 'plaintiff_win'].mean()) # 0.4135

0.4417269535871526
0.41349592005134317


In [5]:
# A few other stylized facts (full random sample):
# 0) Cases with redacted plaintiff/defendant information -- prints the share of rows with
#    type_原被告 == '正常', i.e. the rows that are kept in df
print((df_uncensor.type_原被告 == '正常').mean())
# 1) Trial procedure (审判程序)
# .head(5) is always positional; value_counts()[:5] can be read as a label slice on a numeric index.
print(df_uncensor.审判程序.value_counts().head(5))
print((df_uncensor.审判程序 == '民事一审').mean())
# 2) Document type (文书类型)
print(df_uncensor.文书类型.value_counts().head(5))
# 3) Cause of action (案由):: tempreason.shape == (3648145,)
# The column holds stringified lists. Parsing only values that are still strings keeps this
# cell re-runnable (literal_eval raises on a value that is already a list).
df_uncensor['案由'] = df_uncensor.案由.apply(lambda v: literal_eval(v) if isinstance(v, str) else v)
tempreason = pd.Series([element for list_ in df_uncensor['案由'].values for element in list_])
print(tempreason.value_counts().head(5))

# Replicating the same items on df (dropping observations with un-identified names)
# 1) Trial procedure (审判程序)
print(df.审判程序.value_counts().head(5))
print((df.审判程序 == '民事一审').mean())
# 2) Document type (文书类型)
print(df.文书类型.value_counts().head(5))
# 3) Cause of action (案由) -- the 3648145 figure quoted above presumably refers to the
#    full-sample tempreason, not to tempreason2
# df is a filtered subset of df_uncensor; .assign returns a new frame, so the parsed column
# is attached without the SettingWithCopyWarning that `df['案由'] = ...` produced.
df = df.assign(**{'案由': df.案由.apply(lambda v: literal_eval(v) if isinstance(v, str) else v)})
tempreason2 = pd.Series([element for list_ in df['案由'].values for element in list_])
print(tempreason2.value_counts().head(5))

0.7391678034851907
民事一审      3294499
民事二审       371873
特别程序        97008
民事审判监督      75303
其他          46378
Name: 审判程序, dtype: int64
0.8301250441582815
1.0     1752334
2.0     1472127
3.0      629191
10.0      62316
5.0       23185
Name: 文书类型, dtype: int64
民间借贷纠纷         579259
离婚纠纷           321301
买卖合同纠纷         292872
机动车交通事故责任纠纷    242398
金融借款合同纠纷       233351
dtype: int64
民事一审        2395878
民事二审         340301
民事审判监督        70978
其他            41215
非诉财产保全审查      26871
Name: 审判程序, dtype: int64
0.8167248959355641
1.0    1610761
2.0    1274955
3.0      12010
5.0      10834
9.0       9820
Name: 文书类型, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


民间借贷纠纷         444953
买卖合同纠纷         223979
金融借款合同纠纷       200604
机动车交通事故责任纠纷    179629
物业服务合同纠纷       140781
dtype: int64


In [6]:
# # Merge judge data
# df_judge = pd.read_excel(wd+'Data/civil_docx2019_only_cleaning/civil_docx2019_only_judges_1.xlsx')
# for i in [2, 3]:
#     df_temp = pd.read_excel(wd+'Data/civil_docx2019_only_cleaning/civil_docx2019_only_judges_'+str(i)+'.xlsx')
#     df_judge = pd.concat([df_judge, df_temp])
#     del df_temp
# print(df_judge.columns)
# df_judge.sample(10)

In [8]:
# # There is no need to re-run this chunk for replication purposes, because its output is unambiguous. It builds the mapping from court names to county / prefecture codes and saves it as Data/court_to_code.csv.
# """
# df_uncensor = df_otherattr[~df_otherattr.法院名称.isna()]
# # Look for the county names
# county_code = pd.read_excel(wd+'Data/shapefiles/China County Index Code.xls')
# county_code = county_code[(~county_code.prefecture.isna()) & (county_code.dcode % 100 != 0)]
# county_code['prefecture'] = county_code.apply(lambda x: x['city']+x['prefecture'] if x['prefecture'].endswith('区') else x['prefecture'], axis=1)
# cty_lst = county_code.prefecture.unique()
# def find_cty(court):
#     for cty in cty_lst:
#         if court.find(cty) != -1:
#             return cty
#     return np.nan

# df_otherattr['county'] = df_otherattr.法院名称.apply(find_cty)

# china = gpd.read_file(wd+"Data/shapefiles/chn_admbnda_adm2_ocha_2020.shp", encoding='utf-8')
# def shorten_names(x):
#     x = x.replace('市', '').replace('自治州', '').replace('盟', '').replace('自治县', '').replace('地区', '')
#     temp = re.match(r'(.*)\[(.*)\]', x)
#     temp2 = re.match(r'(.*)\〔(.*)\〕', x)
#     if temp:
#         return temp.group(1)
#     elif temp2:
#         return temp2.group(1)
#     else:
#         return x

# # Look for the prefecture names
# china['ADM2_ZH'] = china['ADM2_ZH'].apply(shorten_names)
# cty_lst = china['ADM2_ZH'].unique()
# def find_cty(court):
#     for cty in cty_lst:
#         if court.find(cty) != -1:
#             return cty
#     return np.nan

# df_otherattr['pref'] = df_otherattr.法院名称.apply(find_cty)
# # Map county names and prefecture names to their corresponding county / prefecture codes
# china['ADM2_PCODE'] = china['ADM2_PCODE'].apply(lambda x: int(x[2:]))
# county_to_code = dict(county_code[['prefecture', 'dcode']].values)
# pref_to_code = dict(china[['ADM2_ZH', 'ADM2_PCODE']].values)

# pref_to_code[np.nan] = np.nan
# df_otherattr['code'] = df_otherattr.apply(lambda x: county_to_code[x['county']] if str(x['county'])!='nan' else pref_to_code[x['pref']], axis=1)
# df_otherattr.code.isna().mean() # about 2.8% of missing values

# df_otherattr[['法院名称', 'code']].drop_duplicates().to_csv(wd+'Data/court_to_code.csv', index=False, encoding='utf-8')
# """

In [14]:
# Articles being censored -- takes 9 minutes to load
df_censor = pd.read_csv(wd+'Data/master.csv')
print(df_censor.columns)

# Re-scraping results. The file has no header row, so its columns are addressed by position.
# The path is built from wd like every other input (it was a hard-coded absolute path before).
df_censor_rescrape = pd.read_csv(wd+'Data/rescraping/output/rescraping_cases_100k-result.csv', header=None)
# 8.6 % of the master.csv is censored
# Keep the documents whose case number (案号) appears in column 2 of a re-scrape row whose
# column 4 equals '未查询到' (presumably the "not found" status of the re-scrape -- confirm).
# .copy() gives an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
censored_case_numbers = df_censor_rescrape.loc[df_censor_rescrape[4] == '未查询到', 2]
df_censor = df_censor[df_censor.案号.isin(censored_case_numbers)].copy()

# The litigant columns are stored as stringified lists; turn them back into Python lists.
df_censor['原告loc'] = df_censor['原告loc'].apply(literal_eval)
df_censor['被告loc'] = df_censor['被告loc'].apply(literal_eval)


def _any_firm(parties):
    """Return True if the first field of any litigant entry contains '公司'.

    parties: one parsed 原告loc / 被告loc value (a list of indexable entries).
    An empty list gives False.
    """
    return any('公司' in party[0] for party in parties)


def _modal_location(parties):
    """Return the most frequent third field among the litigant entries, or NaN for an empty list.

    The third field is the value later compared with court_code. Ties are resolved by
    max() over a set, exactly as in the original one-liner.
    """
    codes = [party[2] for party in parties]
    return max(set(codes), key=codes.count) if codes else np.nan


# 1 -- Add var: firm dummies
df_censor['原告firm'] = df_censor['原告loc'].apply(_any_firm)
df_censor['被告firm'] = df_censor['被告loc'].apply(_any_firm)

court_to_code = pd.read_csv(wd+'Data/court_to_code.csv')
court_to_code = dict(court_to_code.values)
# 2 -- Add var: court location GB code
# Series.map with a dict yields NaN for court names that are not in the mapping.
df_censor['court_code'] = df_censor['法院名称'].map(court_to_code)

# 3 -- Where are the litigants?
df_censor['原告loc_mode'] = df_censor.原告loc.apply(_modal_location)
df_censor['被告loc_mode'] = df_censor.被告loc.apply(_modal_location)

# 4 -- what kind of court is it?
# county_level is True when the code is not a multiple of 100, or when the court name contains '区'.
# NOTE(review): a missing court_code gives NaN % 100 != 0 -> True, so courts without a code
# are flagged county_level (same as the original row-wise version) -- confirm this is intended.
code_not_multiple_of_100 = df_censor['court_code'] % 100 != 0
name_contains_district = df_censor['法院名称'].str.contains('区', regex=False, na=False)
df_censor['county_level'] = name_contains_district | code_not_multiple_of_100

# 5 -- who is the winner?
# Comparisons involving NaN are False, so a row with a missing pay amount counts as plaintiff_win == False.
df_censor['plaintiff_win'] = df_censor['被告pay'] > df_censor['原告pay']

# Restrict to cases flagged '正常'; .copy() so that later assignments on df2 are safe.
df2 = df_censor[df_censor.type_原被告 == '正常'].copy()

# Research question: are censored documents differentially ...
    # (0) dealing with firms?
    # (1) dealing with cases where local litigants are involved?
    # (2) dealing with cases where local firm defendants are winning in a cross-regional lawsuit?
    # (3) coming from higher-level courts?

# Build every comparison once instead of re-casting the location columns for each statistic.
plaintiff_code = df2.原告loc_mode.astype(float)
defendant_code = df2.被告loc_mode.astype(float)
plaintiff_local = plaintiff_code == df2.court_code
defendant_local = defendant_code == df2.court_code
plaintiff_away = plaintiff_code != df2.court_code
defendant_away = defendant_code != df2.court_code
both_firms = (df2.原告firm == True) & (df2.被告firm == True)

# Note: the number before the "vs." is the one printed by this cell (the censored sample); the number after the "vs." is the random-sample benchmark.
# NOTE(review): the benchmarks differ slightly from the values recorded in the random-sample cell
# (e.g. 0.3449 here vs. 0.3337 printed there) -- presumably they come from an earlier run; confirm.
## Q0:
print(df2.原告firm.mean()) # 0.5549 vs. 0.3449
print(df2.被告firm.mean()) # 0.6334 vs. 0.3225
## Q1:
print(plaintiff_local.mean()) # 0.3116 vs. 0.3526
print(defendant_local.mean()) # 0.2082 vs. 0.2727
print((defendant_local & plaintiff_local).mean()) # 0.0845 vs. 0.1796
## Q2:
print(df2.loc[plaintiff_local & defendant_away & both_firms, 'plaintiff_win'].mean()) # 0.9148 vs. 0.5493
print(df2.loc[plaintiff_away & defendant_local & both_firms, 'plaintiff_win'].mean()) # 0.8728 (the random-sample cell printed 0.4293)
print(df2.loc[plaintiff_away & defendant_away, 'plaintiff_win'].mean()) # 0.7747 vs. 0.5616
## Q3:
print(df2.county_level.mean()) # 0.8660 vs. 0.8857

  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['文书ID', '案件名称', '案号', '审判程序', '裁判日期', '发布日期', '法院名称', '案件类型', '公开类型',
       '文书类型', '案由', 'docid', 'fname', 'county', 'pref', 'code', 'prefcode',
       '原告loc', '被告loc', 'type_原被告', '上诉', '原告pay', '被告pay', '类型_pay', '案件受理费',
       '受理费合计', '受理费减半', '受理费免收', '起诉', '申请', '立案', '受理', '撤诉', '开庭',
       'court_date', 'plantifflose', 'winner', 'loser', 'local_winner',
       'local_loser'],
      dtype='object')
0.5549155520843503
0.633408181196915
0.31162745289465976
0.20823977350385628
0.08454554329786196
0.9147982062780269
0.872791519434629
0.7747233748271093
0.8660548667382603


In [11]:
# Censored cases whose plaintiff/defendant type is flagged '正常'.
df2 = df_censor.loc[df_censor['type_原被告'] == '正常']


nan
nan


Unnamed: 0,文书ID,案件名称,案号,审判程序,裁判日期,发布日期,法院名称,案件类型,公开类型,文书类型,...,loser,local_winner,local_loser,原告firm,被告firm,court_code,原告loc_mode,被告loc_mode,county_level,plaintiff_win


In [None]:
# Takeaway: the most striking difference is the plaintiff win rate in lawsuits with a non-local plaintiff and a local defendant.
# Censored lawsuits show a much higher non-local plaintiff win rate, which suggests some incentive to protect local businesses.

In [7]:
# A few other stylized facts (censored sample):
# 0) Cases with redacted plaintiff/defendant information -- prints the share of rows with
#    type_原被告 == '正常'
print((df_censor.type_原被告 == '正常').mean())
# 1) Trial procedure (审判程序)
# .head(5) is always positional; value_counts()[:5] can be read as a label slice on a numeric index.
print(df_censor.审判程序.value_counts().head(5))
print((df_censor.审判程序 == '一审').mean())
# 2) Document type (文书类型)
print(df_censor.文书类型.value_counts().head(5))
# 3) Cause of action (案由): share of missing values first
print(df_censor.案由.isna().mean())
# Parse the stringified lists WITHOUT dropping rows from df_censor. The previous version rebound
# df_censor to its non-missing rows; in the recorded run 案由 is missing for every censored row
# (the line above printed 1.0), which left df_censor empty for every cell executed afterwards.
# Only values that are still strings are parsed, so the cell can be re-run, and .assign returns
# a new frame, so no SettingWithCopyWarning is raised.
df_censor = df_censor.assign(**{'案由': df_censor.案由.apply(lambda v: literal_eval(v) if isinstance(v, str) else v)})
# dtype=object keeps an empty result from triggering the empty-Series dtype warning.
tempreason = pd.Series([element for list_ in df_censor['案由'].dropna() for element in list_], dtype=object)
print(tempreason.value_counts().head(5))

# Replicating the same items on df2 (dropping observations with un-identified names)
# 1) Trial procedure (审判程序)
print(df2.审判程序.value_counts().head(5))
print((df2.审判程序 == '一审').mean())
# 2) Document type (文书类型)
print(df2.文书类型.value_counts().head(5))
# 3) Cause of action (案由) -- rows with a missing value are skipped for the tally only;
#    df2 keeps all of its rows
df2 = df2.assign(**{'案由': df2.案由.apply(lambda v: literal_eval(v) if isinstance(v, str) else v)})
tempreason2 = pd.Series([element for list_ in df2['案由'].dropna() for element in list_], dtype=object)
print(tempreason2.value_counts().head(5))
# Export some aggregate data:
# (1) county-by-year
# (2) county
# (3) county-by-judge

0.9628689603308893
一审           9095
二审           1275
其他            150
再审             53
再审审查与审判监督       1
Name: 审判程序, dtype: int64
0.8549539387102839
Series([], Name: 文书类型, dtype: int64)
1.0


  del sys.path[0]


Series([], dtype: int64)
一审           8728
二审           1250
其他            150
再审             52
再审审查与审判监督       1
Name: 审判程序, dtype: int64
0.8520941130528166
Series([], Name: 文书类型, dtype: int64)
Series([], dtype: int64)


