In [1]:
import pandas as pd
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
import geopandas as gpd

In [2]:
wd = '/Users/kevin/Dropbox/legal_censorship/'

In [3]:
df_uncensor = pd.read_csv(wd+'Data/master_uncensor.csv')

# Research question: are censored documents differentially ...
    # (0) dealing with firms?
    # (1) dealing with cases where local litigants are involved?
    # (2) dealing with cases where local firm defendants are winning in a cross-regional lawsuit?
    # (3) coming from higher-level courts?

# Let's first focus on the random sample (as our control group), and for now we're focusing on the lawsuits where we can properly define both parties of the litigants.

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# 6.7% of traffic accidents!
df_uncensor['traffic_accident'] = df_uncensor['案件名称'].apply(lambda x: x.find('交通事故')!=-1)
print(df_uncensor.traffic_accident.mean())
df_uncensor = df_uncensor[df_uncensor.traffic_accident == False]

0.06296655964530254


In [5]:
df_uncensor.columns

Index(['docid', '案件名称', '案号', '审判程序', '裁判日期', '发布日期', '法院名称', '案件类型', '公开类型',
       '文书类型', '案由', '原告loc', '被告loc', 'type_原被告', '上诉', '原告pay', '被告pay',
       '类型_pay', '案件受理费', '受理费合计', '受理费减半', '受理费免收', '起诉', '申请', '立案', '受理',
       '撤诉', '开庭', '原告firm', '被告firm', 'court_code', '原告loc_mode',
       '被告loc_mode', 'county_level', 'plaintiff_win', 'traffic_accident'],
      dtype='object')

In [6]:
df_uncensor = df_uncensor[df_uncensor.type_原被告 == '正常']
# Explode, merge with SOE data, and collapse to case level
df_uncensor['原告loc'] = df_uncensor['原告loc'].apply(literal_eval)
df_uncensor['被告loc'] = df_uncensor['被告loc'].apply(literal_eval)

In [7]:
df_uncensor['原告firm'] = df_uncensor['原告loc'].apply(lambda x: [y[0].find('公司')!= -1 for y in x])
df_uncensor['被告firm'] = df_uncensor['被告loc'].apply(lambda x: [y[0].find('公司')!= -1 for y in x])
df_uncensor['原告firm'] = df_uncensor['原告firm'].apply(any)
df_uncensor['被告firm'] = df_uncensor['被告firm'].apply(any)

In [8]:
# Only 10% are firm-to-firm lawsuits
df_uncensor[(df_uncensor.原告firm == True) & (df_uncensor.被告firm == True)].shape[0] / df_uncensor.shape[0]

0.10637608186328808

In [9]:
# First, let's take a few random samples to see what's the nature of firm-to-firm cases, individual-to-firm cases, and firm-to-individual cases
# 1.1 firm-to-firm lawsuits are most directly reasonable
# df_uncensor[(df_uncensor.原告firm == True) & (df_uncensor.被告firm == True)].案件名称.sample(10)
# 1.2 individual-to-firm lawsuits are most directly reasonable
# df_uncensor[(df_uncensor.原告firm == False) & (df_uncensor.被告firm == True)].案件名称.sample(10)

In [10]:
df_uncensor = df_uncensor[(df_uncensor.被告firm == True)]
df_soe = pd.read_stata(wd+'Data/company_data/SOE.dta')
def remove_alias(name):
    # some names are of the form XXX(曾用名:XXX), for the first iteration we drop the alias in the parathesis
    reObj = re.match(r'(.*)\((.*)\)', name)
    if reObj:
        return reObj.group(1)
    return name
df_soe['ent_name'] = df_soe['ent_name'].apply(remove_alias)
df_soe.drop_duplicates('ent_name', inplace=True)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


In [11]:
df_uncensor.drop_duplicates('docid', inplace=True)
df_temp = df_uncensor[['docid', '原告loc']].explode('原告loc')
df_temp['ent_name'] = df_temp['原告loc'].apply(lambda x: x[0] if str(x)!='nan' else np.nan)
df_temp = pd.merge(df_temp, df_soe, on='ent_name', how='left', validate='m:1')
df_temp = df_temp[['docid', 'soe', 'soe_connected', 'central_soe', 'central_soe_connected']].groupby('docid').agg(max).reset_index()
df_temp.columns = ['docid', 'soe_plaintiff', 'soe_connected_plaintiff', 'central_soe_plaintiff', 'central_soe_connected_plaintiff']
df_temp.fillna(0, inplace=True)
df_uncensor = pd.merge(df_uncensor, df_temp, on='docid', how='left', validate='1:1')

In [12]:
df_temp = df_uncensor[['docid', '被告loc']].explode('被告loc')
df_temp['ent_name'] = df_temp['被告loc'].apply(lambda x: x[0] if str(x)!='nan' else np.nan)
df_temp = pd.merge(df_temp, df_soe, on='ent_name', how='left', validate='m:1')
df_temp = df_temp[['docid', 'soe', 'soe_connected', 'central_soe', 'central_soe_connected']].groupby('docid').agg(max).reset_index()
df_temp.columns = ['docid', 'soe_defendant', 'soe_connected_defendant', 'central_soe_defendant', 'central_soe_connected_defendant']
df_temp.fillna(0, inplace=True)
df_uncensor = pd.merge(df_uncensor, df_temp, on='docid', how='left', validate='1:1')

In [13]:
# Summary statatistics for the uncensored part
for col in ['soe_plaintiff', 'soe_connected_plaintiff', 'central_soe_plaintiff', 'central_soe_connected_plaintiff', 'soe_defendant', 'soe_connected_defendant', 'central_soe_defendant', 'central_soe_connected_defendant']:
    print(col, df_uncensor[col].mean())

soe_plaintiff 0.021334758028388023
soe_connected_plaintiff 0.01391791645437479
central_soe_plaintiff 9.319201490143314e-05
central_soe_connected_plaintiff 0.00142770167440176
soe_defendant 0.03194994851946831
soe_connected_defendant 0.0343928225338459
central_soe_defendant 8.946433081291616e-05
central_soe_connected_defendant 0.006502317264676094


In [14]:
# We observe that actually quite a lot of the so-called SOE related cases are not firm-related cases, but instead traffice accidents that loops in an insurance company
# We should exclude them

In [3]:
# Articles being censored -- takes 9 minutes to load
df_censor = pd.read_csv(wd+'Data/master.csv')
del df_censor['全文']

# 6.3% of traffice accidents!
df_censor['traffic_accident'] = df_censor['案件名称'].apply(lambda x: x.find('交通事故')!=-1)
print(df_censor.traffic_accident.mean())
df_censor = df_censor[df_censor.traffic_accident == False]

  exec(code_obj, self.user_global_ns, self.user_ns)


0.06502694846337857


In [5]:
df_censor.shape

(2701168, 41)

In [14]:
df_censor[df_censor.案号 == "（2018）苏0925民初6068号"]

Unnamed: 0,文书ID,案件名称,案号,审判程序,裁判日期,发布日期,法院名称,案件类型,公开类型,文书类型,...,受理,撤诉,开庭,court_date,plantifflose,winner,loser,local_winner,local_loser,traffic_accident
104600,60c5d101c74543158fe4aa0e00c00795,何乃林与颜以旭、颜强强、颜廷玲、孙连海、盐城市龙鼎建筑装饰工程有限公司、盐城亚海商贸有限公司...,（2018）苏0925民初6068号,一审,,2019-03-12,建湖县人民法院,2,,,...,,,,,True,[],[],,,False


In [16]:
df_censor = df_censor[df_censor.type_原被告 == '正常']
# Explode, merge with SOE data, and collapse to case level
df_censor['原告loc'] = df_censor['原告loc'].apply(literal_eval)
df_censor['被告loc'] = df_censor['被告loc'].apply(literal_eval)

df_censor['原告firm'] = df_censor['原告loc'].apply(lambda x: [y[0].find('公司')!= -1 for y in x])
df_censor['被告firm'] = df_censor['被告loc'].apply(lambda x: [y[0].find('公司')!= -1 for y in x])
df_censor['原告firm'] = df_censor['原告firm'].apply(any)
df_censor['被告firm'] = df_censor['被告firm'].apply(any)
df_censor = df_censor[(df_censor.被告firm == True)]

In [17]:
df_censor.drop_duplicates('docid', inplace=True)
df_temp = df_censor[['docid', '原告loc']].explode('原告loc')
df_temp['ent_name'] = df_temp['原告loc'].apply(lambda x: x[0] if str(x)!='nan' else np.nan)
df_temp = pd.merge(df_temp, df_soe, on='ent_name', how='left', validate='m:1')
df_temp = df_temp[['docid', 'soe', 'soe_connected', 'central_soe', 'central_soe_connected']].groupby('docid').agg(max).reset_index()
df_temp.columns = ['docid', 'soe_plaintiff', 'soe_connected_plaintiff', 'central_soe_plaintiff', 'central_soe_connected_plaintiff']
df_temp.fillna(0, inplace=True)
df_censor = pd.merge(df_censor, df_temp, on='docid', how='left', validate='1:1')

df_temp = df_censor[['docid', '被告loc']].explode('被告loc')
df_temp['ent_name'] = df_temp['被告loc'].apply(lambda x: x[0] if str(x)!='nan' else np.nan)
df_temp = pd.merge(df_temp, df_soe, on='ent_name', how='left', validate='m:1')
df_temp = df_temp[['docid', 'soe', 'soe_connected', 'central_soe', 'central_soe_connected']].groupby('docid').agg(max).reset_index()
df_temp.columns = ['docid', 'soe_defendant', 'soe_connected_defendant', 'central_soe_defendant', 'central_soe_connected_defendant']
df_temp.fillna(0, inplace=True)
df_censor = pd.merge(df_censor, df_temp, on='docid', how='left', validate='1:1')

In [18]:
# Summary statatistics for the censored part
for col in ['soe_plaintiff', 'soe_connected_plaintiff', 'central_soe_plaintiff', 'central_soe_connected_plaintiff', 'soe_defendant', 'soe_connected_defendant', 'central_soe_defendant', 'central_soe_connected_defendant']:
    print(col, df_censor[col].mean())

soe_plaintiff 0.02126302756369114
soe_connected_plaintiff 0.016539691016077995
central_soe_plaintiff 0.00021790157188661397
central_soe_connected_plaintiff 0.00135387375485152
soe_defendant 0.0250378530472517
soe_connected_defendant 0.030986886471509933
central_soe_defendant 7.530421862611547e-05
central_soe_connected_defendant 0.005098255816847086


In [19]:
# Who wins?
df_censor['plaintiff_win'] = df_censor['被告pay'] / (df_censor['原告pay'] + df_censor['被告pay'])
df_uncensor['plaintiff_win'] = df_uncensor['被告pay'] / (df_uncensor['原告pay'] + df_uncensor['被告pay'])
# Who wins more?
# For SOE plaintiff, they win more in censored cases
print(df_censor[(df_censor.soe_connected_plaintiff == True)].plaintiff_win.mean())
print(df_uncensor[(df_uncensor.soe_connected_plaintiff == True)].plaintiff_win.mean())
# For SOE defendent, they loss more in censored cases
print(df_censor[(df_censor.soe_defendant == True) | (df_censor.soe_connected_defendant == True)].plaintiff_win.mean()) # Defendant win rate: 35%
print(df_uncensor[(df_uncensor.soe_defendant == True) | (df_uncensor.soe_connected_defendant == True)].plaintiff_win.mean()) # Defendant win rate: 44%

0.8028752365175116
0.6891575387750086
0.6498037322847358
0.5610744514469224


In [20]:
print(df_censor[(df_censor.soe_defendant == True) | (df_censor.soe_connected_defendant == True) | (df_censor.soe_plaintiff == False) | (df_censor.soe_connected_plaintiff == False)].plaintiff_win.mean()) # Defendant win rate: 35%
print(df_uncensor[(df_uncensor.soe_defendant == True) | (df_uncensor.soe_connected_defendant == True) | (df_uncensor.soe_plaintiff == False) | (df_uncensor.soe_connected_plaintiff == False)].plaintiff_win.mean()) # Defendant win rate: 44%

0.702190555013993
0.595670421437652


In [24]:
df_censor[(df_censor.soe_defendant == False) | (df_censor.soe_connected_defendant == False)].被告loc.sample(10)

67142     [[苏州可瑞斯特纺织品有限公司, 江苏省苏州市吴中区甪直镇凌港路66-8号, 320506]...
219474                  [[赤峰博爱皮革箱包有限公司, 内蒙古自治区赤峰市, 150400]]
385302    [[大邑显明西岭水泥有限公司, 四川省成都市大邑县晋原镇光华村（工业集中发展区）, 5101...
170395    [[魏力, 北京市海淀区, 110108], [北京葛洲坝龙湖置业有限公司, 北京市丰台区万...
349622               [[日照市岚山泉祥房地产开发有限责任公司, 日照市岚山区, 371103]]
49783     [[中国石油管道局工程有限公司（原中国石油天然气管道工程有限公司）, 河北省廊坊市广阳区广阳...
61829                      [[河南九酷网络科技有限公司, 注册地河南省, 410000]]
160159    [[泸州市花园房地产开发有限公司, 泸州市龙马大道二段30号, 510500], [泸州市花...
500154             [[亳州市丽都商贸有限责任公司, 亳州市谯城区新华路180号, 341602]]
250327                    [[乐清市保安服务公司, 乐清市城南街道悬浦村, 330382]]
Name: 被告loc, dtype: object

In [None]:
df_censor.loc[67142]

441343      （2014）德城民初字第3298号
241749     （2017）津0104民初5274号
569153     （2019）豫0802民初2477号
345645      （2018）甘0821民初712号
406762    （2017）粤0303民初19203号
946       （2018）粤1971民初14858号
182432      （2015）启民初字第00259号
257176       （2016）冀06民辖终283号
370299        （2016）川03民辖终83号
597843     （2015）鞍西民二初字第1453号
Name: 案号, dtype: object

In [26]:
df_soe[df_soe.ent_name.apply(lambda x: x.find('龙湖置业')!=-1)]

Unnamed: 0,ent_name,company_id,soe,soe_connected,central_soe,central_soe_connected
858294,泰安世纪龙湖置业有限公司,71262658.0,,1.0,,
961910,濮阳市万瑞龙湖置业有限公司,65714658.0,,1.0,,
984333,盐城龙湖置业有限公司,65531039.0,,1.0,,


In [23]:
df_soe[df_soe.ent_name.apply(lambda x: x.find('长通汽车')!=-1)]

Unnamed: 0,ent_name,company_id,soe,soe_connected,central_soe,central_soe_connected
771175,楚雄长通汽车销售服务有限公司,14529286.0,,1.0,,1.0
1127305,重庆市永川区长通汽车维修服务中心配件销售部,,1.0,,,
1133719,重庆广达长通汽车销售服务中心,,1.0,,,


In [3]:
df_temp = pd.read_excel(wd+'Data/civil_docx2019_only_cleaning/extracted_civil_docx2019_only_1.xlsx')

In [4]:
df_temp.案号.sample(10)

332700     （2017）浙0402民初1832号
391598       （2019）川0180司惩21号
633786       （2018）辽0502执异19号
985251     （2016）粤1971民初8728号
97409      （2017）冀1003民初5825号
547723     （2019）豫0102财保1105号
82630        （2015）宁民保字第1252号
656558        （2014）德执行字第176号
445892    （2018）京0113民初16564号
379668     （2018）京0105民初1403号
Name: 案号, dtype: object