## Import

In [1]:
import pandas as pd
import glob

In [2]:
# Load the five reviewed sample files with one shared set of parser options.
READ_OPTIONS = {"encoding": "utf-8-sig", "engine": "python"}

sample1 = pd.read_csv("uc_one_sample_1_reviewed.csv", **READ_OPTIONS)
sample2 = pd.read_csv("uc_one_sample_2_reviewed.csv", **READ_OPTIONS)
sample3 = pd.read_csv("uc_one_sample_3_reviewed.csv", **READ_OPTIONS)
sample4 = pd.read_csv("uc_one_sample_4_reviewed.csv", **READ_OPTIONS)
sample5 = pd.read_csv("uc_one_sample_5_reviewed.csv", **READ_OPTIONS)

## Merge all samples

In [3]:
# Stack the five reviewed samples into one frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True renumbers the rows from 0, and
# sort=True sorts the column axis when the inputs' columns are not aligned,
# exactly as the former append call did.
whole_sample = pd.concat(
    [sample1, sample2, sample3, sample4, sample5],
    ignore_index=True,
    sort=True,
)

In [4]:
# Sanity check: the merged frame must hold exactly as many rows as the five
# input samples combined.
expected_rows = sum(len(part) for part in (sample1, sample2, sample3, sample4, sample5))
len(whole_sample) == expected_rows

True

In [5]:
# Display the column labels of the merged frame.
whole_sample.columns

Index(['402', 'False Positive Phrase', 'Spam', 'Unnamed: 6', 'article_content',
       'article_title', 'entity_hits', 'entity_name', 'story_id',
       'story_source'],
      dtype='object')

In [6]:
# Give the two review columns snake_case names.
column_renames = {
    "Spam": "spam",
    "False Positive Phrase": "false_positive_phrase",
}
whole_sample = whole_sample.rename(columns=column_renames)

In [7]:
# Keep only these eight columns, in this order; all other columns are dropped.
kept_columns = [
    "article_content",
    "article_title",
    "entity_hits",
    "entity_name",
    "story_id",
    "story_source",
    "spam",
    "false_positive_phrase",
]
whole_sample = whole_sample[kept_columns]

In [8]:
# Number of rows whose "spam" value is missing.
whole_sample["spam"].isna().sum()

131

In [9]:
# Discard rows in which both review columns are missing (how="all"), then
# order the remaining rows by entity_hits and spam and renumber them from 0.
whole_sample = (
    whole_sample
    .dropna(subset=["spam", "false_positive_phrase"], how="all")
    .sort_values(["entity_hits", "spam"])
    .reset_index(drop=True)
)

In [10]:
# Remove the rows labelled 0 and 1, which the author treats as invalid.
# NOTE(review): the labels are hard-coded; confirm they still point at the
# intended rows if the input data or the preceding sort changes.
whole_sample = whole_sample.drop(index=[0, 1])

In [11]:
# Preview the first five rows, renumbered from 0 for display only: the result
# is not assigned, so whole_sample itself keeps its current index.
whole_sample.head().reset_index(drop=True)

Unnamed: 0,article_content,article_title,entity_hits,entity_name,story_id,story_source,spam,false_positive_phrase
0,"这边发债圈钱,那边出手抢地,上市房企正掀起前所未有的超大规模发债热潮。羊城晚报记者粗略统计,...",9月以来房企发债超千亿 四季度或成拿地窗口期,"[""万科""]",CHINA VANKE CO LTD H,ObjectId(5cfe8d40c50f3e0ecce7e18c),sina.com.cn,0,0.0
1,中国楼市最疯狂的人都已经变得小心翼翼！ 2016年05月18日 11:21 来源： 编辑： ...,ä¸­å½æ¥¼å¸æ ç-¯ççäººé½å·²ç» å å¾--å° ...,"[""万科""]",CHINA VANKE CO LTD H,ObjectId(5d011a4567cd6e1df72b14a3),eastmoney.com,0,0.0
2,首页 > 财经频道 > 正文 知名私募把脉2019：多重博弈下的预期再平衡 2019年01月...,知名私募把脉2019：多重博弈下的预期再平衡,"[""万科""]",CHINA VANKE CO LTD H,ObjectId(5d05b42f43431100ef8bcff7),eastmoney.com,0,0.0
3,千亿活水或驰援A股 蓝筹基金迎布局良机 2016年05月31日 07:32 来源： 采编： ...,千亿活水或驰援A股 蓝筹迎布局良机,"[""万科""]",CHINA VANKE CO LTD H,ObjectId(5d011a0c67cd6e1d892b0a58),eastmoney.com,0,
4,东方财富网APP 方便，快捷 手机查看财经快讯 专业，丰富 一手掌握市场脉搏 手机上阅读文章...,三利好或催生行情大变局,"[""万科""]",CHINA VANKE CO LTD H,ObjectId(5d010ba867cd6e0a702b2df1),eastmoney.com,0,


In [12]:
# Cast the "spam" column to int so that all of its values share one dtype.
whole_sample = whole_sample.astype({"spam": int})

## Statistical analysis of the results
### Change the Spam == 2 to Spam == 1

In [13]:
# Work on an independent (deep) copy so that later changes to sta_sample do
# not modify whole_sample.
sta_sample = whole_sample.copy(deep=True)

In [14]:
# Recode every "spam" value of 2 as 1.
is_two = sta_sample["spam"].eq(2)
idx = sta_sample.index[is_two]
sta_sample.loc[idx, "spam"] = 1

### Calculate the false (spam) percentage for each distinct "entity_hits" value

In [15]:
# For every entity_hits value, compute "count" (number of spam labels) and
# "sum" (total of the spam labels).
per_sample = pd.pivot_table(
    sta_sample,
    values="spam",
    index="entity_hits",
    # Use the string alias "sum" rather than the builtin `sum`: newer pandas
    # releases deprecate passing builtin callables as aggregation functions.
    # The resulting column is still named "sum".
    aggfunc={"spam": ["count", "sum"]},
).reset_index()

# Despite its name, "percentage" is the fraction sum / count (not scaled to 100).
per_sample["percentage"] = per_sample["sum"] / per_sample["count"]

# Show the ten entity_hits values with the highest fraction.
per_sample.sort_values(by="percentage", ascending=False).head(10)

Unnamed: 0,entity_hits,count,sum,percentage
84,"[""融創""]",2,2,1.0
26,"[""力高""]",129,127,0.984496
55,"[""时代中国""]",162,155,0.95679
77,"[""花樣年""]",7,6,0.857143
82,"[""融创""]",479,352,0.734864
85,"[""越秀""]",88,62,0.704545
28,"[""北京建设""]",10,7,0.7
13,"[""仁恒置地""]",3,2,0.666667
86,"[""路劲""]",5,3,0.6
56,"[""時代中國""]",5,3,0.6


### The false percentage based on different "entity_hits"

In [16]:
# Turn each raw entity_hits string into a list of its individual characters.
#
# Step 1 removes the list punctuation ([, ], ") in a single pass. The pattern is
# a raw string and regex=True is stated explicitly: the default of
# Series.str.replace became regex=False in pandas 2.0, under which the former
# '\[' and '\]' patterns are literal backslash sequences that never match, so
# the brackets would silently stay in the data.
#
# Step 2 inserts a space around every character (literal replacement of the
# empty string) and splits on whitespace, which yields one list item per
# non-whitespace character.
per_sample['entity_hits'] = (
    per_sample['entity_hits']
    .str.replace(r'[\[\]"]', '', regex=True)
    .str.replace('', ' ', regex=False)
    .str.split()
)

In [17]:
# Store the length of each entity_hits value in the "num" column.
per_sample['num'] = per_sample['entity_hits'].map(len)

### Results grouped by the number of characters in "entity_hits"

In [18]:
# Rows with num == 2, highest "percentage" first.
per_sample.loc[per_sample["num"] == 2].sort_values(by="percentage", ascending=False)

Unnamed: 0,entity_hits,count,sum,percentage,num
84,"[融, 創]",2,2,1.0,2
26,"[力, 高]",129,127,0.984496,2
82,"[融, 创]",479,352,0.734864,2
85,"[越, 秀]",88,62,0.704545,2
86,"[路, 劲]",5,3,0.6,2
58,"[朗, 诗]",5,1,0.2,2
62,"[泰, 禾]",41,3,0.073171,2
0,"[万, 科]",864,12,0.013889,2


In [19]:
# Summary statistics of "percentage" over the rows with num == 2.
per_sample.loc[per_sample["num"] == 2, "percentage"].agg(['max', 'mean', 'min', 'average'])

max        1.000000
mean       0.538871
min        0.013889
average    0.538871
Name: percentage, dtype: float64

In [20]:
# Rows with num == 3, highest "percentage" first.
per_sample.loc[per_sample["num"] == 3].sort_values(by="percentage", ascending=False)

Unnamed: 0,entity_hits,count,sum,percentage,num
77,"[花, 樣, 年]",7,6,0.857143,3
30,"[华, 南, 城]",13,7,0.538462,3
75,"[花, 样, 年]",28,13,0.464286,3
68,"[碧, 桂, 園]",6,1,0.166667,3
14,"[佳, 兆, 业]",52,2,0.038462,3
91,"[阳, 光, 城]",75,2,0.026667,3
66,"[碧, 桂, 园]",231,1,0.004329,3
16,"[佳, 兆, 業]",2,0,0.0,3


In [21]:
# Summary statistics of "percentage" over the rows with num == 3.
per_sample.loc[per_sample["num"] == 3, "percentage"].agg(['max', 'mean', 'min', 'average'])

max        0.857143
mean       0.262002
min        0.000000
average    0.262002
Name: percentage, dtype: float64

In [22]:
# First five rows with num == 4, highest "percentage" first.
per_sample.loc[per_sample["num"] == 4].sort_values(by="percentage", ascending=False).head()

Unnamed: 0,entity_hits,count,sum,percentage,num
55,"[时, 代, 中, 国]",162,155,0.95679,4
28,"[北, 京, 建, 设]",10,7,0.7,4
13,"[仁, 恒, 置, 地]",3,2,0.666667,4
56,"[時, 代, 中, 國]",5,3,0.6,4
50,"[新, 城, 发, 展]",12,3,0.25,4


In [23]:
# Summary statistics of "percentage" over the rows with num == 4.
per_sample.loc[per_sample["num"] == 4, "percentage"].agg(['max', 'mean', 'min', 'average'])

max        0.956790
mean       0.059082
min        0.000000
average    0.059082
Name: percentage, dtype: float64