## Import

In [1]:
import pandas as pd

In [2]:
# Load the five reviewed sample files. Every file is read with the same
# options: utf-8-sig strips a leading byte-order mark if one is present, and
# the python parsing engine is used.
sample1, sample2, sample3, sample4, sample5 = (
    pd.read_csv(csv_path, encoding="utf-8-sig", engine="python")
    for csv_path in (
        "uc_one_sample_1_reviewed.csv",
        "uc_one_sample_2_reviewed.csv",
        "uc_one_sample_3_reviewed.csv",
        "uc_one_sample_4_reviewed.csv",
        "uc_one_sample_5_reviewed.csv",
    )
)

## Merge All Reviewed Samples

In [3]:
# Stack the five reviewed samples into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index and sort are passed through unchanged.
whole_sample = pd.concat(
    [sample1, sample2, sample3, sample4, sample5],
    ignore_index=True,
    sort=True,
)

In [4]:
# Sanity check: the merged frame must contain exactly as many rows as the
# five input samples combined (the cell displays True when it does).
sum(len(part) for part in (sample1, sample2, sample3, sample4, sample5)) == len(whole_sample)

True

In [5]:
# Rename the "Spam" and "False Positive Phrase" columns to snake_case, then
# keep only the listed columns, in this order.
whole_sample = whole_sample.rename(
    columns={"Spam": "spam", "False Positive Phrase": "false_positive_phrase"}
).loc[
    :,
    [
        'article_content',
        'article_title',
        'entity_hits',
        'entity_name',
        'story_id',
        'story_source',
        'spam',
        'false_positive_phrase',
    ],
]

In [6]:
"""
spam column can contain 4 type of values: n/a, 0, 1, 2:
n/a: the variation is English, we decided to skip those records
0: the variation and article title/contents aligned
1: one place of mistake existing in the article content
2: two places of mistake existing in the article content
"""
whole_sample['spam'].isnull().sum()

133

In [7]:
# Drop the records that have no spam label (n/a), then order the rows by
# entity_hits and spam and rebuild a clean 0..n-1 index.
# The filter looks at `spam` alone: a row that carries a false-positive phrase
# but no label would otherwise survive the filter and make astype(int) below
# fail on the missing value.
whole_sample = (
    whole_sample
    .dropna(subset=["spam"])
    .sort_values(["entity_hits", "spam"])
    .reset_index(drop=True)
)
# Store the spam labels as integers.
whole_sample["spam"] = whole_sample["spam"].astype(int)

In [8]:
# Number of distinct spam labels observed for each entity_hits value,
# listed from the most varied to the least varied.
(
    whole_sample
    .groupby("entity_hits")["spam"]
    .nunique()
    .reset_index()
    .sort_values("spam", ascending=False)
)

Unnamed: 0,entity_hits,spam
55,"[""时代中国""]",3
85,"[""越秀""]",3
0,"[""万科""]",2
82,"[""融创""]",2
50,"[""新城发展""]",2
56,"[""時代中國""]",2
58,"[""朗诗""]",2
43,"[""富力地产""]",2
62,"[""泰禾""]",2
66,"[""碧桂园""]",2


## Data Analysis


### Change the Spam = 2 to Spam = 1

In [9]:
# Independent deep copy of whole_sample used for the statistics; changes made
# to sta_sample do not affect whole_sample.
sta_sample = whole_sample.copy(deep=True)

In [10]:
# Collapse the label 2 into 1, so every record with spam == 2 is stored as 1.
sta_sample.loc[sta_sample["spam"] == 2, "spam"] = 1

### Error Rate by entity_hits

In [11]:
# For each entity_hits value, count the reviewed records (num_records) and sum
# their spam labels (num_error). Assuming spam has already been recoded to
# 0/1, the sum equals the number of records flagged as errors.
# The aggregations are passed by name ("count", "sum"): handing the builtin
# `sum` to a pandas aggregation is deprecated (pandas >= 2.1 emits a
# FutureWarning), and the string form behaves the same on older versions.
per_sample = (
    sta_sample
    .groupby("entity_hits")["spam"]
    .agg(["count", "sum"])
    .rename(columns={"count": "num_records", "sum": "num_error"})
    .reset_index()
)
# Error rate per entity_hits value.
per_sample["percentage"] = per_sample["num_error"] / per_sample["num_records"]
# Display the 20 entity_hits values with the highest error rate.
per_sample.sort_values("percentage", ascending=False).head(20)

Unnamed: 0,entity_hits,num_records,num_error,percentage
84,"[""融創""]",2,2,1.0
26,"[""力高""]",129,127,0.984496
55,"[""时代中国""]",162,155,0.95679
77,"[""花樣年""]",7,6,0.857143
82,"[""融创""]",479,352,0.734864
85,"[""越秀""]",88,62,0.704545
28,"[""北京建设""]",10,7,0.7
13,"[""仁恒置地""]",3,2,0.666667
86,"[""路劲""]",5,3,0.6
56,"[""時代中國""]",5,3,0.6


### Error Rate by Length of entity_hits

In [12]:
# Length of each entity_hits string after removing its first two and last two
# characters. The displayed values look like '["<name>"]', so this is
# presumably the character count of the bare name; verify for multi-entity hits.
per_sample["num"] = [len(hit[2:-2]) for hit in per_sample["entity_hits"]]

In [13]:
# Rows whose num equals 2, ordered from highest to lowest error rate.
per_sample.loc[per_sample["num"] == 2].sort_values(by="percentage", ascending=False)

Unnamed: 0,entity_hits,num_records,num_error,percentage,num
84,"[""融創""]",2,2,1.0,2
26,"[""力高""]",129,127,0.984496,2
82,"[""融创""]",479,352,0.734864,2
85,"[""越秀""]",88,62,0.704545,2
86,"[""路劲""]",5,3,0.6,2
58,"[""朗诗""]",5,1,0.2,2
62,"[""泰禾""]",41,3,0.073171,2
0,"[""万科""]",864,12,0.013889,2


In [14]:
# Max, unweighted mean and min of the per-entity error rate where num equals 2.
per_sample.loc[per_sample["num"] == 2, "percentage"].agg(["max", "mean", "min"])

max     1.000000
mean    0.538871
min     0.013889
Name: percentage, dtype: float64

In [15]:
# Rows whose num equals 3, ordered from highest to lowest error rate.
per_sample.loc[per_sample["num"] == 3].sort_values(by="percentage", ascending=False)

Unnamed: 0,entity_hits,num_records,num_error,percentage,num
77,"[""花樣年""]",7,6,0.857143,3
30,"[""华南城""]",13,7,0.538462,3
75,"[""花样年""]",28,13,0.464286,3
68,"[""碧桂園""]",6,1,0.166667,3
14,"[""佳兆业""]",52,2,0.038462,3
91,"[""阳光城""]",75,2,0.026667,3
66,"[""碧桂园""]",231,1,0.004329,3
16,"[""佳兆業""]",2,0,0.0,3


In [16]:
# Max, unweighted mean and min of the per-entity error rate where num equals 3.
per_sample.loc[per_sample["num"] == 3, "percentage"].agg(["max", "mean", "min"])

max     0.857143
mean    0.262002
min     0.000000
Name: percentage, dtype: float64

In [17]:
# Ten rows with the highest error rate among those whose num equals 4.
per_sample.loc[per_sample["num"] == 4].sort_values(by="percentage", ascending=False).head(10)

Unnamed: 0,entity_hits,num_records,num_error,percentage,num
55,"[""时代中国""]",162,155,0.95679,4
28,"[""北京建设""]",10,7,0.7,4
13,"[""仁恒置地""]",3,2,0.666667,4
56,"[""時代中國""]",5,3,0.6,4
50,"[""新城发展""]",12,3,0.25,4
32,"[""华夏幸福""]",159,16,0.100629,4
43,"[""富力地产""]",29,1,0.034483,4
2,"[""世茂集团""]",7,0,0.0,4
63,"[""泰禾集团""]",61,0,0.0,4
51,"[""新城發展""]",1,0,0.0,4


In [18]:
# Max, unweighted mean and min of the per-entity error rate where num equals 4.
per_sample.loc[per_sample["num"] == 4, "percentage"].agg(["max", "mean", "min"])

max     0.956790
mean    0.059082
min     0.000000
Name: percentage, dtype: float64

## Conclusion

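1. Longer variations are more accurate at extracting relevant information: the mean per-variation error rate falls from about 54% for 2-character variations to about 26% for 3-character and about 6% for 4-character variations.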
2. We suggest removing the following variations, because their error rates are greater than 50%:
    - 融創
    - 力高
    - 时代中国
    - 花樣年
    - 融创
    - 越秀
    - 北京建设
    - 仁恒置地
    - 路劲
    - 時代中國
    - 华南城