In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
import time
import jieba

In [2]:
## 设置字符集，防止中文乱码
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False

In [3]:
# jupyter展示图片，非内嵌显示; 这段代码不属于python代码； 相当于是jupyter的工具参数设置
# tk: 显示出来，inline：内嵌显示，默认为inline
%matplotlib tk

In [4]:
# 1. 文件数据读取
df = pd.read_csv('../data/result_process01', sep=',', header=None, names=['from', 'to', 'date', 'content', 'label'])
df.head(10)

Unnamed: 0,from,to,date,content,label
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0
2,=?GB2312?B?1cW6o8TP?= <jian@163.con>,xing@ccert.edu.cn,Sun 14 Aug 2005 10:17:57 +0800,尊敬的贵公司(财务/经理)负责人您好！我是深圳金海实业有限公司（广州。东莞）等省市有分公司。...,1.0
3,=?GB2312?B?tPq/qreixrE=?= <pan@12.com>,ling@ccert.edu.cn,Sun 14 Aug 2005 10:19:02 +0800,贵公司负责人(经理/财务）您好：深圳市华龙公司受多家公司委托向外低点代开部分增值税电脑发票（...,1.0
4,mei <mei@dghhkjk.com>,tang@ccert.edu.cn,Sun 14 Aug 2005 10:21:22 +0800,这是一封HTML格式信件！---------------------------------...,1.0
5,"ke@163.com"" <chunyang-sz@163.com>",yuan@ccert.edu.cn,Sun 14 Aug 2005 10:22:10 +0800,TO：贵公司经理、财务您好！深圳市春洋贸易有限公司（东莞分公司）我司本着互惠互利的优势和良好...,1.0
6,hong <hong@jdl.ac.cn>,yu@ccert.edu.cn,Sun 14 Aug 2005 10:23:37 +0800,那他为什么不愿意起诉，既然这样了！起诉后也有充分的理由！MM莫不是还生活在电影中，个人认为这...,0.0
7,=?GB2312?B?wbrPyMn6?= <jiang@tom.com>,li@ccert.edu.cn,Sun 14 Aug 2005 10:26:36 +0800,尊敬的负责人（经理／财务）：您好！我是深圳伟仕嘉贸易有公司：兴办贸易、物资供销，实力雄厚；有...,1.0
8,han <han@davidchans.com>,lai@ccert.edu.cn,Sun 14 Aug 2005 10:27:40 +0800,您好 以下是特别为阁下发的香港信息(图片、景点、BBS等) 不知道阁下是否喜...希望没有打...,1.0
9,hou <hou@jdl.ac.cn>,li@ccert.edu.cn,Sun 14 Aug 2005 10:31:20 +0800,我觉得，负债不要紧，最重要的是能负得起这个责任来，欠了那么多钱，至少对当初拿出爱心来的网友们...,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64620 entries, 0 to 64619
Data columns (total 5 columns):
from       64615 non-null object
to         64620 non-null object
date       64619 non-null object
content    64284 non-null object
label      64619 non-null float64
dtypes: float64(1), object(4)
memory usage: 2.5+ MB


# 发件箱和收件箱相关特征处理

In [6]:
# 2. 特征工程1 => 提取发件人和收件人的邮件服务器地址
def extract_email_server_address(str1):
    it = re.findall(r"@([A-Za-z0-9]+\.[A-Za-z0-9\.]+)", str(str1))
    result = ''
    if len(it) > 0:
        result = it[0]
    if not result:
        result = 'unknown'
    return result

df['to_address'] = pd.Series(map(lambda to_email: extract_email_server_address(to_email), df['to']))
df['from_address'] = pd.Series(map(lambda from_email: extract_email_server_address(from_email), df['from']))

df.head(4)

Unnamed: 0,from,to,date,content,label,to_address,from_address
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0,ccert.edu.cn,jdl.ac.cn
2,=?GB2312?B?1cW6o8TP?= <jian@163.con>,xing@ccert.edu.cn,Sun 14 Aug 2005 10:17:57 +0800,尊敬的贵公司(财务/经理)负责人您好！我是深圳金海实业有限公司（广州。东莞）等省市有分公司。...,1.0,ccert.edu.cn,163.con
3,=?GB2312?B?tPq/qreixrE=?= <pan@12.com>,ling@ccert.edu.cn,Sun 14 Aug 2005 10:19:02 +0800,贵公司负责人(经理/财务）您好：深圳市华龙公司受多家公司委托向外低点代开部分增值税电脑发票（...,1.0,ccert.edu.cn,12.com


In [7]:
# 2. 特征工程1 => 查看邮件服务器的数量
print("========to address=======================")
print(df.to_address.value_counts().head(5))
print("总邮件接收服务器类别数量为:" + str(df.to_address.unique().shape))
print("========from address=======================")
print(df.from_address.value_counts().head(5))
print("总邮件发送服务器类别数量为:" + str(df.from_address.unique().shape))

ccert.edu.cn    64407
unknown           193
yahoo.com.cn        8
163.net             3
quanso.com          2
Name: to_address, dtype: int64
总邮件接收服务器类别数量为:(12,)
163.com                  7500
mail.tsinghua.edu.cn     6498
126.com                  5822
tom.com                  4075
mails.tsinghua.edu.cn    3205
Name: from_address, dtype: int64
总邮件发送服务器类别数量为:(3566,)


In [8]:
# 对于发件箱而言，考虑一下发件数目比较多的服务器进行特殊考虑，将从这些服务器过来的数据作为一个特殊的类别
from_address_df = df.from_address.value_counts().to_frame()
len_less_10_from_address_count = from_address_df[from_address_df.from_address < 1000].shape
print("发送邮件数量小于指定值封的服务器数量为:" + str(len_less_10_from_address_count))

tdf = from_address_df[from_address_df.from_address >= 1000]
tdf = tdf.reset_index()
print("发送邮件数目大于等于指定值的邮箱分别是:")
print(np.array(tdf['index']))

# 对于每个发件箱进行处理
# df['from_163'] = pd.Series(map(lambda s: 1 if s=='163.com' else 0, df['from_address']))
# df['from_tsinghua'] = pd.Series(map(lambda s: int(s == 'mail.tsinghua.edu.cn' or s == 'mails.tsinghua.edu.cn'), df['from_address']))
# df['from_126'] = pd.Series(map(lambda s: int(s == '126.com'), df['from_address']))
# df['from_yahoo'] = pd.Series(map(lambda s: 1 if s=='yahoo.com.cn' else 0, df['from_address']))
# df['from_21cn'] = pd.Series(map(lambda s: 1 if s=='21cn.com' else 0, df['from_address']))
# df['from_tom'] = pd.Series(map(lambda s: int(s == 'tom.com'), df['from_address']))
# df['from_12'] = pd.Series(map(lambda s: int(s == '12.com'), df['from_address']))
# df['from_cernet'] = pd.Series(map(lambda s: int(s == 'cernet.com'), df['from_address']))
# df['from_sohu'] = pd.Series(map(lambda s: int(s == 'sohu.com'), df['from_address']))
# df['from_unknown'] = pd.Series(map(lambda s: int(s == 'unknown'), df['from_address']))


# df.head(10)

发送邮件数量小于指定值封的服务器数量为:(3555, 1)
发送邮件数目大于等于指定值的邮箱分别是:
['163.com' 'mail.tsinghua.edu.cn' '126.com' 'tom.com'
 'mails.tsinghua.edu.cn' 'unknown' '12.com' 'cernet.com' 'sohu.com'
 'yahoo.com.cn' '21cn.com']


In [9]:
# 查看一下各个邮箱服务器发送邮件的情况
print("所有发送邮件的情况")
print(df.from_address.value_counts().head(11))
print("所有正常邮件的发送情况")
print(df[df.label == 0.0].from_address.value_counts().head(11))
print("所有异常邮件的发送情况")
print(df[df.label == 1.0].from_address.value_counts().head(11))

所有发送邮件的情况
163.com                  7500
mail.tsinghua.edu.cn     6498
126.com                  5822
tom.com                  4075
mails.tsinghua.edu.cn    3205
unknown                  2099
12.com                   2057
cernet.com               1537
sohu.com                 1430
yahoo.com.cn             1374
21cn.com                 1166
Name: from_address, dtype: int64
所有正常邮件的发送情况
mail.tsinghua.edu.cn      6495
mails.tsinghua.edu.cn     3203
126.com                   1569
cernet.com                1537
163.com                    971
sohu.com                   834
jdl.ac.cn                  775
mail.nisac.gov.cn          669
scan.ccert.edu.cn          523
ns.6test.edu.cn            452
stu.ee.tsinghua.edu.cn     420
Name: from_address, dtype: int64
所有异常邮件的发送情况
163.com         6529
126.com         4253
tom.com         4075
12.com          2057
unknown         1996
21cn.com        1166
yahoo.com.cn    1061
sina.com         771
sohu.com         596
yahoo.com        461
sian.com         43

In [10]:
# 对于每个发件箱进行处理(因为下列这些邮件的发送服务器产生的邮件中，正常邮件和异常邮件的比例比较悬殊)
# 如果邮件是从163.com、126.com、21cn.com、tom.com、12.com等这些服务器发送过来的，那么有很大的可能直接属于垃圾邮件
# 如果邮件是从mail.tsinghua.edu.cn、mails.tsinghua.edu.cn、cernet.com等这些服务器发送过程了，那么有很大可能直接属于正常邮件
df['from_163'] = pd.Series(map(lambda s: 1 if s=='163.com' else 0, df['from_address']))
df['from_tsinghua'] = pd.Series(map(lambda s: int(s == 'mail.tsinghua.edu.cn' or s == 'mails.tsinghua.edu.cn'), df['from_address']))
df['from_126'] = pd.Series(map(lambda s: int(s == '126.com'), df['from_address']))
df['from_yahoo'] = pd.Series(map(lambda s: 1 if s=='yahoo.com.cn' else 0, df['from_address']))
df['from_21cn'] = pd.Series(map(lambda s: 1 if s=='21cn.com' else 0, df['from_address']))
df['from_tom'] = pd.Series(map(lambda s: int(s == 'tom.com'), df['from_address']))
df['from_12'] = pd.Series(map(lambda s: int(s == '12.com'), df['from_address']))
df['from_cernet'] = pd.Series(map(lambda s: int(s == 'cernet.com'), df['from_address']))
df['from_sohu'] = pd.Series(map(lambda s: int(s == 'sohu.com'), df['from_address']))
df['from_unknown'] = pd.Series(map(lambda s: int(s == 'unknown'), df['from_address']))

# 时间特征属性的处理

In [11]:
# 首先看一下现有的数据内容
df.head(1)

Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,from_yahoo,from_21cn,from_tom,from_12,from_cernet,from_sohu,from_unknown
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown,0,0,0,0,0,0,0,0,0,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64620 entries, 0 to 64619
Data columns (total 17 columns):
from             64615 non-null object
to               64620 non-null object
date             64619 non-null object
content          64284 non-null object
label            64619 non-null float64
to_address       64620 non-null object
from_address     64620 non-null object
from_163         64620 non-null int64
from_tsinghua    64620 non-null int64
from_126         64620 non-null int64
from_yahoo       64620 non-null int64
from_21cn        64620 non-null int64
from_tom         64620 non-null int64
from_12          64620 non-null int64
from_cernet      64620 non-null int64
from_sohu        64620 non-null int64
from_unknown     64620 non-null int64
dtypes: float64(1), int64(10), object(6)
memory usage: 8.4+ MB


In [13]:
# 查看一下date数据
df.date

0             Tue 30 Aug 2005 10:08:15 +0800
1             Sun 14 Aug 2005 10:16:47 +0800
2             Sun 14 Aug 2005 10:17:57 +0800
3             Sun 14 Aug 2005 10:19:02 +0800
4             Sun 14 Aug 2005 10:21:22 +0800
5             Sun 14 Aug 2005 10:22:10 +0800
6             Sun 14 Aug 2005 10:23:37 +0800
7             Sun 14 Aug 2005 10:26:36 +0800
8             Sun 14 Aug 2005 10:27:40 +0800
9             Sun 14 Aug 2005 10:31:20 +0800
10            Sun 14 Aug 2005 10:31:44 +0800
11            Sun 14 Aug 2005 10:33:05 +0800
12            Sun 14 Aug 2005 10:33:30 +0800
13            Sun 14 Aug 2005 02:40:30 -0600
14            Sun 14 Aug 2005 10:35:06 +0800
15            Sun 14 Aug 2005 10:37:57 +0800
16            Sun 14 Aug 2005 10:44:13 +0800
17            Tue 16 Aug 2005 10:43:31 +0800
18            Sun 14 Aug 2005 10:43:51 +0800
19            Sun 14 Aug 2005 10:46:49 +0800
20            Sun 14 Aug 2005 10:48:25 +0800
21            Sat 16 Jul 2005 00:23:51 +0800
22        

In [14]:
# 看一下原始数据中日期数据到底有哪些形式
dates = np.unique(list(map(lambda t: str(t).strip(), df.date)))
date_lengths = np.unique(list(map(lambda t: len(t), dates)))
print("各个字符串的长度:")
print(date_lengths)
print("打印各个长度对应的字符串值(每个长度最多打印10条)")
for length in date_lengths:
    tmplist = list(filter(lambda t: len(t) == length, dates))
    print(length, end='\t')
    print(len(tmplist), end='\t')
    print(tmplist[:10])
    print()

各个字符串的长度:
[ 3  7 16 19 21 23 24 26 27 28 29 30 31 32 33 34 35 36 45 46 57 58 61 62]
打印各个长度对应的字符串值(每个长度最多打印10条)
3	1	['nan']

7	1	['unknown']

16	2	['2005-9-2 上午10:55', '2005-9-2 上午11:04']

19	1	['Sep 23 2005 1:04 AM']

21	1	['August 24 2005 5:00pm']

23	9	['Fri 2 Sep 2005 08:17:50', 'Fri 2 Sep 2005 18:01:40', 'Fri 2 Sep 2005 23:08:06', 'Sat 1 Oct 2005 02:47:47', 'Sat 1 Oct 2005 03:52:10', 'Thu 1 Sep 2005 09:42:01', 'Thu 8 Sep 2005 14:18:39', 'Thu 8 Sep 2005 18:35:36', 'Tue 6 Sep 2005 11:05:27']

24	40	['Fri 16 Sep 2005 04:35:34', 'Fri 16 Sep 2005 10:20:18', 'Fri 16 Sep 2005 11:16:04', 'Fri 16 Sep 2005 17:57:28', 'Fri 16 Sep 2005 21:03:05', 'Fri 19 Aug 2005 10:16:38', 'Fri 23 Sep 2005 17:46:34', 'Fri 26 Aug 2005 22:44:48', 'Fri 30 Sep 2005 00:50:39', 'Fri 30 Sep 2005 16:47:54']

26	31	['Sat 1 Oct 2005 00:12:07 UT', 'Sat 1 Oct 2005 00:39:02 UT', 'Sat 1 Oct 2005 00:51:59 UT', 'Sat 1 Oct 2005 01:08:02 UT', 'Sat 1 Oct 2005 01:18:25 UT', 'Sat 1 Oct 2005 01:48:25 UT', 'Sat 1 Oct 2005 03:42:40 

In [15]:
# 3. 特征工程2 => 时间提取
def extract_email_date(str1):
    if not isinstance(str1, str):
        str1 = str(str1)

    str_len = len(str1)
    week = ""
    hour = ""
    # 0表示上午[8,12]，1表示下午[13,18],2表示晚上[19,23],3表示凌晨[0,7]
    time_quantum = ""

    if str_len < 10:
        # unknown
        week = "unknown"
        hour = "unknown"
        time_quantum = "unknown"
    elif str_len == 16:
        # 2005-9-2 上午10:55, 2005-9-2 上午11:04
        rex = r"(\d{2}):\d{2}"
        it = re.findall(rex, str1)
        if len(it) == 1:
            hour = it[0]
        else:
            hour = "unknown"
        week = "Fri"
        time_quantum = "0"
        pass
    elif str_len == 19:
        # Sep 23 2005 1:04 AM
        week = "Sep"
        hour = "01"
        time_quantum = "3"
        pass
    elif str_len == 21:
        # August 24 2005 5:00pm
        week = "Wed"
        hour = "17"
        time_quantum = "1"
        pass
    else:
        rex = r"([A-Za-z]+\d?[A-Za-z]*) .*?(\d{2}):\d{2}:\d{2}.*"
        it = re.findall(rex, str1)
        if len(it) == 1 and len(it[0]) == 2:
            week = it[0][0][-3:]
            hour = it[0][1]
            int_hour = int(hour)
            if int_hour < 8:
                time_quantum = "3"
            elif int_hour < 13:
                time_quantum = "0"
            elif int_hour < 19:
                time_quantum = "1"
            else:
                time_quantum = "2"
            pass
        else:
            week = "unknown"
            hour = "unknown"
            time_quantum = "unknown"

    week = week.lower()
    hour = hour.lower()
    time_quantum = time_quantum.lower()
    return (week, hour, time_quantum)

# 数据转换
date_time_extract_result = list(map(lambda st: extract_email_date(st), df['date']) )
df['date_week'] = pd.Series(map(lambda t: t[0], date_time_extract_result))
df['date_hour'] = pd.Series(map(lambda t: t[1], date_time_extract_result))
df['date_time_quantum'] = pd.Series(map(lambda t: t[2], date_time_extract_result))
df.head(4)

Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,from_yahoo,from_21cn,from_tom,from_12,from_cernet,from_sohu,from_unknown,date_week,date_hour,date_time_quantum
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown,0,0,0,0,0,0,0,0,0,1,tue,10,0
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0,ccert.edu.cn,jdl.ac.cn,0,0,0,0,0,0,0,0,0,0,sun,10,0
2,=?GB2312?B?1cW6o8TP?= <jian@163.con>,xing@ccert.edu.cn,Sun 14 Aug 2005 10:17:57 +0800,尊敬的贵公司(财务/经理)负责人您好！我是深圳金海实业有限公司（广州。东莞）等省市有分公司。...,1.0,ccert.edu.cn,163.con,0,0,0,0,0,0,0,0,0,0,sun,10,0
3,=?GB2312?B?tPq/qreixrE=?= <pan@12.com>,ling@ccert.edu.cn,Sun 14 Aug 2005 10:19:02 +0800,贵公司负责人(经理/财务）您好：深圳市华龙公司受多家公司委托向外低点代开部分增值税电脑发票（...,1.0,ccert.edu.cn,12.com,0,0,0,0,0,0,1,0,0,0,sun,10,0


In [16]:
print("======星期属性字段的描述==========")
print(df.date_week.value_counts().head(8))
print(df[['date_week', 'label']].groupby(['date_week', 'label'])['label'].count())

fri        10859
sat        10316
thu         9780
sun         8509
wed         8410
tue         8132
mon         8059
unknown      554
Name: date_week, dtype: int64
date_week  label
fri        0.0      3884
           1.0      6975
mon        0.0      2568
           1.0      5491
sat        0.0      3681
           1.0      6635
sep        0.0         1
sun        0.0      2785
           1.0      5724
thu        0.0      3330
           1.0      6450
tue        0.0      2733
           1.0      5399
unknown    1.0       553
wed        0.0      2784
           1.0      5626
Name: label, dtype: int64


In [17]:
print("======小时属性字段的描述==========")
print(df.date_hour.value_counts().head(3))
print(df[['date_hour', 'label']].groupby(['date_hour', 'label'])['label'].count())

19    2835
16    2772
15    2750
Name: date_hour, dtype: int64
date_hour  label
00         0.0       904
           1.0      1716
01         0.0       925
           1.0      1791
02         0.0       868
           1.0      1736
03         0.0       839
           1.0      1682
04         0.0       824
           1.0      1771
05         0.0       822
           1.0      1791
06         0.0       758
           1.0      1748
07         0.0       863
           1.0      1775
08         0.0       801
           1.0      1732
09         0.0       896
           1.0      1795
10         0.0       874
           1.0      1847
11         0.0       889
           1.0      1779
12         0.0       936
           1.0      1740
13         0.0       909
           1.0      1712
14         0.0       945
           1.0      1757
15         0.0       979
           1.0      1771
16         0.0       988
           1.0      1784
17         0.0       940
           1.0      1802
18         0.0      

In [18]:
print("======时间段属性字段的描述==========")
print(df.date_hour.value_counts().head(3))
print(df[['date_time_quantum', 'label']].groupby(['date_time_quantum', 'label'])['label'].count())

19    2835
16    2772
15    2750
Name: date_hour, dtype: int64
date_time_quantum  label
0                  0.0       4396
                   1.0       8893
1                  0.0       5756
                   1.0      10570
2                  0.0       4811
                   1.0       8827
3                  0.0       6803
                   1.0      14010
unknown            1.0        553
Name: label, dtype: int64


In [19]:
# 添加是否有时间
df['has_not_date'] = df.apply(lambda c: 1 if c['date_week'] == 'unknown' else 0, axis=1)
df.head(4)
df[df.date_week == 'unknown'].head(5)

Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,...,from_21cn,from_tom,from_12,from_cernet,from_sohu,from_unknown,date_week,date_hour,date_time_quantum,has_not_date
54,rao@tjipe.edu.cn,qing@ccert.edu.cn,unknown,信件已收到！！,1.0,ccert.edu.cn,tjipe.edu.cn,0,0,0,...,0,0,0,0,0,0,unknown,unknown,unknown,1
60,eNet硅谷动力<ke@mail.enet.com.cn>,lai@ccert.edu.cn,unknown,请使用HTML方式打开本邮件 谢谢。IT女性首页| 服饰潮流| 美容前线| 情感婚恋| 娱人...,1.0,ccert.edu.cn,mail.enet.com.cn,0,0,0,...,0,0,0,0,0,0,unknown,unknown,unknown,1
138,<xiao@msn.com>,xu@ccert.edu.cn,unknown,532,1.0,ccert.edu.cn,msn.com,0,0,0,...,0,0,0,0,0,0,unknown,unknown,unknown,1
385,=?GB2312?B?QUQxNjjQxc+izfg=?= <wu@ad168.com>,zhu@ccert.edu.cn,unknown,=?GB2312?B?vLq1xLXn19PU09a+o6E=?= To: ...,1.0,ccert.edu.cn,ad168.com,0,0,0,...,0,0,0,0,0,0,unknown,unknown,unknown,1
463,<shi@msn.com>,tian@ccert.edu.cn,unknown,999,1.0,ccert.edu.cn,msn.com,0,0,0,...,0,0,0,0,0,0,unknown,unknown,unknown,1


# 邮件内容文本特征数据处理

In [20]:
## 将文本类型全部转换为str类型，然后进行分词操作
df['content'] = df['content'].astype('str')
# jieba添加分词字典 jieba.load_userdict("userdict.txt")
df['jieba_cut_content'] = pd.Series(list(map(lambda st: "  ".join(jieba.cut(st)), df['content'])))
df.head(4)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ibf\AppData\Local\Temp\jieba.cache
Loading model cost 1.561 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,...,from_tom,from_12,from_cernet,from_sohu,from_unknown,date_week,date_hour,date_time_quantum,has_not_date,jieba_cut_content
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown,0,0,0,...,0,0,0,0,1,tue,10,0,0,非 财务 纠淼 牟 莆 窆 芾 - （ 沙盘 模拟 ） - - ...
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0,ccert.edu.cn,jdl.ac.cn,0,0,0,...,0,0,0,0,0,sun,10,0,0,讲 的 是 孔子 后人 的 故事 。 一个 老 领导 回到 家乡 ...
2,=?GB2312?B?1cW6o8TP?= <jian@163.con>,xing@ccert.edu.cn,Sun 14 Aug 2005 10:17:57 +0800,尊敬的贵公司(财务/经理)负责人您好！我是深圳金海实业有限公司（广州。东莞）等省市有分公司。...,1.0,ccert.edu.cn,163.con,0,0,0,...,0,0,0,0,0,sun,10,0,0,尊敬 的 贵 公司 ( 财务 / 经理 ) 负责人 您好 ！ 我 ...
3,=?GB2312?B?tPq/qreixrE=?= <pan@12.com>,ling@ccert.edu.cn,Sun 14 Aug 2005 10:19:02 +0800,贵公司负责人(经理/财务）您好：深圳市华龙公司受多家公司委托向外低点代开部分增值税电脑发票（...,1.0,ccert.edu.cn,12.com,0,0,0,...,0,1,0,0,0,sun,10,0,0,贵 公司 负责人 ( 经理 / 财务 ） 您好 ： 深圳市 华龙 公...


# 邮件内容长度对于目标属性的影响处理

In [21]:
## 特征工程四 ==> 邮件长度对是否是垃圾邮件的影响
def precess_content_length(lg):
    if lg <= 10:
        return 0
    elif lg <= 100:
        return 1
    elif lg <= 500:
        return 2
    elif lg <= 1000:
        return 3
    elif lg <= 1500:
        return 4
    elif lg <= 2000:
        return 5
    elif lg <= 2500:
        return 6
    elif lg <=  3000:
        return 7
    elif lg <= 4000:
        return 8
    elif lg <= 5000:
        return 9
    elif lg <= 10000:
        return 10
    elif lg <= 20000:
        return 11
    elif lg <= 30000:
        return 12
    elif lg <= 50000:
        return 13
    else:
        return 14

# 获取邮件内容的长度以及长度类型/长度区间
df['content_length'] = pd.Series(map(lambda st: len(st), df['content']))
df['content_length_type'] = pd.Series(map(lambda st: precess_content_length(st), df['content_length']))

# 以长度区间和目标属性分组，然后统计一下样本数目
df2 = df.groupby(['content_length_type', 'label'])['label'].agg(['count']).reset_index()
df3 = df2[df2.label == 1][['content_length_type', 'count']].rename(columns={'count':'c1'})
df4 = df2[df2.label == 0][['content_length_type', 'count']].rename(columns={'count':'c2'})
df5 = pd.merge(df3, df4)
df5['c1_rage'] = df5.apply(lambda r: r['c1'] / (r['c1'] + r['c2']), axis=1)
df5['c2_rage'] = df5.apply(lambda r: r['c2'] / (r['c1'] + r['c2']), axis=1)

print(df5.head(30))
# 画图
plt.plot(df5['content_length_type'], df5['c1_rage'], label=u'垃圾邮件比例')
plt.plot(df5['content_length_type'], df5['c2_rage'], label=u'正常邮件比例')
plt.grid(True)
plt.legend(loc = 0)
plt.show()

    content_length_type     c1     c2   c1_rage   c2_rage
0                     0    465     24  0.950920  0.049080
1                     1   3878    170  0.958004  0.041996
2                     2  23922  17336  0.579815  0.420185
3                     3   6249   2183  0.741105  0.258895
4                     4   2513    635  0.798285  0.201715
5                     5   1590    362  0.814549  0.185451
6                     6   1129    221  0.836296  0.163704
7                     7    996    135  0.880637  0.119363
8                     8   1040    188  0.846906  0.153094
9                     9    429    109  0.797398  0.202602
10                   10    508    219  0.698762  0.301238
11                   11    103    144  0.417004  0.582996
12                   12     19     40  0.322034  0.677966


In [22]:
# 基于发现的特征，将邮件的长度给区间化
df['content_length_type_0'] = pd.Series(map(lambda s: int(s == 0 or s == 1), df['content_length_type']))
df['content_length_type_1'] = pd.Series(map(lambda s: int(s == 2), df['content_length_type']))
df['content_length_type_2'] = pd.Series(map(lambda s: int(s > 2 and s < 11), df['content_length_type']))
df['content_length_type_3'] = pd.Series(map(lambda s: int(s == 11 or s == 12), df['content_length_type']))
df.head(5)

Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,...,date_hour,date_time_quantum,has_not_date,jieba_cut_content,content_length,content_length_type,content_length_type_0,content_length_type_1,content_length_type_2,content_length_type_3
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown,0,0,0,...,10,0,0,非 财务 纠淼 牟 莆 窆 芾 - （ 沙盘 模拟 ） - - ...,1798,5,0,0,1,0
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0,ccert.edu.cn,jdl.ac.cn,0,0,0,...,10,0,0,讲 的 是 孔子 后人 的 故事 。 一个 老 领导 回到 家乡 ...,193,2,0,1,0,0
2,=?GB2312?B?1cW6o8TP?= <jian@163.con>,xing@ccert.edu.cn,Sun 14 Aug 2005 10:17:57 +0800,尊敬的贵公司(财务/经理)负责人您好！我是深圳金海实业有限公司（广州。东莞）等省市有分公司。...,1.0,ccert.edu.cn,163.con,0,0,0,...,10,0,0,尊敬 的 贵 公司 ( 财务 / 经理 ) 负责人 您好 ！ 我 ...,255,2,0,1,0,0
3,=?GB2312?B?tPq/qreixrE=?= <pan@12.com>,ling@ccert.edu.cn,Sun 14 Aug 2005 10:19:02 +0800,贵公司负责人(经理/财务）您好：深圳市华龙公司受多家公司委托向外低点代开部分增值税电脑发票（...,1.0,ccert.edu.cn,12.com,0,0,0,...,10,0,0,贵 公司 负责人 ( 经理 / 财务 ） 您好 ： 深圳市 华龙 公...,177,2,0,1,0,0
4,mei <mei@dghhkjk.com>,tang@ccert.edu.cn,Sun 14 Aug 2005 10:21:22 +0800,这是一封HTML格式信件！---------------------------------...,1.0,ccert.edu.cn,dghhkjk.com,0,0,0,...,10,0,0,这是 一封 HTML 格式 信件 ！ - - - - - - - ...,15969,11,0,0,0,1


In [23]:
## 特征工程四 ==> 添加信号量
def process_content_sema(x):
    if x > 10000:
        return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1) - np.log(abs(x - 10000)) + 1
    else:
        return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1) + 1

a = np.arange(1,20000)
plt.plot(a, list(map(lambda t: process_content_sema(t), a)), label=u'信息量')
plt.grid(True)
plt.legend(loc = 0)
plt.show()

df['content_sema'] = list(map(lambda st: process_content_sema(st), df['content_length']))
df.head(2)

Unnamed: 0,from,to,date,content,label,to_address,from_address,from_163,from_tsinghua,from_126,...,date_time_quantum,has_not_date,jieba_cut_content,content_length,content_length_type,content_length_type_0,content_length_type_1,content_length_type_2,content_length_type_3,content_sema
0,yan<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-（沙盘模拟）------如何运用财务岳硖岣吖芾砑ㄐ[课 程 背 景]每一...,1.0,ccert.edu.cn,unknown,0,0,0,...,0,0,非 财务 纠淼 牟 莆 窆 芾 - （ 沙盘 模拟 ） - - ...,1798,5,0,0,1,0,8.456151
1,pan <pan@jdl.ac.cn>,shi@ccert.edu.cn,Sun 14 Aug 2005 10:16:47 +0800,讲的是孔子后人的故事。一个老领导回到家乡，跟儿子感情不和，跟贪财的孙子孔为本和睦。老领导的弟...,0.0,ccert.edu.cn,jdl.ac.cn,0,0,0,...,0,0,讲 的 是 孔子 后人 的 故事 。 一个 老 领导 回到 家乡 ...,193,2,0,1,0,0,7.486084


In [24]:
## 查看列名称
df.dtypes

from                      object
to                        object
date                      object
content                   object
label                    float64
to_address                object
from_address              object
from_163                   int64
from_tsinghua              int64
from_126                   int64
from_yahoo                 int64
from_21cn                  int64
from_tom                   int64
from_12                    int64
from_cernet                int64
from_sohu                  int64
from_unknown               int64
date_week                 object
date_hour                 object
date_time_quantum         object
has_not_date               int64
jieba_cut_content         object
content_length             int64
content_length_type        int64
content_length_type_0      int64
content_length_type_1      int64
content_length_type_2      int64
content_length_type_3      int64
content_sema             float64
dtype: object

In [25]:
# 获取需要的列, 把不需要列进行删除操作
df.drop(['from', 'to', 'date', 'content', 'to_address', 
         'from_address', 'date_week', 'date_hour', 'date_time_quantum', 
         'content_length', 'content_length_type'], 1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64620 entries, 0 to 64619
Data columns (total 18 columns):
label                    64619 non-null float64
from_163                 64620 non-null int64
from_tsinghua            64620 non-null int64
from_126                 64620 non-null int64
from_yahoo               64620 non-null int64
from_21cn                64620 non-null int64
from_tom                 64620 non-null int64
from_12                  64620 non-null int64
from_cernet              64620 non-null int64
from_sohu                64620 non-null int64
from_unknown             64620 non-null int64
has_not_date             64620 non-null int64
jieba_cut_content        64620 non-null object
content_length_type_0    64620 non-null int64
content_length_type_1    64620 non-null int64
content_length_type_2    64620 non-null int64
content_length_type_3    64620 non-null int64
content_sema             64620 non-null float64
dtypes: float64(2), int64(15), object(1)
memory usage: 8.9

In [26]:
df.head(5)

Unnamed: 0,label,from_163,from_tsinghua,from_126,from_yahoo,from_21cn,from_tom,from_12,from_cernet,from_sohu,from_unknown,has_not_date,jieba_cut_content,content_length_type_0,content_length_type_1,content_length_type_2,content_length_type_3,content_sema
0,1.0,0,0,0,0,0,0,0,0,0,1,0,非 财务 纠淼 牟 莆 窆 芾 - （ 沙盘 模拟 ） - - ...,0,0,1,0,8.456151
1,0.0,0,0,0,0,0,0,0,0,0,0,0,讲 的 是 孔子 后人 的 故事 。 一个 老 领导 回到 家乡 ...,0,1,0,0,7.486084
2,1.0,0,0,0,0,0,0,0,0,0,0,0,尊敬 的 贵 公司 ( 财务 / 经理 ) 负责人 您好 ！ 我 ...,0,1,0,0,7.175171
3,1.0,0,0,0,0,0,0,1,0,0,0,0,贵 公司 负责人 ( 经理 / 财务 ） 您好 ： 深圳市 华龙 公...,0,1,0,0,7.565682
4,1.0,0,0,0,0,0,0,0,0,0,0,0,这是 一封 HTML 格式 信件 ！ - - - - - - - ...,0,0,0,1,2.063409


In [27]:
# 删除为空的数据
df = df.dropna(axis=0)

In [28]:
# 结果输出csv文件
df.to_csv("../data/result_process02", encoding='utf-8', index=False)