In [1]:
import pandas as pd
import numpy as np

# 1. Load the data; the 'Dates' column is parsed into datetimes while reading.
train = pd.read_csv('dataset/train.csv', parse_dates=['Dates'])
test = pd.read_csv('dataset/test.csv', parse_dates=['Dates'])

# 2. Preprocessing: encode the crime category as an integer label.
from sklearn import preprocessing
label = preprocessing.LabelEncoder()  # encoder instance, kept so the codes can be mapped back to names
crime = label.fit_transform(train['Category'])  # one integer code per training row; used as the target

# 3. One-hot encode three features: day of week, police district, and the
#    hour of day extracted from 'Dates'.
days, district, hour = (
    pd.get_dummies(values)
    for values in (train['DayOfWeek'], train['PdDistrict'], train['Dates'].dt.hour)
)

# Feature matrix for training, with the target appended as the last column.
train_data = pd.concat([days, district, hour], axis=1).assign(crime=crime)

# Apply the same encoding to the test set (it has no target column).
# Note: `days`, `district` and `hour` now refer to the test-set dummies.
days, district, hour = (
    pd.get_dummies(values)
    for values in (test['DayOfWeek'], test['PdDistrict'], test['Dates'].dt.hour)
)
test_data = pd.concat([days, district, hour], axis=1)

# 4. Split the samples into a training set and a validation set
#    (70% training, 30% validation).
from sklearn.model_selection import train_test_split

# A fixed seed makes the split -- and therefore the validation loss printed
# below -- identical on every "Restart Kernel -> Run All".
SPLIT_SEED = 42
training, validation = train_test_split(
    train_data, train_size=0.7, random_state=SPLIT_SEED
)

# Convert the column labels to strings: the hour dummies are labelled with
# integers while the day/district dummies are labelled with strings, and the
# estimator is given these frames with their column names.
training.columns = training.columns.astype(str)
validation.columns = validation.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

# 5. Naive Bayes
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()

# Every column except the target is a feature. Selecting by name (rather than
# "everything but the last column") keeps this correct if columns are reordered.
feature_list = [name for name in training.columns if name != 'crime']
model.fit(training[feature_list], training['crime'])  # fit on the training split

# Class probabilities on the validation split; columns follow model.classes_.
predicted = model.predict_proba(validation[feature_list])

# Multi-class log loss. `labels` is passed explicitly because a rare category
# may be absent from the validation sample; without it log_loss would infer
# the label set from y_true and reject the probability matrix's shape.
validation_loss = log_loss(validation['crime'], predicted, labels=model.classes_)
print("朴素贝叶斯log损失为 %f" % validation_loss)


# 6. Run the fitted Naive Bayes model on the test set.
# The test set was one-hot encoded independently of the training set, so align
# its columns to the training features: a dummy column missing from the test
# data is added as all zeros instead of raising a KeyError, and the column
# order is guaranteed to match what the model was fitted on.
test_features = test_data.reindex(columns=feature_list, fill_value=0)
test_predicted = model.predict_proba(test_features)

# 7. Save the results.
# Full list of category names, sorted, as learned by the label encoder.
col_names = label.classes_
# predict_proba has one column per class the model actually saw, in
# model.classes_ order; map those integer codes back to category names.
result = pd.DataFrame(
    data=test_predicted,
    columns=label.inverse_transform(model.classes_),
)
# If a category never reached the training split, give it probability 0 so
# the output always has one column per category.
result = result.reindex(columns=col_names, fill_value=0.0)
result['Id'] = test['Id'].astype(int)  # appended as the last column
result.to_csv('test_output.csv', index=False)  # write without the row index
print("finish")


朴素贝叶斯log损失为 2.584671
finish


In [6]:
import pandas as pd
import numpy as np

# Reload the raw data so this cell can run on its own.
train = pd.read_csv('dataset/train.csv', parse_dates=['Dates'])
test = pd.read_csv('dataset/test.csv', parse_dates=['Dates'])

# Pull out the training columns to be summarised.
A = pd.Series(train['Address'])
D = pd.Series(train['Dates'])
Day = pd.Series(train['DayOfWeek'])
P = pd.Series(train['PdDistrict'])

# Print summary statistics for each column, followed by a blank line.
for column_values in (A, D, Day, P):
    print(column_values.describe(), '\n')

count                     878049
unique                     23228
top       800 Block of BRYANT ST
freq                       26533
Name: Address, dtype: object 

count                           878049
mean     2009-03-16 08:25:41.991847168
min                2003-01-06 00:01:00
25%                2006-01-11 03:00:00
50%                2009-03-07 16:00:00
75%                2012-06-11 10:13:00
max                2015-05-13 23:53:00
Name: Dates, dtype: object 

count     878049
unique         7
top       Friday
freq      133734
Name: DayOfWeek, dtype: object 

count       878049
unique          10
top       SOUTHERN
freq        157182
Name: PdDistrict, dtype: object 

