In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm_notebook
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("../2-数据转换/data_nor.csv")
print("data_shape: ",data.shape)

data_shape:  (72550, 26)


In [3]:
category_columns = [ col for col in data.columns if data[col].dtype == 'object' ]
category_columns.remove("summary")
print(len(category_columns))

11


In [4]:
# LabelEncoder
for col in tqdm_notebook(data[category_columns]):
    encoder = LabelEncoder()
    encoder.fit(list(data[col].values))
    data[col] = encoder.transform(list(data[col].values))
print(data)

  0%|          | 0/72550 [00:00<?, ?it/s]

       extended  country_txt  region_txt  provstate   city   latitude  \
0             0          157           6        567   4035  37.005105   
1             0          157           6        252  15269  37.791927   
2             0          157           6       1564  12390  43.076592   
3             0          157           6       1564  12390  43.072950   
4             0          157           6        328   5528  39.758968   
...         ...          ...         ...        ...    ...        ...   
72545         0          164           5         14    219  12.849085   
72546         0          105           8        663   3154  28.709444   
72547         0          164           5         36  17467  15.305307   
72548         0            0           8        634   9328  34.523842   
72549         0          157           6        653  21608  37.688889   

        longitude  specificity  vicinity  \
0      -89.176269          1.0         0   
1     -122.225906          1.0     

In [5]:
data.to_csv('data_stand.csv', index=False, encoding='utf_8_sig')

In [6]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk', 'summary', 'success'], axis=1)
y = data['risk']
kf = KFold(n_splits=10, random_state=0, shuffle=True)
splits = kf.split(X, y)
next(iter(splits))

(array([    0,     1,     2, ..., 72547, 72548, 72549]),
 array([    3,    14,    18, ..., 72525, 72542, 72544]))

In [7]:
# RF
from sklearn.ensemble import RandomForestClassifier

RFC_roc_scores = []
RFC_feature_importances = pd.DataFrame(index=None)
RFC_feature_importances['Feature'] = data.drop(['risk', 'summary', 'success'], axis=1).columns

for k, (train_indices, test_indices) in enumerate(splits):
    print("第 %d 折\n" % (k + 1))
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    
    RFC = RandomForestClassifier(random_state=0, oob_score=True)
    RFC.fit(X_train, y_train)
    RFC_feature_importances[f'fold_{k+1}'] = RFC.feature_importances_
    y_pred = RFC.predict(X_test)
    print(RFC.oob_score_)
    
#     roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovo', average="weighted")
#     print(f" Fold {k + 1} | AUC_ROC: { roc_auc * 100}%")
#     RFC_roc_scores.append(roc_auc)
    
# print(f'average roc score: {np.mean(RFC_roc_scores)}')

第 1 折

0.6690098782448886
第 2 折

0.669300865303622
第 3 折

0.6680450264185619
第 4 折

0.6674477371927406
第 5 折

0.667463052301095
第 6 折

0.6673405314342599
第 7 折

0.6694080710621028
第 8 折

0.6704494984302014
第 9 折

0.6698675243127346


In [8]:
%matplotlib 
%config InlineBackend.figure_format = 'svg'
RFC_feature_importances['Feature importance'] = RFC_feature_importances[[f'fold_{k+1}' for k in range(kf.n_splits-1)]].mean(axis=1)
plt.figure(figsize=(14, 7))
sns.barplot(data=RFC_feature_importances.sort_values(by='Feature importance', ascending=False).head(15),
            x = 'Feature importance', y = 'Feature')
plt.title("15 top features importnce over 10 folds average.".format(kf.n_splits-1))

Using matplotlib backend: TkAgg


Text(0.5, 1.0, '15 top features importnce over 10 folds average.')

In [9]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE

In [10]:
data = pd.read_csv("data_stand.csv")
data.shape
X = data.drop(columns=['risk', 'summary', 'success'], axis=1)
Y = data['risk']

In [15]:
RFC_model = RandomForestClassifier(random_state=0, oob_score=True)                   
#feature_importances_基于杂质的特征重要性
RFC_feature_importances = RFC_model.fit(X, Y).feature_importances_    # 特征重要性
print("模型特征因子重要性：")
print(RFC_feature_importances)

模型特征因子重要性：
[0.0081725  0.02735895 0.01615544 0.05399555 0.07930386 0.09568525
 0.09942881 0.01944931 0.01002489 0.00281546 0.01536817 0.00713939
 0.07042258 0.06486277 0.10705994 0.03081289 0.04206323 0.00085552
 0.02617444 0.04873877 0.08971154 0.00960291 0.07479782]


In [17]:
score = []
feature_number = len(X.columns)
#range创建的结果不包括feature_number+1
for i in tqdm_notebook(range(1, feature_number+1, 1)):
    #fit_transform适合数据，然后转换它
    X_transform = RFE(RFC_model, n_features_to_select=i, step=1).fit_transform(X, Y)
    # 交叉验证
    RFE_score = cross_val_score(RFC_model, X_transform, Y, cv=3).mean()
    # 交叉验证结果保存到列表
    score.append(RFE_score)
print('输出所有分类结果',score)
print('输出最优分类结果',max(score),'对应的特征数量', (score.index(max(score))*1)+1)# 输出最优分类结果和对应的特征数量

  0%|          | 0/23 [00:00<?, ?it/s]

输出所有分类结果 [0.3473052002354579, 0.36624396012601507, 0.4682153411940864, 0.5551485234565616, 0.5620816637019669, 0.5849349531303577, 0.5900623892842802, 0.5864097249212433, 0.59666473914784, 0.5984841460932467, 0.6000830386564292, 0.5995316917157026, 0.5975192644389176, 0.6000830204178748, 0.601502682706691, 0.6032394826794536, 0.6031705477831728, 0.6022746055847631, 0.6038734981479454, 0.6061064181486712, 0.6076777197686876, 0.6073056048121915, 0.6077053175513116]
输出最优分类结果 0.6077053175513116 对应的特征数量 23


In [11]:
feature_number = len(X.columns)
score = [0.3473052002354579, 0.36624396012601507, 0.4682153411940864, 0.5551485234565616, 0.5620816637019669, 0.5849349531303577, 0.5900623892842802, 0.5864097249212433, 0.59666473914784, 0.5984841460932467, 0.6000830386564292, 0.5995316917157026, 0.5975192644389176, 0.6000830204178748, 0.601502682706691, 0.6032394826794536, 0.6031705477831728, 0.6022746055847631, 0.6038734981479454, 0.6061064181486712, 0.6076777197686876, 0.6073056048121915, 0.6077053175513116]

In [12]:
%matplotlib

plt.figure(figsize=[10, 5])
plt.plot(range(1, feature_number + 1, 1), score)
plt.xticks(range(1, feature_number + 1, 2))

plt.title("Recursive feature elimination")
plt.xlabel("Number of features") 
plt.ylabel("Accuracy")

plt.show()

Using matplotlib backend: TkAgg


In [20]:
plt.figure(figsize=[20, 5])
plt.plot(range(1, feature_number + 1, 1), score)
plt.xticks(range(1, feature_number + 1, 2))
plt.show()

In [22]:
# n_features_to_select表示筛选最终特征数量，step表示每次排除一个特征
# selector1 = RFE(RFC_model, n_features_to_select=(score.index(max(score))*1)+1, step=1).fit(X, Y)
selector1 = RFE(RFC_model, n_features_to_select=7, step=1).fit(X, Y)
#所选特征的掩码和排序
print('RFE所选特征的掩码',selector1.support_)
print('RFE特征排除排序',selector1.ranking_)
print('RFE选择特征数量',selector1.n_features_)
X_transform1 = selector1.transform(X)
RFE_optimal_score =cross_val_score(RFC_model, X_transform1, Y, cv=3).mean()
print('RFE最优特征交叉验证分数',RFE_optimal_score)

RFE所选特征的掩码 [False False False False  True  True  True False False False False False
  True False  True False False False False False  True False  True]
RFE特征排除排序 [14  7 11  2  1  1  1  9 13 16 10 15  1  3  1  6  4 17  8  5  1 12  1]
RFE选择特征数量 7
RFE最优特征交叉验证分数 0.5900623892842802


In [24]:
print(selector1.feature_names_in_)
features = []
for i in range(1, feature_number, 1):
    if selector1.ranking_[i] == 1:
        features.append(selector1.feature_names_in_[i])
print(features)

['extended' 'country_txt' 'region_txt' 'provstate' 'city' 'latitude'
 'longitude' 'specificity' 'vicinity' 'doubtterr' 'multiple' 'suicide'
 'attacktype1_txt' 'targtype1_txt' 'targsubtype1_txt' 'natlty1_txt'
 'gname' 'individual' 'nperps' 'weaptype1_txt' 'weapsubtype1_txt'
 'ishostkid' 'date']
['city', 'latitude', 'longitude', 'attacktype1_txt', 'targsubtype1_txt', 'weapsubtype1_txt', 'date']


In [25]:
data_ = data.loc[:, features]
print(data_)

        city   latitude   longitude  attacktype1_txt  targsubtype1_txt  \
0       4035  37.005105  -89.176269                0                73   
1      15269  37.791927 -122.225906                2                18   
2      12390  43.076592  -89.412488                3                50   
3      12390  43.072950  -89.386694                3                27   
4       5528  39.758968 -104.876305                3                50   
...      ...        ...         ...              ...               ...   
72545    219  12.849085   45.037275                2                35   
72546   3154  28.709444   82.163611                3                28   
72547  17467  15.305307   43.019490                2                32   
72548   9328  34.523842   69.140304                0                83   
72549  21608  37.688889  -97.336111                1                71   

       weapsubtype1_txt  date  
0                    27     4  
1                    26     5  
2              

In [26]:
data_.to_csv('data_columns.csv', index=False, encoding='utf_8_sig')