In [None]:
!ls /home/aistudio/data
!ls /home/aistudio/work

data41574


# 数据预处理

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 
import itertools
import seaborn as sns
import sklearn.externals as sk_externals
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
data = pd.read_csv('/home/aistudio/data/data41574/train.csv')
data.shape

**字段说明（label是否作弊，0为正常，1位作弊）**

![](https://ai-studio-static-online.cdn.bcebos.com/c5a7a8f10ce44593a6dd3310cda0352efea701c63a854ee395a2be52d0fec0ab)

In [None]:
data.head()  # 不填参数默认头五行

In [None]:
count_classes = pd.value_counts(data['label'], sort = True).sort_index()  
# 统计label这一列中有多少不同的值，并排列出来
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')  # 欺诈类别直方图
plt.xlabel('Class')  # 0为正常，1位作弊
plt.ylabel('num')

**这里看出来样本数还是比较均匀的，下面可以直接去掉无用特征进行训练**

**操作系统全是安卓，无效特征，系统版本号跟着，删掉。**

**语言、屏幕高宽分辨率，和欺诈没有关系，删掉。**

In [None]:
# 删除无用的所在的列
data = data.drop(['os','osv','lan','dev_height','dev_width','dev_ppi','version','fea_hash'],axis=1)

In [None]:
# 删除完了再看一眼
data.head() 

In [None]:
X = data.loc[:, data.columns != 'label']   # 取出所有属性，不包含label的这一列
y = data.loc[:, data.columns == 'label']   # y等于label这一列

number_records_fraud = len(data[data.label == 1])        # 计算出label这一列一号元素有多少个
fraud_indices = np.array(data[data.label == 1].index)    # 取出label这一列所有等于1的行索引
normal_indices = data[data.label == 0].index             # 取出label这一列所有等于0的行索引

random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)
# 随机选择和1这个属性样本个数相同的0样本
random_normal_indices = np.array(random_normal_indices)  # 转换成numpy的格式

under_sample_indices = np.concatenate([fraud_indices,random_normal_indices]) # 将正负样本拼接在一起
under_sample_data = data.iloc[under_sample_indices,:]  # 下采样数据集

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'label'] #下采样数据集的数据
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'label'] #下采样数据集的label

print("下采样后的正常样本比例: ", len(under_sample_data[under_sample_data.label == 0])/len(under_sample_data))#打印正样本数目
print("下采样后的诈骗样本比例: ", len(under_sample_data[under_sample_data.label == 1])/len(under_sample_data))#打印负样本数目
print("下采样后的总样本数量: ", len(under_sample_data))#打印总数量

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
# 对整个训练集进行切分，testsize表示切分的测试集大小，state=0在切分时进行数据重洗牌 的标识位

print("训练集: ", len(X_train))
print("测试集: ", len(X_test))
print("总样本数: ", len(X_train)+len(X_test))

# Undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
                                                                                                   ,y_undersample
                                                                                                   ,test_size = 0.3
                                                                                                   ,random_state = 0)#对我们刚刚的下采样数据进行切分
print("\n****下采样之后****")
print("训练集: ", len(X_train_undersample))
print("测试集:  ", len(X_test_undersample))
print("总样本数:  ", len(X_train_undersample)+len(X_test_undersample))

# 开始训练

In [13]:
def printing_Kfold_scores(x_train_data,y_train_data):
    fold = KFold(5,shuffle=False)          # 参数为输入的几折交叉验证
    c_param_range = [0.01,0.1,1,10,100]    # 传入选择正则化的参数

    results_table = pd.DataFrame(index = range(len(c_param_range),2), columns = ['C_parameter','Mean recall score'])
    results_table['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')  # 第一个for循环用来打印在每个正则化参数下的输出
        recall_accs = []
        for iteration, indices in enumerate(fold.split(x_train_data)):
            lr = LogisticRegression(C = c_param, penalty = 'l1')#传入正则化参数
            lr.fit(x_train_data.iloc[indices[0],:],y_train_data.iloc[indices[0],:].values.ravel())
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1],:].values)
            recall_acc = recall_score(y_train_data.iloc[indices[1],:].values,y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)
        results_table.ix[j,'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')
    best_c = results_table.loc[results_table['Mean recall score'].astype(float).idxmax()]['C_parameter']
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c

**数据量太大，所以我们选择前一万行来训练**

In [14]:
train_X,train_Y = X_train_undersample.iloc[0:10000,:],y_train_undersample.iloc[0:10000,:]

In [15]:
printing_Kfold_scores(train_X,train_Y)

KeyboardInterrupt: 

**c参数最好的是0.01，接下来保存模型**

In [None]:
log_reg = LogisticRegression(C = 0.01, penalty = 'l1')
log_reg.fit(train_X,train_Y)
sk_externals.joblib.dump(log_reg,'model.pickle')  # 保存模型

# 测试1

In [16]:
# 导入数据
test_data = pd.read_csv('/home/aistudio/data/data41574/test1.csv')
# 删除无用的所在的列
test_data = test_data.drop(['os','osv','lan','dev_height','dev_width','dev_ppi','version','fea_hash'],axis=1)
test_data = test_data.drop(test_data.columns[0], axis=1)  # 删掉第0列

# sid列挑出来作为test_y的序列
sid = test_data.loc[:, test_data.columns == 'sid']

In [17]:
sid.head()

Unnamed: 0,sid
0,1440682
1,1606824
2,1774642
3,1742535
4,1689686


In [18]:
test_data.head()

Unnamed: 0,android_id,apptype,carrier,media_id,ntt,package,sid,timestamp,location,fea1_hash,cus_type
0,317625,1181,46000.0,639,2.0,188,1440682,1559872000000.0,57,3872258917,658
1,435108,944,46003.0,704,6.0,221,1606824,1559739000000.0,23,129322164,943
2,0,1106,46000.0,39,2.0,1562,1774642,1559614000000.0,30,4226678391,411
3,451504,761,46000.0,54,2.0,9,1742535,1559668000000.0,65,3355419572,848
4,0,1001,46000.0,29,5.0,4,1689686,1559694000000.0,148,2644467751,411


In [19]:
type(test_data)

pandas.core.frame.DataFrame

In [21]:
model = sk_externals.joblib.load('model.pickle')  # 加载模型
label = model.predict(test_data)  # 预测结果作为label 这时候label是np类型

In [22]:
sid = np.array(sid) # sid转化成np类型

In [23]:
result_csv = []
for x,y in zip(sid,label):
    print(x,y)
    result_csv.append([x[0],y])
#result_csv = [sid,label]  # 最终输出的csv

[1201539] 1

In [24]:
result_csv

 [1606824, 0], [1774642, 0], [1742535, 0], [1689686, 1], [1219471, 1], [1537396, 0], [1406256, 1], [1612794, 0], [1200836, 1], [1965083, 0], [1516642, 1], [1264740, 0], [1379006, 1], [1618594, 1], [1361135, 1], [1429614, 1], [1505918, 1], [1854437, 1], [1339746, 0], [1963014, 0], [1138701, 1], [1005414, 0], [1040400, 1], [1423152, 1], [1686343, 1], [1938363, 1], [1203831, 0], [1668041, 0], [1580570, 0], [1121448, 1], [1848175, 0], [1893346, 1], [1941103, 0], [1971201, 0], [1086852, 1], [1882214, 0], [1148762, 0], [1740123, 1], [1978996, 0], [1313806, 1], [1696857, 0], [1940572, 1], [1311731, 1], [1742357, 1], [1396032, 0], [1896154, 0], [1138291, 1], [1773612, 0], [1420744, 0], [1761458, 0], [1692775, 1], [1542833, 0], [1470765, 1], [1899337, 0], [1030219, 1], [1342423, 1], [1711217, 1], [1851563, 0], [1285127, 0], [1835849, 0], [1505379, 1], [1608220, 0], [1216168, 0], [1976348, 1], [1215546, 0], [1217117, 1], [1970768,

In [27]:
with open('nmsl.txt','w') as f:
    for  i in result_csv:
        f.write(str(i)+'\n')

**End**