In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from Decision_Tree import DecisionTreeClassifier
from RandomForest import RandomForestClassifier

In [2]:
# Column groups used while preparing the data; num_cols is also read by the
# cell that prepares the test file.
cat_cols = [
    'Surname', 'Geography', 'Gender', 'Tenure',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Exited',
]
num_cols = ['CreditScore', 'Balance', 'EstimatedSalary']

# Age is bucketed with fixed edges: (0, 20], (20, 30], ..., (90, 100] -> labels 0..8.
age_edges = [0] + list(range(20, 101, 10))
age_labels = list(range(9))

data = pd.read_csv("data/train.csv")
data = data.assign(
    binned_Age=pd.cut(data['Age'], bins=age_edges, labels=age_labels)
).drop(columns=['Age'])

# Bucket the remaining numeric features into 10 equal-width bins each;
# labels=False stores the bin index, and the raw column is then removed.
for col in num_cols:
    bin_index = pd.cut(data[col], bins=10, labels=False)
    data = data.assign(**{'binned_' + col: bin_index}).drop(columns=[col])

# Label-encode every column, each with its own freshly fitted encoder.
encoded_columns = {
    name: LabelEncoder().fit_transform(values) for name, values in data.items()
}
data = pd.DataFrame(encoded_columns, index=data.index)

In [15]:
# Drop the id, CustomerId and Surname columns before modelling.
train_data = data.drop(columns=['id', 'CustomerId', 'Surname'])

# Target is the Exited column (as a NumPy array); every other column is a feature.
y = train_data['Exited'].values
X = train_data.drop(columns=['Exited'])

# 80/20 hold-out split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train

In [9]:

# Fit the project's random forest on the training split.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    max_features=7,
)
forest.fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score; threshold it
# at 0.5 to obtain hard 0/1 labels.
positive_scores = forest.predict_proba(X_test)[:, 1]
y_pred = (positive_scores > 0.5).astype(int)

# Hold-out accuracy: fraction of test rows whose label is predicted correctly.
accuracy = (y_pred == y_test).mean()
accuracy

In [11]:
# Hold-out accuracy using the forest's own hard-label predictions.
# predict() is called directly: a preceding predict_proba() result would be
# overwritten before use and only cost an extra pass over the ensemble.
y_pred = forest.predict(X_test)
accuracy = np.mean(y_pred == y_test)
accuracy

In [13]:
# Score data/test.csv with the forest fitted earlier in the notebook.
#
# The test features must receive the same integer codes the forest was trained
# on. Equal-width bin edges (pd.cut(..., bins=10)) and LabelEncoder codes are
# both derived from whatever data they are fitted on, so fitting them on the
# test file can give the same raw value a different code than at training time.
# The training cell keeps neither its edges nor its encoders, so they are
# rebuilt here from the raw training file with the same calls.
age_edges = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
age_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8]
id_cols = ['id', 'CustomerId', 'Surname']

train_reference = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Age uses fixed edges, so the identical call is valid for both frames.
for frame in (train_reference, test_data):
    frame['binned_Age'] = pd.cut(frame['Age'], bins=age_edges, labels=age_labels)
train_reference = train_reference.drop(columns=['Age'])
test_data = test_data.drop(columns=['Age'])

# Remaining numeric features: learn the 10 equal-width edges on the training
# column, then apply those edges to the test column.
for col in num_cols:
    train_bins, edges = pd.cut(
        train_reference[col], bins=10, labels=False, retbins=True
    )
    train_reference['binned_' + col] = train_bins
    # Test values outside the training range are clipped into the outermost
    # bins instead of becoming NaN; include_lowest keeps the left edge inside.
    in_range = test_data[col].clip(lower=edges[0], upper=edges[-1])
    test_data['binned_' + col] = pd.cut(
        in_range, bins=edges, labels=False, include_lowest=True
    )
    train_reference = train_reference.drop(columns=[col])
    test_data = test_data.drop(columns=[col])

# Encode each feature with an encoder fitted on the training column.
# LabelEncoder.transform raises ValueError for a value never seen in training,
# which is preferable to silently assigning it an unrelated code.
# The identifier columns are left untouched; 'id' is needed for the submission.
feature_cols = [name for name in test_data.columns if name not in id_cols]
for col in feature_cols:
    encoder = LabelEncoder().fit(train_reference[col])
    test_data[col] = encoder.transform(test_data[col])

# Column 1 of predict_proba is used as the positive-class probability.
y_pred = forest.predict_proba(test_data[feature_cols].values)[:, 1]
y_pred

In [14]:
# Build the submission table: 'id' as the first column and the predicted
# value as the second column, named 'Exited'; then export it as CSV without
# the row index.
result = test_data[['id']].assign(Exited=y_pred)
result.to_csv('result.csv', index=False)

In [16]:
# Save the fitted forest to model.pkl.
# `pickle` is imported with the other modules in the notebook's first cell.
# NOTE: only unpickle model.pkl from a trusted source; pickle.load can execute
# arbitrary code embedded in the file.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(forest, model_file)