In [1]:

import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


def read_data(path):
    """Read a LibSVM-style text file into a dense feature matrix and labels.

    Each non-blank line has the form ``label idx:value idx:value ...`` with
    whitespace-separated fields.  ``label`` is parsed as an int, ``idx`` as an
    int column index and ``value`` as a float.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    x : numpy.ndarray
        Dense 2-D array with one row per data line and ``max(idx) + 1``
        columns; entries not listed in the file are 0.
    y : numpy.ndarray
        1-D array holding the integer label of every data line.
    """
    y = []
    row = []
    col = []
    values = []
    r = 0       # index of the sample currently being parsed
    # The context manager closes the file even if parsing raises.
    with open(path) as source:
        for line in source:
            fields = line.split()      # split on any whitespace
            if not fields:
                # Blank lines carry no sample; skip them instead of failing on fields[0].
                continue
            y.append(int(fields[0]))
            for item in fields[1:]:
                key, value = item.split(':')
                row.append(r)
                col.append(int(key))
                values.append(float(value))
            r += 1
    # Pass the shape explicitly: inferring it from the indices would drop
    # trailing samples that list no features, leaving x shorter than y.
    n_cols = max(col) + 1 if col else 0
    x = scipy.sparse.csr_matrix((values, (row, col)), shape=(r, n_cols)).toarray()
    y = np.array(y)
    return x, y


def show_accuracy(a, b, tip):
    """Print the element-wise match mask of two label arrays and their accuracy.

    Parameters
    ----------
    a, b : array-like
        Predicted and reference labels; both are flattened before comparison.
    tip : str
        Text printed in front of the accuracy figure.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not hold the same number of elements.

    Notes
    -----
    For empty input the accuracy is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.  Single-argument ``print(...)`` is used so the
    output is the same on Python 2 and Python 3.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.size != b.size:
        # A size mismatch would otherwise be scored silently (or fail obscurely).
        raise ValueError('a and b must contain the same number of elements')
    acc = a.ravel() == b.ravel()
    print(acc)
    rate = float(acc.sum()) / a.size if a.size else float('nan')
    # The label already ends with a tab, so the rate follows it directly.
    print(tip + '正确率：\t' + str(rate))

In [5]:
# Load the agaricus training file: x is the dense feature matrix, y the label vector.
x, y = read_data('14.agaricus_train.txt')

In [7]:
# Print the feature matrix, then display its (rows, columns) shape as the cell result.
print x
x.shape

[[ 0.  0.  1. ...,  1.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]]


(6513L, 126L)

In [8]:
# Split the samples: 60% for training, the remainder for testing (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

# Logistic regression with an L2 penalty, evaluated on the held-out part
lr = LogisticRegression(penalty='l2')
lr.fit(x_train, y_train.ravel())
y_hat = lr.predict(x_test)
show_accuracy(y_hat, y_test, 'Logistic回归 ')

[ True  True  True ...,  True  True  True]
Logistic回归 正确率：	1.0


In [12]:
# XGBoost
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 3, 'eta': 0.2, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
bst = xgb.train(param, data_train, num_boost_round=10, evals=watch_list)
y_hat = bst.predict(data_test)
show_accuracy(y_hat, y_test, 'XGBoost ')

[0]	eval-merror:0.035687	train-merror:0.040696
[1]	eval-merror:0.002686	train-merror:0.003327
[2]	eval-merror:0.002686	train-merror:0.003327
[3]	eval-merror:0.002686	train-merror:0.003327
[4]	eval-merror:0.002686	train-merror:0.003327
[5]	eval-merror:0.002686	train-merror:0.003327
[6]	eval-merror:0.002686	train-merror:0.003327
[7]	eval-merror:0.000767	train-merror:0.001536
[8]	eval-merror:0.000767	train-merror:0.001536
[9]	eval-merror:0.000767	train-merror:0.001536
[ True  True  True ...,  True  True  True]
XGBoost 正确率：	0.999232540292


In [13]:
# /usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv


def show_accuracy(a, b, tip):
    """Return the percentage of positions where two label arrays agree.

    Parameters
    ----------
    a, b : array-like
        Predicted and reference labels; both are flattened before comparison.
    tip : str
        Label of the model being scored.  It is unused because printing is
        disabled here; the parameter is kept so existing calls keep working.

    Returns
    -------
    float
        Accuracy in percent (0-100), or ``nan`` when the input is empty
        (previously a ``ZeroDivisionError``).

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not hold the same number of elements.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.size != b.size:
        # A size mismatch would otherwise be scored silently (or fail obscurely).
        raise ValueError('a and b must contain the same number of elements')
    if a.size == 0:
        return float('nan')
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    # Printing is intentionally left to the caller, which reports all models together.
    return acc_rate


def _impute_missing_age(data, predictor_cols):
    """Fill missing ``Age`` values of ``data`` in place with random-forest predictions.

    A regressor is fitted on the rows whose age is known, using
    ``predictor_cols`` as inputs, and then predicts the age of the remaining
    rows.  Nothing is done when no age is missing, because there would be no
    rows to predict on.
    """
    missing = data.Age.isnull()
    if not missing.any():
        return
    known = ~missing
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(data.loc[known, predictor_cols].values, data.loc[known, 'Age'].values)
    data.loc[missing, 'Age'] = rfr.predict(data.loc[missing, predictor_cols].values)


def load_data(file_name, is_train):
    """Load a Titanic CSV file and turn it into a numeric feature matrix.

    Steps: encode ``Sex`` as 0/1, fill missing ``Fare`` with the median fare
    of the passenger's class, predict missing ``Age`` with a random forest,
    fill missing ``Embarked`` with ``'S'`` and one-hot encode it.

    Side effects: prints progress messages and ``data.describe()``, and writes
    the processed frame to ``New_Data.csv`` in the working directory.

    Parameters
    ----------
    file_name : str
        Path of the CSV file.
    is_train : bool
        True for the labelled training file.  The age model then also uses
        ``Survived`` as a predictor and the returned data is repeated 5 times.

    Returns
    -------
    (x, y) when ``is_train`` is true
        ``x`` is a float array with 9 feature columns and ``y`` the
        ``Survived`` labels, both repeated 5 times.
    (x, passenger_id) otherwise
        ``x`` has one row per passenger, aligned with the ``PassengerId``
        column.  The original tiled ``x`` here too, leaving it 5 times longer
        than the ids.
    """
    data = pd.read_csv(file_name)  # path of the data file

    # Sex: female -> 0, male -> 1
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fare: replace missing values with the median fare of the same class
    fare_missing = data.Fare.isnull()
    if fare_missing.any():
        for pclass in (1, 2, 3):
            in_class = data.Pclass == pclass
            data.loc[fare_missing & in_class, 'Fare'] = data.loc[in_class, 'Fare'].median()

    # Age: predict missing values with a random forest (alternative: the column mean).
    # The training file also offers 'Survived' as a predictor; the test file does not.
    if is_train:
        print('随机森林预测缺失年龄：--start--')
        _impute_missing_age(data, ['Survived', 'Fare', 'Parch', 'SibSp', 'Pclass'])
        print('随机森林预测缺失年龄：--over--')
    else:
        print('随机森林预测缺失年龄2：--start--')
        _impute_missing_age(data, ['Fare', 'Parch', 'SibSp', 'Pclass'])
        print('随机森林预测缺失年龄2：--over--')

    # Port of embarkation: fill missing values with 'S', then one-hot encode
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    embarked_data = pd.get_dummies(data.Embarked)
    embarked_data = embarked_data.rename(columns=lambda c: 'Embarked_' + str(c))
    data = pd.concat([data, embarked_data], axis=1)
    # get_dummies only creates columns for ports present in this file; add the
    # missing ones as all-zero so the feature selection below cannot fail.
    for port_col in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_col not in data:
            data[port_col] = 0
    print(data.describe())
    data.to_csv('New_Data.csv')

    feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                    'Embarked_C', 'Embarked_Q', 'Embarked_S']
    # Force float so the result does not depend on the dummy-column dtype
    # (float, uint8 or bool depending on the pandas version).
    x = np.array(data[feature_cols], dtype=float)

    if is_train:
        y = np.array(data['Survived'])
        # Think about what this really does: every sample is repeated 5 times,
        # so a later random split puts copies of the same passenger in both parts.
        x = np.tile(x, (5, 1))
        y = np.tile(y, (5, ))
        return x, y
    return x, data['PassengerId']


def write_result(c, c_type):
    """Predict survival for the Titanic test file and save it as a CSV file.

    The result is written to ``Prediction_<c_type>.csv`` with the header
    ``PassengerId,Survived``.

    Parameters
    ----------
    c : model
        Fitted model exposing ``predict``.
    c_type : int
        Model identifier used in the output file name.  The value 3 marks an
        XGBoost booster, whose input must be wrapped in a ``DMatrix``.
    """
    import sys

    file_name = '14.Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)

    # Bug fix: this used to test the builtin `type` against 3, which is never
    # true, so the DMatrix conversion never ran.
    if c_type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    # Threshold scores at 0.5 to get 0/1 labels.
    survived = y > 0.5
    y[survived] = 1
    y[~survived] = 0

    out_name = "Prediction_%d.csv" % c_type
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        predictions_file = open(out_name, "wb")
    else:
        predictions_file = open(out_name, "w", newline="")
    # The context manager closes the file even if writing fails.
    with predictions_file:
        writer = csv.writer(predictions_file)
        writer.writerow(["PassengerId", "Survived"])
        # zip stops at the shorter input, so one row is written per passenger id.
        writer.writerows(zip(passenger_id, y))


if __name__ == "__main__":
    # Load the training data and keep half of it aside for evaluation.
    x, y = load_data('14.Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)

    # Logistic regression baseline (currently disabled):
    # lr = LogisticRegression(penalty='l2')
    # lr.fit(x_train, y_train)
    # y_hat = lr.predict(x_test)
    # lr_rate = show_accuracy(y_hat, y_test, 'Logistic回归 ')
    # # write_result(lr, 1)

    # Random forest classifier
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    rfc_rate = show_accuracy(rfc.predict(x_test), y_test, '随机森林 ')
    # write_result(rfc, 2)

    # XGBoost with a logistic objective: predict() yields probabilities
    param = {
        'max_depth': 6,
        'eta': 0.8,
        'silent': 1,
        'objective': 'binary:logistic',
        # optional extras: 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1
    }
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
    y_hat = bst.predict(data_test)
    # write_result(bst, 3)
    # Turn probabilities into hard 0/1 labels before scoring.
    is_positive = y_hat > 0.5
    y_hat[is_positive] = 1
    y_hat[~is_positive] = 0
    xgb_rate = show_accuracy(y_hat, y_test, 'XGBoost ')

    # Summary of the held-out accuracy of each enabled model
    # print('Logistic回归：%.3f%%' % lr_rate)
    print('随机森林：%.3f%%' % rfc_rate)
    print('XGBoost：%.3f%%' % xgb_rate)


随机森林预测缺失年龄：--start--
随机森林预测缺失年龄：--over--
       PassengerId    Survived      Pclass         Sex         Age  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642    0.647587   29.665602   
std     257.353842    0.486592    0.836071    0.477989   13.737912   
min       1.000000    0.000000    1.000000    0.000000    0.420000   
25%     223.500000    0.000000    2.000000    0.000000   21.000000   
50%     446.000000    0.000000    3.000000    1.000000   28.000000   
75%     668.500000    1.000000    3.000000    1.000000   37.000000   
max     891.000000    1.000000    3.000000    1.000000   80.000000   

            SibSp       Parch        Fare  Embarked_C  Embarked_Q  Embarked_S  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.523008    0.381594   32.204208    0.188552    0.086420    0.722783   
std      1.102743    0.806057   49.693429    0.391372    0.281141    0.447876   
min 

In [51]:
# Exploration: load the raw Titanic training file and show summary statistics.
file_name = '14.Titanic.train.csv'
data = pd.read_csv(file_name)  # path of the data file
print 'data.describe() = \n', data.describe()

data.describe() = 
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000         NaN    0.000000   
50%     446.000000    0.000000    3.000000         NaN    0.000000   
75%     668.500000    1.000000    3.000000         NaN    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [52]:
# 性别
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
#equivalant to map(function,list) or use preprocessing
data['Sex']

0      1
1      0
2      0
3      0
4      1
5      1
6      1
7      1
8      0
9      0
10     0
11     0
12     1
13     1
14     0
15     0
16     1
17     1
18     0
19     0
20     1
21     1
22     0
23     1
24     0
25     0
26     1
27     1
28     0
29     1
      ..
861    1
862    0
863    0
864    1
865    0
866    0
867    1
868    1
869    1
870    1
871    0
872    1
873    1
874    0
875    0
876    1
877    1
878    1
879    0
880    0
881    1
882    0
883    1
884    1
885    0
886    1
887    0
888    0
889    1
890    1
Name: Sex, dtype: int64

In [53]:
# Count the rows whose Fare is missing, in two equivalent ways.
#print data.Fare.isnull()
print len(data.Fare[data.Fare.isnull()])
# equivalent to
print sum(data.Fare.isnull())

0
0


In [54]:
# Median fare per passenger class: fare[0] is class 1, fare[1] class 2, fare[2] class 3.
fare = np.zeros(3)
for f in range(0, 3):
    fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
# Fill each missing fare with the median of the passenger's class.
for f in range(0, 3):  # loop 0 to 2
    data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]
fare

array([ 60.2875,  14.25  ,   8.05  ])

In [55]:
 # Display the Fare column restricted to first-class passengers.
 data.loc[data.Pclass == 1, 'Fare']

1       71.2833
3       53.1000
6       51.8625
11      26.5500
23      35.5000
27     263.0000
30      27.7208
31     146.5208
34      82.1708
35      52.0000
52      76.7292
54      61.9792
55      35.5000
61      80.0000
62      83.4750
64      27.7208
83      47.1000
88     263.0000
92      61.1750
96      34.6542
97      63.3583
102     77.2875
110     52.0000
118    247.5208
124     77.2875
136     26.2833
137     53.1000
139     79.2000
151     66.6000
155     61.3792
         ...   
763    120.0000
765     77.9583
766     39.6000
779    211.3375
781     57.0000
782     30.0000
789     79.2000
793     30.6958
796     25.9292
802    120.0000
806      0.0000
809     53.1000
815      0.0000
820     93.5000
822      0.0000
829     80.0000
835     83.1583
839     29.7000
842     31.0000
849     89.1042
853     39.4000
856    164.8667
857     26.5500
862     25.9292
867     50.4958
871     52.5542
872      5.0000
879     83.1583
887     30.0000
889     30.0000
Name: Fare, dtype: float

In [56]:
# Age, option 1 (disabled): replace missing values with the mean
# mean_age = data['Age'].dropna().mean()
# data.loc[(data.Age.isnull()), 'Age'] = mean_age
print data.columns
# Age, option 2: predict missing values with a random forest.
# This cell only prepares the two row subsets; no model is fitted here.
print '随机森林预测缺失年龄：--start--'
data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_exist = data_for_age.loc[(data.Age.notnull())]   # rows whose age is known
age_null = data_for_age.loc[(data.Age.isnull())]     # rows whose age is missing

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')
随机森林预测缺失年龄：--start--


In [64]:
# Inspect the age-model frame: its type, then the rows whose age is known.
#print data.Age.notnull()
print type(data_for_age)
print data_for_age.loc[(data.Age.notnull())] 

<class 'pandas.core.frame.DataFrame'>
      Age  Survived      Fare  Parch  SibSp  Pclass
0    22.0         0    7.2500      0      1       3
1    38.0         1   71.2833      0      1       1
2    26.0         1    7.9250      0      0       3
3    35.0         1   53.1000      0      1       1
4    35.0         0    8.0500      0      0       3
6    54.0         0   51.8625      0      0       1
7     2.0         0   21.0750      1      3       3
8    27.0         1   11.1333      2      0       3
9    14.0         1   30.0708      0      1       2
10    4.0         1   16.7000      1      1       3
11   58.0         1   26.5500      0      0       1
12   20.0         0    8.0500      0      0       3
13   39.0         0   31.2750      5      1       3
14   14.0         0    7.8542      0      0       3
15   55.0         1   16.0000      0      0       2
16    2.0         0   29.1250      1      4       3
18   31.0         0   18.0000      0      1       3
20   35.0         0   26.0

In [68]:
# Port of embarkation
data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # fill missing ports with 'S'
# NOTE(review): the recorded output shows a 'U' column, so it appears to come
# from a run that filled missing values with 'U' instead -- confirm and re-run.
# data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
#print data['Embarked']
# One-hot encode: one indicator column per distinct port value.
embarked_data = pd.get_dummies(data.Embarked)
print embarked_data

       C    Q    S    U
0    0.0  0.0  1.0  0.0
1    1.0  0.0  0.0  0.0
2    0.0  0.0  1.0  0.0
3    0.0  0.0  1.0  0.0
4    0.0  0.0  1.0  0.0
5    0.0  1.0  0.0  0.0
6    0.0  0.0  1.0  0.0
7    0.0  0.0  1.0  0.0
8    0.0  0.0  1.0  0.0
9    1.0  0.0  0.0  0.0
10   0.0  0.0  1.0  0.0
11   0.0  0.0  1.0  0.0
12   0.0  0.0  1.0  0.0
13   0.0  0.0  1.0  0.0
14   0.0  0.0  1.0  0.0
15   0.0  0.0  1.0  0.0
16   0.0  1.0  0.0  0.0
17   0.0  0.0  1.0  0.0
18   0.0  0.0  1.0  0.0
19   1.0  0.0  0.0  0.0
20   0.0  0.0  1.0  0.0
21   0.0  0.0  1.0  0.0
22   0.0  1.0  0.0  0.0
23   0.0  0.0  1.0  0.0
24   0.0  0.0  1.0  0.0
25   0.0  0.0  1.0  0.0
26   1.0  0.0  0.0  0.0
27   0.0  0.0  1.0  0.0
28   0.0  1.0  0.0  0.0
29   0.0  0.0  1.0  0.0
..   ...  ...  ...  ...
861  0.0  0.0  1.0  0.0
862  0.0  0.0  1.0  0.0
863  0.0  0.0  1.0  0.0
864  0.0  0.0  1.0  0.0
865  0.0  0.0  1.0  0.0
866  1.0  0.0  0.0  0.0
867  0.0  0.0  1.0  0.0
868  0.0  0.0  1.0  0.0
869  0.0  0.0  1.0  0.0
870  0.0  0.0  1

In [69]:
# Alternative (disabled): rename the indicator columns to full city names.
# embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
# Prefix every indicator column with 'Embarked_' (e.g. 'S' -> 'Embarked_S').
embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
print embarked_data.columns

Index([u'Embarked_C', u'Embarked_Q', u'Embarked_S', u'Embarked_U'], dtype='object')


In [70]:
# Append the indicator columns to the main frame (column-wise) and summarise it.
# NOTE: this reassigns `data`, so re-running the cell appends the columns again.
data = pd.concat([data, embarked_data], axis=1)
print data.describe()

       PassengerId    Survived      Pclass         Sex         Age  \
count   891.000000  891.000000  891.000000  891.000000  714.000000   
mean    446.000000    0.383838    2.308642    0.647587   29.699118   
std     257.353842    0.486592    0.836071    0.477990   14.526497   
min       1.000000    0.000000    1.000000    0.000000    0.420000   
25%     223.500000    0.000000    2.000000    0.000000         NaN   
50%     446.000000    0.000000    3.000000    1.000000         NaN   
75%     668.500000    1.000000    3.000000    1.000000         NaN   
max     891.000000    1.000000    3.000000    1.000000   80.000000   

            SibSp       Parch        Fare  Embarked_C  Embarked_Q  Embarked_S  \
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000   
mean     0.523008    0.381594   32.204208    0.188552    0.086420    0.722783   
std      1.102743    0.806057   49.693429    0.391372    0.281141    0.447876   
min      0.000000    0.000000    0.000000    

In [74]:
# Select the model features; the one-hot port columns replace the raw 'Embarked' column.
x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
# x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
# Labels exist only when the frame has a 'Survived' column.
y = None
if 'Survived' in data:
    y = data['Survived']

x = np.array(x)
y = np.array(y)
print x.shape,y.shape
# Think about it: what actually happens when we do this?
# (np.tile repeats the rows of x and the entries of y 5 times.)
x = np.tile(x, (5, 1))
y = np.tile(y, 5 )
print x.shape,y.shape

(891L, 9L) (891L,)
(4455L, 9L) (4455L,)
