In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing

# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

def clean(df):
    """Turn a raw passenger table into a model-ready feature frame.

    Steps:
      * standardise ``Fare`` into ``Fare_scaled`` (zero mean, unit variance),
        using the statistics of the frame that is passed in;
      * bucket ``Age`` into six named groups and one-hot encode them as
        ``Age_<group>`` columns;
      * one-hot encode ``Sex`` and ``Pclass``;
      * drop the raw columns that were encoded or are not used as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Fare``, ``Age``, ``Sex``, ``Pclass``, ``Name``,
        ``Ticket``, ``Cabin`` and ``Embarked``. Any other column is passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    All six ``Age_*`` columns are always present, even when a group has no
    members, so two frames cleaned separately get the same age columns.
    Rows whose age is missing or negative have every ``Age_*`` column at 0.
    """
    # Standardise Fare with the population standard deviation (ddof=0), the
    # same statistic sklearn's StandardScaler uses.
    fare = df['Fare'].astype(float)
    fare_std = fare.std(ddof=0)
    if fare.nunique() <= 1 or not fare_std > 0:
        # Constant (or empty / all-missing) column: divide by 1 so a constant
        # fare scales to 0 instead of dividing by zero.
        fare_std = 1.0
    fare_scaled = ((fare - fare.mean()) / fare_std).rename('Fare_scaled')

    # Age groups are half-open intervals [lower, upper): 14 is 'teen', 55 is 'old'.
    age_edges = [0, 14, 18, 30, 45, 55, float('inf')]
    age_labels = ['child', 'teen', 'youth', 'mid', 'midold', 'old']
    age_group = pd.cut(df['Age'], bins=age_edges, labels=age_labels, right=False)

    dummies_age = pd.get_dummies(age_group, prefix='Age')
    # Alphabetical order is what get_dummies yields for plain string labels,
    # so the feature column order stays stable.
    dummies_age = dummies_age[sorted(dummies_age.columns)]
    dummies_sex = pd.get_dummies(df['Sex'], prefix='Sex')
    dummies_pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')

    features = pd.concat(
        [df, fare_scaled, dummies_age, dummies_sex, dummies_pclass], axis=1
    )
    return features.drop(
        ['Pclass', 'Age', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Fare'],
        axis=1,
    )

# Build the training features.
train_df = clean(train_df)

# Fill missing test fares before cleaning, wherever they occur, instead of
# patching one hard-coded row label.
# NOTE(review): 7.0 is a hand-picked placeholder — confirm it is the intended fill value.
test_df['Fare'] = test_df['Fare'].fillna(7.0)
test_df = clean(test_df)



In [31]:
from sklearn import linear_model

# Keep only the label and the feature columns, selected by name pattern.
train_df = train_df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Sex_.*|Pclass_.*')

# y is the Survived label, picked by name so it does not depend on column order.
y = train_df['Survived'].values

# X is every remaining column, as a plain array.
X = train_df.drop('Survived', axis=1).values

# Fit an L1-regularised logistic regression. The liblinear solver is named
# explicitly because not every solver supports the 'l1' penalty.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)

clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [38]:
# Select the feature columns by name pattern and predict with the fitted model.
test = test_df.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)

# Pair each prediction with its passenger id and write the table without the index.
result = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].values,
    'Survived': predictions.astype(int),
})
result.to_csv("logistic_regression_predictions.csv", index=False)

In [39]:
# Read the written predictions back from disk and preview the first rows
# rather than rendering the whole frame.
result_df = pd.read_csv('logistic_regression_predictions.csv')
result_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [None]:
# Split `df` by row position: first 60% for training, next 20% for
# cross-validation, final 20% for testing.
# NOTE(review): `df` is not assigned by any cell visible in this notebook —
# confirm which frame it is expected to hold before running this cell.
rows = len(df)
train_end, cv_end = int(rows * 0.6), int(rows * 0.8)

train_df = df[0:train_end]
cv_df = df[train_end:cv_end]
test_df = df[cv_end:rows]

train_df

In [183]:
# NOTE(review): the cell that builds this `result_df` (with the
# Predict_Survived / Survived / right columns seen in the output) is not among
# the cells visible here — confirm it exists before relying on this cell.
# Preview the first rows rather than rendering the whole frame.
result_df.head(10)

Unnamed: 0,PassengerId,Predict_Survived,Survived,right
0,713,0,1,wrong
1,714,0,0,right
2,715,0,0,right
3,716,0,0,right
4,717,1,1,right
5,718,1,1,right
6,719,0,0,right
7,720,0,0,right
8,721,1,1,right
9,722,0,0,right


In [186]:
# Count how many rows are labelled 'right' versus 'wrong'.
result_df.loc[:, 'right'].value_counts()

right    150
wrong     29
Name: right, dtype: int64

29/179

In [187]:
# Share of rows labelled 'wrong', computed from the data instead of typing in
# the counts by hand, so it stays correct if the predictions change.
a = float((result_df['right'] == 'wrong').mean())

In [188]:
# Display `a`, the share of rows counted as 'wrong' out of all rows counted.
a

0.16201117318435754