### Read Data

In [936]:
import pandas as pd
import numpy as np

# CSV locations, relative to the notebook's working directory.
train_data_path, test_data_path, test_result_path = (
    './train.csv',
    './test.csv',
    './gender_submission.csv',
)

# Load the three files in order: training rows (features plus the Survived
# label), test features, and the labels the test predictions are scored against.
train_df, test_x_df, test_y_df = map(
    pd.read_csv, (train_data_path, test_data_path, test_result_path)
)

### Preview data

In [937]:
# Show the first rows of the raw training data, including the Survived label.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [938]:
# Show the first rows of the raw test features (same columns as training, minus Survived).
test_x_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [939]:
# Show the first rows of the test label file (PassengerId and Survived).
test_y_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Drop data

#### Useless data

*    PassengerId  
*    Name
*    Ticket
*    Cabin
*    Parch
*    SibSp

In [940]:
# Columns that are not used as model inputs in this notebook.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Parch', 'SibSp']

# Remove them from both feature tables in a single call each.
train_df = train_df.drop(columns=unused_columns)
test_x_df = test_x_df.drop(columns=unused_columns)

# The label table only needs its Survived column; the id is dropped here too.
test_y_df = test_y_df.drop(columns='PassengerId')

### Check output class balance

In [941]:
# Count the training rows in each Survived class to see how balanced the target is.
train_df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

### Fill NaN values with the average

Fill missing (NaN) `Age` and `Fare` values with the column mean.  
Convert categorical variable "Embarked" into dummy/indicator variables.

In [942]:
# Learn the imputation values from the TRAINING set only and reuse them for the
# test set. Preprocessing statistics must not be derived from the data the model
# is evaluated on (the same rule this notebook applies to the scaler).
age_mean = train_df['Age'].mean()
fare_mean = train_df['Fare'].mean()
fill_values = {'Age': age_mean, 'Fare': fare_mean}

# A dict restricts the fill to the listed columns; other columns keep their NaNs.
train_df = train_df.fillna(value=fill_values)
test_x_df = test_x_df.fillna(value=fill_values)

# One-hot encode the categorical 'Embarked' column. The membership check keeps
# this cell re-runnable (the column no longer exists after the first run) without
# hiding genuine errors behind a blanket `except Exception: pass`.
if 'Embarked' in train_df.columns:
    train_df = pd.get_dummies(data=train_df, columns=['Embarked'])
if 'Embarked' in test_x_df.columns:
    test_x_df = pd.get_dummies(data=test_x_df, columns=['Embarked'])

# Give the test features exactly the training feature columns, in the same order.
# An indicator column that never occurs in the test data is added as all zeros,
# and a column that exists only in the test data is dropped.
feature_columns = train_df.columns.drop('Survived')
test_x_df = test_x_df.reindex(columns=feature_columns, fill_value=0)

# Report remaining missing values per column for both tables.
print('train: Null count')
print(train_df.isnull().sum())
print('-----')
print('test: Null count')
print(test_x_df.isnull().sum())

train: Null count
Survived      0
Pclass        0
Sex           0
Age           0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64
-----
test: Null count
Pclass        0
Sex           0
Age           0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64


### Gender mapping

In [943]:
# Encode the Sex strings as integers; one shared mapping keeps both tables consistent.
sex_to_code = {'female': 0, 'male': 1}

train_df['Sex'] = train_df['Sex'].map(sex_to_code).astype(int)
test_x_df['Sex'] = test_x_df['Sex'].map(sex_to_code).astype(int)

### Split data into input X and output Y

In [944]:
# Cast every column to float64 so all later steps work on one numeric dtype.
train_df = train_df.astype(np.float64)
test_x_df = test_x_df.astype(np.float64)

# Training target is the Survived column; the inputs are everything else.
train_y_df = train_df['Survived']
train_x_df = train_df.drop('Survived', axis=1)

# Cast the test labels as well and keep only the Survived column, which turns
# the one-column DataFrame into a Series.
test_y_df = test_y_df.astype(np.float64)['Survived']

### Preview data

In [945]:
# Show the first rows of the training inputs after encoding and the float64 cast.
train_x_df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3.0,1.0,22.0,7.25,0.0,0.0,1.0
1,1.0,0.0,38.0,71.2833,1.0,0.0,0.0
2,3.0,0.0,26.0,7.925,0.0,0.0,1.0
3,1.0,0.0,35.0,53.1,0.0,0.0,1.0
4,3.0,1.0,35.0,8.05,0.0,0.0,1.0


In [946]:
# Show the first training labels (Survived as float64).
train_y_df.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

In [947]:
# Show the first test labels, now a Series rather than a DataFrame.
test_y_df.head()

0    0.0
1    1.0
2    0.0
3    0.0
4    1.0
Name: Survived, dtype: float64

### Normalize

<font color=#ff0000>The testing data must be scaled in exactly the same way as the training data. Therefore, we fit the scaler on the training data only and use that same fitted scaler to transform the testing data.</font>

In [948]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training inputs and standardize them in one step.
scaler = StandardScaler()
normalized_train_x_df = pd.DataFrame(scaler.fit_transform(train_x_df))

# Apply the already-fitted scaler to the test inputs; it is NOT refitted here.
# Wrapping the returned array in a new DataFrame leaves the columns labelled by
# position (0, 1, 2, ...) instead of by feature name.
normalized_test_x_df = pd.DataFrame(scaler.transform(test_x_df))

In [949]:
# Show the first rows of the standardized training inputs (columns are positional).
normalized_train_x_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.827377,0.737695,-0.592481,-0.502445,-0.482043,-0.307562,0.619306
1,-1.566107,-1.355574,0.638789,0.786845,2.074505,-0.307562,-1.61471
2,0.827377,-1.355574,-0.284663,-0.488854,-0.482043,-0.307562,0.619306
3,-1.566107,-1.355574,0.407926,0.42073,-0.482043,-0.307562,0.619306
4,0.827377,0.737695,0.407926,-0.486337,-0.482043,-0.307562,0.619306


In [950]:
# Re-display the training labels; the scaler was applied to the inputs only.
train_y_df.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

In [951]:
# Show the first rows of the test inputs after applying the training-fitted scaler.
normalized_test_x_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.827377,0.737695,0.369449,-0.490783,-0.482043,3.251373,-1.61471
1,0.827377,-1.355574,1.331378,-0.507479,-0.482043,-0.307562,0.619306
2,-0.369365,0.737695,2.485693,-0.453367,-0.482043,3.251373,-1.61471
3,0.827377,0.737695,-0.207709,-0.474005,-0.482043,-0.307562,0.619306
4,0.827377,-1.355574,-0.592481,-0.401017,-0.482043,-0.307562,0.619306


In [952]:
# Re-display the test labels; the scaler was applied to the inputs only.
test_y_df.head()

0    0.0
1    1.0
2    0.0
3    0.0
4    1.0
Name: Survived, dtype: float64

### Logistic Regression

1. In the case below, 10-fold cross-validation is applied.


2. From the scikit-learn documentation:
*    For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
*    For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss
*    ‘liblinear’ is limited to one-versus-rest schemes.
*    ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty
*    ‘liblinear’ and ‘saga’ also handle L1 penalty
*    ‘saga’ also supports ‘elasticnet’ penalty
*    ‘liblinear’ does not handle no penalty

Therefore, because this dataset is small and the task is binary classification, we use liblinear in this case.

In [954]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with built-in 10-fold cross-validation, using the
# liblinear solver.
lr_model = LogisticRegressionCV(cv=10, solver='liblinear')
lr_model.fit(normalized_train_x_df, train_y_df)

# Predict on both splits first, then score them.
lr_predict_train_result = lr_model.predict(normalized_train_x_df)
lr_predict_test_result = lr_model.predict(normalized_test_x_df)

# Accuracy is the fraction of predictions equal to the reference labels.
# NOTE(review): the test labels are read from gender_submission.csv, which looks
# like a baseline submission rather than ground truth (the previewed rows follow
# a "female survived, male did not" pattern) - confirm before treating the test
# figure as real accuracy.
lr_training_acc = (lr_predict_train_result == train_y_df).mean()
lr_testing_acc = (lr_predict_test_result == test_y_df).mean()

print('training accuracy:')
print(lr_training_acc)

print('\ntesting accuracy:')
print(lr_testing_acc)

training accuracy:
0.7934904601571269

testing accuracy:
0.9521531100478469
