In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from IPython.display import display
from IPython.display import display_html

from keras.models import Sequential
from keras.layers import Dense, Dropout

%matplotlib inline
import matplotlib.pyplot as plt

def show_train_history(train_history, train, validation):
    plt.figure(figsize=(20,10))
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel('train')
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='center right')
    plt.show()


def display_side_by_side(*args):
    html_str = ""
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace('table', 'table style="display:inline"'), raw=True)

def check_null(df):
    print(df.isnull().sum())
    
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

    
    
train_df = pd.read_csv('文件/titanic/train.csv')
test_df = pd.read_csv('文件/titanic/test.csv')

train_df['Sex'] = train_df['Sex'].map({'female':0, 'male':1})
test_df['Sex'] = test_df['Sex'].map({'female':0, 'male':1})
# cols = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 
#         'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# train_df.add(survive_column)
# test_df['Survived'] = ""

frames = [test_df[['Fare', 'Pclass']], train_df[['Fare', 'Pclass']]]
full_df = pd.concat(frames)
s = full_df[full_df['Pclass'] == 3]
median_class_3_fare = s['Fare'].median()
median_class_1_fare = s['Fare'].median()

test_df['Fare'] = test_df['Fare'].fillna(median_class_3_fare)
test_df['Fare'] = test_df['Fare'].replace(0, median_class_1_fare)


fra = [train_df['Ticket'], test_df['Ticket']]
full_ticket = pd.concat(fra)

train_df['Accompanied'] = 0
train_df['Friend_number'] = 0
test_df['Accompanied'] = 0
test_df['Friend_number'] = 0
# ticket_list = list(train_df['Ticket'])
# train_df.columns
# for index, row in train_df.iterrows():
#     if ticket_list.count(row['Ticket']) > 1:
#         train_df.iat[index, 12] = 1
#         train_df.iat[index, 13] = ticket_list.count(row['Ticket']) - 1
ticket_list = list(full_ticket)
ticket_set = set(ticket_list)

for index, row in train_df.iterrows():
    if ticket_list.count(row['Ticket']) > 1:
        train_df.iat[index, 12] = 1
        train_df.iat[index, 13] = ticket_list.count(row['Ticket']) - 1


for index, row in test_df.iterrows():
    if ticket_list.count(row['Ticket']) > 1:
        test_df.iat[index, 11] = 1
        test_df.iat[index, 12] = ticket_list.count(row['Ticket']) - 1


train_df['Fare_bin_6'] = pd.qcut(train_df['Fare'], 6)
test_df['Fare_bin_6'] = pd.qcut(test_df['Fare'], 6)

label = LabelEncoder()
train_df['FareBin_Code_6'] = label.fit_transform(train_df['Fare_bin_6'])
test_df['FareBin_Code_6'] = label.fit_transform(test_df['Fare_bin_6'])


train_df['Df'] = 'Train'
test_df['Df'] = 'Test'
train_df['Ticket_not_in_test'] = 0

# create not_in_test_ticket_list
ticket_list_test = set(test_df.Ticket)
ticket_list_train = set(train_df.Ticket)
print(len(ticket_list_train))
print(len(ticket_list_test))
ticket_diff = ticket_list_train.difference(ticket_list_test)
# train.shape #891
# train_df[train_df.Ticket.isin(ticket_diff)]] = 1
train_df.loc[train_df.Ticket.isin(ticket_diff), ['Ticket_not_in_test']] = 1

train_df.loc[train_df['Name'].str.contains('Don\.'), 'Title'] = 'Mr'
train_df.loc[train_df['Name'].str.contains('Jonkheer\.'), 'Title'] = 'Mr'
train_df.loc[train_df['Name'].str.contains('Rev\.'), 'Title'] = 'Army'
train_df.loc[train_df['Name'].str.contains('Major\.'), 'Title'] = 'Army'
train_df.loc[train_df['Name'].str.contains('Col\.'), 'Title'] = 'Army'
train_df.loc[train_df['Name'].str.contains('Capt\.'), 'Title'] = 'Army'
train_df.loc[train_df['Name'].str.contains('Mme\.'), 'Title'] = 'Miss'
train_df.loc[train_df['Name'].str.contains('Mrs\.'), 'Title'] = 'Mrs'
train_df.loc[train_df['Name'].str.contains('Mlle\.'), 'Title'] = 'Miss'
train_df.loc[train_df['Name'].str.contains('Mr\.'), 'Title'] = 'Mr'
train_df.loc[train_df['Name'].str.contains('Miss\.'), 'Title'] = 'Miss'
train_df.loc[train_df['Name'].str.contains('Ms\.'), 'Title'] = 'Miss'
train_df.loc[train_df['Name'].str.contains('Master\.'), 'Title'] = 'Master'
train_df.loc[train_df['Name'].str.contains('Dr\.'), 'Title'] = 'Dr'
train_df.loc[train_df['Name'].str.contains('Lady\.'), 'Title'] = 'Noble'
train_df.loc[train_df['Name'].str.contains('Sir\.'), 'Title'] = 'Noble'
train_df.loc[train_df['Name'].str.contains('Countess'), 'Title'] = 'Noble'



test_df.loc[test_df['Name'].str.contains('Rev\.'), 'Title'] = 'Army'
test_df.loc[test_df['Name'].str.contains('Major\.'), 'Title'] = 'Army'
test_df.loc[test_df['Name'].str.contains('Col\.'), 'Title'] = 'Army'
test_df.loc[test_df['Name'].str.contains('Capt\.'), 'Title'] = 'Army'
test_df.loc[test_df['Name'].str.contains('Mrs\.'), 'Title'] = 'Mrs'
test_df.loc[test_df['Name'].str.contains('Mr\.'), 'Title'] = 'Mr'
test_df.loc[test_df['Name'].str.contains('Miss\.'), 'Title'] = 'Miss'
test_df.loc[test_df['Name'].str.contains('Ms\.'), 'Title'] = 'Miss'
test_df.loc[test_df['Name'].str.contains('Master\.'), 'Title'] = 'Master'
test_df.loc[test_df['Name'].str.contains('Dr\.'), 'Title'] = 'Dr'
test_df.loc[test_df['Name'].str.contains('Countess'), 'Title'] = 'Noble'
test_df.loc[test_df['Name'].str.contains('Dona\.'), 'Title'] = 'Noble'


train_df['Is_husband'] = 0
mask = (train_df.Ticket_not_in_test > 0) & (train_df.SibSp > 0) & (train_df.Name.str.contains("Mr\.")) & (train_df.Accompanied > 0)
train_df.loc[mask, 'Is_husband'] = 1

test_df['Is_husband'] = 0
mask = (test_df.SibSp > 0) & (test_df.Name.str.contains("Mr\.")) & (test_df.Accompanied > 0)
test_df.loc[mask, 'Is_husband'] = 1

# remove null Embarked (2 rows)
train_df = train_df[train_df.Embarked.notnull()]



######## USE NO TEST TICKET DATA OF TRAIN_DF ########
# train_df = train_df[train_df.Ticket_not_in_test > 0]

#####################################




train_df.columns

In [None]:
print(train_df.shape)

print(test_df.shape)

test_df



In [None]:
# ['PassengerId', 'Fare_bin_6', 'Df', 'Name', 'Survived', 'Age', 'Cabin', 'Fare', 'Ticket_not_in_test', 'SibSp', 'Parch', 
#         'Pclass', 'Sex', 'Friend_number', 'Accompanied', 'FareBin_Code_6', 'Title', 'Is_husband', 'Ticket', 'Embarked']





# train_df = train_df.drop(['Name'], axis=1)
# test_df = test_df.drop(['Name'], axis=1)


#### age stuffs

# frames = [test_df[list(['Age', 'Title', 'SibSp'])], train_df[list(['Age', 'Title', 'SibSp'])]]
# full_df = pd.concat(frames)

# temp = full_df[full_df['Title'] == 'Mrs']
# mrs_age_median = temp['Age'].median()
# mrs_age_mean = temp.Age.mean()
# temp = full_df[full_df['Title'] == 'Mr']
# mr_age_median = temp['Age'].median()
# mr_age_mean = temp['Age'].mean()
# temp = full_df[full_df['Title'] == 'Master']
# master_age_median = temp['Age'].median()
# temp = full_df[(full_df['Title'] == 'Miss') & (full_df.SibSp > 0)]
# young_miss_age_median = temp['Age'].median()
# young_miss_age_mean = temp['Age'].mean()
# temp = full_df[(full_df['Title'] == 'Miss') & (full_df.SibSp < 1)]
# other_miss_age_median = temp['Age'].median()
# other_miss_age_mean = temp['Age'].mean()

# iii = list(['Age', 'Title'])
# x_df = full_df[iii]
# rest_age_median = x_df[(x_df.Title != "Mr") & (x_df.Title != "Mrs") & (x_df.Title != "Miss") & (x_df.Title != "Master")]['Age'].median()

# # train_df['Age'] = train_df['Age'].fillna(age_mean)
# akk = train_df[(train_df.Age.isnull()) & (train_df.Title == "Mr")]
# akk = akk.fillna(mr_age_median)

# tti = train_df
# tti[(tti.Age.isnull()) & (tti.Title == "Mr")].fillna(mr_age_median, inplace=True)
# # tti[(tti.Age.isnull()) & (tti.Title == "Mr")]
# tti


# train_df[(train_df.Age.isnull()) & (train_df.Title == "Mr")] = train_df[(train_df.Age.isnull()) & (train_df.Title == "Mr")].fillna(mr_age_median)
# train_df[(train_df.Age.isnull()) & (train_df.Title == "Mrs")] = train_df[(train_df.Age.isnull()) & (train_df.Title == "Mrs")].fillna(mrs_age_median)
# train_df[(train_df.Age.isnull()) & (train_df.Title == "Master")] = train_df[(train_df.Age.isnull()) & (train_df.Title == "Master")].fillna(master_age_median)
# train_df[(train_df.Age.isnull()) & (train_df.Title == "Miss") & (train_df.SibSp > 0)] = train_df[(train_df.Age.isnull()) & (train_df.Title == "Miss") & (train_df.SibSp > 0)].fillna(young_miss_age_median)
# train_df[(train_df.Age.isnull()) & (train_df.Title == "Miss") & (train_df.SibSp < 1)] = train_df[(train_df.Age.isnull()) & (train_df.Title == "Miss") & (train_df.SibSp < 1)].fillna(other_miss_age_median)
# train_df[(train_df.Age.isnull())] = train_df[(train_df.Age.isnull())].fillna(rest_age_median)

# test_df[(test_df.Age.isnull()) & (test_df.Title == "Mr")] = test_df[(test_df.Age.isnull()) & (test_df.Title == "Mr")] .fillna(mr_age_median)
# test_df[(test_df.Age.isnull()) & (test_df.Title == "Master")] = test_df[(test_df.Age.isnull()) & (test_df.Title == "Master")] .fillna(master_age_median)
# test_df[(test_df.Age.isnull()) & (test_df.Title == "Mrs")] = test_df[(test_df.Age.isnull()) & (test_df.Title == "Mrs")] .fillna(mrs_age_median)
# test_df[(test_df.Age.isnull()) & (test_df.Title == "Miss") & (test_df.SibSp > 0)] = test_df[(test_df.Age.isnull()) & (test_df.Title == "Miss") & (test_df.SibSp > 0)].fillna(young_miss_age_median)
# test_df[(test_df.Age.isnull()) & (test_df.Title == "Miss") & (test_df.SibSp < 1)] = test_df[(test_df.Age.isnull()) & (test_df.Title == "Miss") & (test_df.SibSp < 1)].fillna(other_miss_age_median)
##### end of age stuffs
# ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Accompanied',
#        'Friend_number', 'Fare_bin_6', 'FareBin_Code_6', 'Df',
#        'Ticket_not_in_test', 'Title', 'Is_husband']


test_df['Survived'] = 0
train_df = train_df.drop("Ticket_not_in_test", axis=1)
cols = list(train_df.columns)
cols.sort()
test_df = test_df[cols]
train_df = train_df[cols]
fra = [train_df, test_df]
full_df = pd.concat(fra)

full_df = full_df.drop('Name', axis=1)
full_df = full_df.drop('Age', axis=1)
full_df = full_df.drop('Cabin', axis=1)
full_df = full_df.drop('Fare_bin_6', axis=1)




OneHot_full = pd.get_dummies(data=full_df, columns=['Embarked', 'Title', 'Ticket'])


x_OneHot_df = OneHot_full[OneHot_full.Df == "Train"]
x_OneHot_test_df = OneHot_full[OneHot_full.Df == "Test"]

x_OneHot_df = x_OneHot_df.drop("Df", axis=1)
x_OneHot_test_df = x_OneHot_test_df.drop("Df", axis=1)

### temp out
# x_OneHot_df = pd.get_dummies(data=train_df, columns=['Embarked', 'Title', 'Ticket'])
# x_OneHot_test_df = pd.get_dummies(data=test_df, columns=['Embarked', 'Title', 'Ticket'])




cols = list(x_OneHot_df.columns)
cols.sort()
cols.remove('Survived')
cols.insert(0, 'Survived')

# print(x_OneHot_df.columns)
# print(x_OneHot_test_df.columns)




x_OneHot_df = x_OneHot_df[cols]
x_OneHot_test_df = x_OneHot_test_df[cols]

# train_df.columns

In [None]:
# 1636
# 1317
print(x_OneHot_df.shape)
print(x_OneHot_test_df.shape)

In [None]:

# print(x_OneHot_df.columns)

# print(x_OneHot_test_df.columns)

nd_array = x_OneHot_df.values
test_nd_array = x_OneHot_test_df.values

train_Label = nd_array[:, 0]
train_Feature = nd_array[:, 1:]
test_Feature = test_nd_array[:, 1:]



minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_train_features = minmax_scale.fit_transform(train_Feature)
scaled_test_features = minmax_scale.fit_transform(test_Feature)

# print(scaled_train_features[0])
print(scaled_test_features.shape)
print(scaled_train_features.shape)

In [2]:

result_df = test_df

test_result_label = model.predict_classes(scaled_test_features)
result_df['Survived'] = test_result_label
cols = list(result_df)
cols.insert(0, cols.pop(cols.index('Survived')))
result_df = result_df.loc[:, cols]
result_df.rename(columns=result_df.iloc[0])

col = list(['PassengerId', 'Survived'])
result_df = result_df[col]
result_df.to_csv('/titanic/result.csv', index = False)
# result_df
# print(cols)

NameError: name 'test_df' is not defined