In [100]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Load the training set and the local evaluation set.
# NOTE(review): 'my_test.csv' has to carry a 'Survived' column for the accuracy
# check below; the lines tagged `#@!` appear to be the ones that depend on those
# labels — confirm before switching to an unlabeled test file.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('my_test.csv')
train_len = len(train_data)

# Stack both sets so every feature is engineered once over all rows.
# ignore_index=True gives the combined frame unique row labels (each CSV frame
# has its own labels starting at 0); the first `train_len` rows are still
# exactly the training rows, which the positional slices below rely on.
# Note that medians, the mode and ticket sizes are therefore computed over
# train and evaluation rows together.
df = pd.concat([train_data, test_data], ignore_index=True)
df['Family'] = df['SibSp'] + df['Parch']
df['Female'] = (df['Sex'] == 'female').astype(int)
# Raw string: '\.' in a normal literal is an invalid escape sequence.
# expand=False returns a Series (one capture group) instead of a 1-column frame.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False).fillna('NA')
# Missing ages are filled with the median age of passengers sharing the title.
df['Age'] = df.groupby('Title')['Age'].transform(lambda g: g.fillna(g.median()))
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
# Split each fare evenly across the rows that share the same ticket.
df['Fare'] = df.groupby('Ticket')['Fare'].transform(lambda x: x / len(x))
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Boy'] = ((df['Age'] < 14) & (df['Sex'] == 'male')).astype(int)
# Group key: class + port + per-person fare, built before Fare is binned.
df['Group'] = df['Pclass'].astype(str) + df['Embarked'] + df['Fare'].astype(str)
# pd.cut intervals are right-closed, so values outside the bins (e.g. a Family
# or Fare of 0) become NaN and end up with all-zero dummy columns.
df['Age'] = pd.cut(df['Age'], bins=[0,9,20,30,100])
df['Family'] = pd.cut(df['Family'], bins=[0,1,6,20])
df['Fare'] = pd.cut(df['Fare'], bins=[0,10,20,30,100])
df = pd.get_dummies(df, columns=['Pclass','Embarked','Age','Fare','Family','Group'])
df = df.drop(['PassengerId','Name','Title','Cabin','SibSp','Parch','Sex','Ticket'], axis=1)

# Split the engineered frame back into its train / evaluation parts by position.
y_train = df[:train_len]['Survived']
y_test = df[train_len:]['Survived'] #@!
X_train = df[:train_len].drop(['Survived'], axis=1)
X_test = df[train_len:].drop(['Survived'], axis=1)

m = LogisticRegression()
m.fit(X_train, y_train)

# out = pd.DataFrame()
# out['PassengerId'] = test_data.PassengerId
# out['Survived'] = m.predict(X_test).astype(int)
# out.to_csv('my_submission.csv', index=False)

# cross_val_score(m, X_train, y_train, scoring='accuracy', cv=3).mean()
accuracy_score(y_test, m.predict(X_test)) #@!

# Features ranked by absolute coefficient size, largest first (the cell's output).
sorted(zip(X_train.columns.values, np.abs(m.coef_[0])), key=lambda x: x[1], reverse=True)

[('Female', 2.9686233130751125),
 ('Group_3S7.061975', 2.2956431248700255),
 ('Boy', 2.057786355175654),
 ('Group_3S4.4678571428571425', 1.422662205527111),
 ('Group_3S7.925', 1.3589838358236395),
 ('Group_3C7.2271', 1.28999752308459),
 ('Family_(6, 20]', 1.284229321718291),
 ('Group_3S4.6499999999999995', 1.199210014105573),
 ('Group_3Q4.854166666666667', 1.1490654450958007),
 ('Group_3S4.215', 1.1330620908033762),
 ('Group_1S25.258333333333336', 1.113459654116624),
 ('Pclass_3', 1.0764167495050418),
 ('Group_1S26.2875', 1.0711338312142995),
 ('Group_3S7.7958', 1.046288153973729),
 ('Group_3C7.22915', 1.032595515723183),
 ('Group_3S7.8958', 1.0323850132899581),
 ('Group_1S30.5', 1.0022833414375714),
 ('Group_3S5.0933399999999995', 0.9722989627997435),
 ('Group_3S6.614583333333333', 0.9400099689175823),
 ('Group_3S6.875', 0.9380127616819421),
 ('Group_1C27.7208', 0.9264563626312045),
 ('Group_1C28.7125', 0.9085405447127065),
 ('Group_1S26.2771', 0.9077748964495459),
 ('Group_3C7.87085'

In [88]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

def extract_features(data):
    """Build the model feature matrix from a raw passenger frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Name, Pclass, Age, Sex, Fare, Ticket,
        Embarked, SibSp and Parch. The frame itself is not modified; all
        work happens on a copy of those columns.

    Returns
    -------
    pandas.DataFrame
        The 'Female' and 'Boy' indicator columns followed by one-hot columns
        for Pclass, Embarked, binned Age, binned Fare, binned Family and Group.

    Notes
    -----
    Every statistic used for imputation (fare median, embarked mode, per-title
    age median) and the per-ticket head count are computed from ``data`` alone,
    so calling this separately on two frames may yield different column sets.
    """
    X = data[['Name','Pclass','Age','Sex','Fare','Ticket','Embarked','SibSp','Parch']].copy()
    X['Family'] = X['SibSp'] + X['Parch']
    X['Female'] = X['Sex'].map({'male':0,'female':1})
    X['Fare'] = X['Fare'].fillna(X['Fare'].median())
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    # expand=False returns a Series (one capture group) instead of a 1-column frame.
    X['Title'] = X['Name'].str.extract(r'([A-Za-z]+)\.', expand=False).fillna('NA')
    # Fold rare titles into the three common ones before imputing ages by title.
    X['Title'] = X['Title'].replace(['Major','Col','Sir','Don','Jonkheer','Capt','Rev','Dr'], 'Mr')
    X['Title'] = X['Title'].replace(['Lady','Countess','Dona'], 'Mrs')
    X['Title'] = X['Title'].replace(['Mlle','Mme','Ms'], 'Miss')
    X['Age'] = X.groupby('Title')['Age'].transform(lambda g: g.fillna(g.median()))

    # Split each fare evenly across the rows that share the same ticket.
    X['Fare'] = X.groupby('Ticket')['Fare'].transform(lambda x: x / len(x))
    X['Boy'] = ((X['Age'] < 16) & (X['Sex'] == 'male') & (X['Family'] > 0)).astype(int)
    # X['Alone'] = (X['Family'] == 0).astype(int)

    # Group key: class + ticket without its last character + port + per-person
    # fare; it must be built before Fare is replaced by its bins.
    X['Group'] = X['Pclass'].astype(str) + X['Ticket'].str[:-1] + X['Embarked'] + X['Fare'].astype(str)
    # X['Group'] = X.groupby('Group')['Group'].transform(lambda x: '-' if len(x) < 2 else x)
    # Rows whose SibSp + Parch equals 1 all share the placeholder group '-'.
    X.loc[X['Family'] == 1, 'Group'] = '-'

    # pd.cut intervals are right-closed, so values outside the bins (e.g. a
    # Family or Fare of 0) become NaN and end up with all-zero dummy columns.
    X['Age'] = pd.cut(X['Age'], bins=[0,9,16,30,100])
    X['Family'] = pd.cut(X['Family'], bins=[0,1,6,20])
    X['Fare'] = pd.cut(X['Fare'], bins=[0,10,20,30,100])
    X = pd.get_dummies(X, columns=['Pclass','Embarked','Age','Fare','Family','Group'])
    return X.drop(['Name','Ticket','Sex','SibSp','Title','Parch'], axis=1)

def align_features(X_train, X_test):
    """Give both feature frames the same set of columns.

    The column sets are outer-joined; a column missing from one frame is
    added to it and filled with 0. Rows are left untouched.

    Returns a ``(X_train, X_test)`` tuple of the aligned frames.
    """
    aligned_train, aligned_test = X_train.align(
        X_test,
        join='outer',
        axis=1,
        fill_value=0,
    )
    return aligned_train, aligned_test

def extract_target(data):
    """Return an independent copy of the 'Survived' column of ``data``."""
    survived = data['Survived']
    return survived.copy(deep=True)

# Read the labelled training set and the evaluation set from disk.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('my_test.csv') #@!

# Targets first, then per-frame features put onto one shared column layout.
y_train = extract_target(train_data)
y_test = extract_target(test_data) #@!
X_train, X_test = align_features(
    extract_features(train_data),
    extract_features(test_data),
)

# Alternative estimators kept for quick swapping:
# m = KNeighborsClassifier(n_neighbors=12)
# m = RandomForestClassifier(n_estimators=300)
m = LogisticRegression()
m.fit(X_train, y_train)

# Submission export (disabled while evaluating locally):
# out = pd.DataFrame()
# out['PassengerId'] = test_data.PassengerId
# out['Survived'] = m.predict(X_test)
# out.to_csv('my_submission2.csv', index=False)

# cross_val_score(m, X_train, y_train, scoring='accuracy', cv=3).mean()
# Accuracy on the evaluation set; as the last expression it is the cell output.
accuracy_score(y_test, m.predict(X_test)) #@!

0.8040201005025126