In [54]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Load the competition splits; `data_cleaner` lets later cells apply the same
# transformation to both frames in a single loop.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
data_cleaner = [train, test]

# Row count of the training split, the test passenger ids and the training labels.
split_index = train.shape[0]
Id = test["PassengerId"]
target = train["Survived"]

# Column-name buckets; they start empty and are populated by a later cell.
cat_features, num_features, ord_features, bool_features = [], [], [], []

In [3]:
# Cap extreme fares: every value above 300 is replaced by 300 in both frames.
for d in data_cleaner:
    d["Fare"] = d["Fare"].mask(d["Fare"] > 300, 300)

In [4]:
# Derive family, title, cabin and ticket features on both frames.
for data in data_cleaner:
    # Family size = the passenger plus Parch and SibSp.
    data["Family"] = data["Parch"] + data["SibSp"] + 1
    # 1 when the family size is exactly one, else 0.
    data["IsAlong"] = data["Family"].eq(1).astype("int64")
    # 1 when Parch is non-zero, else 0.
    data["IsMarige"] = data["Parch"].ne(0).astype("int64")

    # The title is the text between the first comma and the following period of Name.
    data["Title"] = (
        data["Name"].str.split(",").str[1].str.split(".").str[0].str.strip()
    )
    # Listed titles are merged into a single "Other" bucket.
    data["Title_Bin"] = data["Title"].replace(
        ["Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Mlle", "Mme",
         "Ms", "Rev", "Sir", "the Countess", "Dona"],
        "Other",
    )

    # Missing cabins get the placeholder "N"; IsCabin flags rows that had a cabin.
    data["Cabin"] = data["Cabin"].fillna("N")
    data["IsCabin"] = data["Cabin"].ne("N").astype("int64")

    # First character of the ticket; listed prefixes collapse to "new1" and "P" to "1".
    data["Ticket_L"] = data["Ticket"].str[0]
    grouped_prefixes = data["Ticket_L"].replace(
        ["4", "5", "6", "7", "8", "9", "A", "F", "L", "W", "C", "S"], "new1"
    )
    data["Ticket_L_Bin"] = grouped_prefixes.replace("P", "1")

In [5]:
# Seeded generator so the jittered age imputation below gives the same values
# after every Restart & Run All.
rng = np.random.default_rng(42)


def _fill_with_group_mode(values):
    """Fill missing entries with the group's most frequent value.

    A group with no observed value at all is returned unchanged instead of
    raising (``mode()`` is empty in that case).
    """
    modes = values.mode()
    return values.fillna(modes.iloc[0]) if len(modes) else values


# `transform` (rather than `apply`) is used throughout: its result always keeps
# the index of the original frame, so the assignment aligns row by row. A
# transform-like `apply` is indexed by the group keys on newer pandas versions
# and no longer lines up with the frame.
train["Embarked"] = train.groupby(
    ["IsAlong", "Survived", "Pclass", "Sex", "Title_Bin", "Ticket_L_Bin"]
)["Embarked"].transform(_fill_with_group_mode)
# Fallback for groups that had no observed port: Embarked is a grouping key of
# the age imputation below and must not contain missing values there.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode().iloc[0])

test["Fare"] = test.groupby(
    ["IsAlong", "Pclass", "Sex", "Title_Bin", "Ticket_L_Bin"]
)["Fare"].transform(lambda fares: fares.fillna(fares.median()))

# Age is imputed in three passes, from the most specific grouping to the
# coarsest. Each pass fills with the group mean plus one normal draw per group
# (std given below); groups whose mean is undefined stay missing and are
# handled by the next, coarser pass.
age_imputation_passes = [
    (["IsAlong", "Pclass", "Sex", "Title_Bin", "Ticket_L_Bin", "Embarked"], 2.0),
    (["IsAlong", "Pclass", "Sex", "Ticket_L_Bin"], 2.0),
    (["IsAlong", "Sex"], 1.5),
]
for data in data_cleaner:
    for group_columns, noise_std in age_imputation_passes:
        data["Age"] = data.groupby(group_columns)["Age"].transform(
            lambda ages, std=noise_std: ages.fillna(ages.mean() + rng.normal(0, std))
        )

    data["Fare_Log"] = np.log1p(data["Fare"]).astype("float")

In [6]:
# Bin edges are learned on the training frame only and then reused for the
# test frame, so a given bin label covers the same value range in both.
# (Binning each frame independently gives every frame its own edges.)
fare_edges = np.array(pd.qcut(train.Fare, q=5, retbins=True)[1], dtype=float)
fare_log_edges = np.array(pd.cut(train.Fare_Log, bins=4, retbins=True)[1], dtype=float)
age_edges = np.array(pd.cut(train.Age, bins=4, retbins=True)[1], dtype=float)

# Open the outermost bins so test values outside the training range still get
# a label instead of NaN. Training rows keep exactly the labels they had.
for edges in (fare_edges, fare_log_edges, age_edges):
    edges[0], edges[-1] = -np.inf, np.inf

for data in data_cleaner:
    data["Fare_Bin"] = pd.cut(data.Fare, bins=fare_edges, labels=range(1, 6))
    data["Fare_Log_Bin"] = pd.cut(data.Fare_Log, bins=fare_log_edges, labels=range(1, 5))
    data["Age_Bin"] = pd.cut(data.Age, bins=age_edges, labels=range(1, 5))

In [7]:
# Band the family size: 1 -> "a", 2-4 -> "b", 5-8 and 11 -> "c".
# Sizes that are not listed are left as they are.
for data in data_cleaner:
    data["Family_Bin"] = data["Family"].replace(
        {
            1: "a",
            **dict.fromkeys([2, 3, 4], "b"),
            **dict.fromkeys([5, 6, 7, 8, 11], "c"),
        }
    )

In [8]:
# Stack train on top of test. `ignore_index=True` gives the combined frame a
# fresh 0..n-1 index; without it the row labels of the test frame repeat those
# of the train frame, which makes any label-based selection ambiguous.
titanic = pd.concat([train, test], axis=0, ignore_index=True)
target_feature = "Survived"
# Drop identifiers, the label and the raw text columns that were already
# turned into engineered features. Rebinding (instead of inplace=True) keeps
# the cell safe to re-run.
titanic = titanic.drop(
    columns=["PassengerId", "Survived", "Name", "Ticket", "Cabin", "Title", "Ticket_L"]
)

In [9]:
# Sort every column of `titanic` into one of four buckets of column names.
# The buckets are rebuilt from scratch so that re-running the cell does not
# append the same names a second time.
cat_features, ord_features, bool_features, num_features = [], [], [], []

for col in titanic.columns:
    col_dtype = titanic[col].dtype
    if col_dtype == "object":
        # Free-text / string columns become nominal categoricals.
        titanic[col] = titanic[col].astype("category")
        cat_features.append(col)
    elif col in ["Pclass", "Fare_Bin", "Fare_Log_Bin", "Age_Bin"]:
        titanic[col] = titanic[col].astype("category")
        ord_features.append(col)
    elif isinstance(col_dtype, pd.CategoricalDtype):
        # Already converted by a previous run of this cell: still nominal.
        cat_features.append(col)
    elif titanic[col].nunique() == 2:
        # Measured on `titanic` itself; the previous version read the loop
        # variable `data` left behind by an earlier cell.
        bool_features.append(col)
    else:
        num_features.append(col)

In [10]:
# Encoder instances created with their default settings.
labler, ohe = LabelEncoder(), OneHotEncoder()

In [11]:
# Materialise one frame per feature bucket.
ohe_features = pd.get_dummies(titanic[cat_features], drop_first=True)
order_features = titanic[ord_features]
# `bool_features` / `num_features` start out as lists of column names and are
# rebound to DataFrames here. `list(...)` yields the column names in both
# cases, so the cell can be re-run without indexing `titanic` by a DataFrame.
bool_features = titanic[list(bool_features)]
num_features = titanic[list(num_features)]

In [43]:
# Side-by-side join of the boolean, ordinal, numeric and (not yet one-hot
# encoded) categorical feature frames.
BIG = pd.concat(
    [bool_features, order_features, num_features, titanic[cat_features]],
    axis="columns",
)

In [13]:
def get_best_features(X, clf = RandomForestClassifier(), best_features = None, best_score = 0.6,
                      y = None, categorical = None, cv = 10):
    """Greedy forward feature selection scored by cross-validated accuracy.

    In every round each column that is not selected yet is added, one at a
    time, to the current selection; the trial frame is one-hot encoded
    (categorical columns only, first level dropped), min-max scaled and scored
    with ``cross_val_score``. The column with the highest mean accuracy is
    kept if it beats the best score so far, otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one row per sample.
    clf : estimator
        Classifier passed to ``cross_val_score``.
    best_features : list of str, optional
        Columns to start from. The list is copied, never modified.
    best_score : float
        Mean accuracy a candidate has to exceed to be accepted.
    y : array-like, optional
        Labels aligned with the rows of ``X``. Defaults to the notebook-level
        ``target``.
    categorical : iterable of str, optional
        Names of the columns that must be one-hot encoded. Defaults to the
        notebook-level ``cat_features``.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    list of str
        Selected column names in the order they were accepted.
    """
    # Fall back to the notebook globals so existing calls keep working.
    y = target if y is None else y
    categorical = set(cat_features if categorical is None else categorical)
    selected = list(best_features) if best_features else []

    for _ in range(len(X.columns)):
        candidates = [col for col in X.columns if col not in selected]
        if not candidates:
            # Every column is already selected; nothing left to score.
            break

        fold_scores = {}
        for col in candidates:
            trial_columns = selected + [col]
            to_encode = [name for name in trial_columns if name in categorical]
            trial = X[trial_columns]
            if to_encode:
                trial = pd.get_dummies(trial, columns=to_encode, drop_first=True)
            # The frame is scaled as a whole, so the row count always comes
            # from X itself rather than from a global.
            x = MinMaxScaler().fit_transform(trial)
            print(x.shape)
            fold_scores[col] = cross_val_score(clf, x, y, cv=cv, scoring="accuracy")

        ranking = pd.DataFrame(fold_scores).mean().sort_values()
        print(ranking)
        top_feature, top_score = ranking.index[-1], ranking.iloc[-1]
        if top_score <= best_score:
            # No candidate improves on the current selection.
            break
        best_score = top_score
        print(best_score)
        print(top_feature)
        print()
        selected.append(top_feature)
    return selected

In [14]:
# Forward selection on the training rows only. The forest is seeded so the
# selected feature list is the same after every Restart & Run All.
get_best_features(BIG.iloc[:split_index], clf=RandomForestClassifier(random_state=42))

(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 1)
(891, 2)
(891, 4)
(891, 3)
(891, 2)
IsAlong         0.597203
Age_Bin         0.616167
IsMarige        0.621835
Parch           0.621835
SibSp           0.629713
Age             0.632946
Embarked        0.638639
Family          0.667890
Family_Bin      0.667890
Fare_Log_Bin    0.668976
Pclass          0.669001
Fare_Bin        0.672397
Ticket_L_Bin    0.680187
IsCabin         0.692509
Fare            0.694732
Fare_Log        0.694744
Title_Bin       0.785643
Sex             0.786729
dtype: float64
0.786729088639201
Sex

(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 2)
(891, 3)
(891, 5)
(891, 4)
(891, 3)
Pclass          0.774444
Fare_Bin        0.780062
Age             0.783408
IsAlong         0.786729
IsMarige        0.786729
IsCabin         0.786729
Fare_Log_Bin    0.786729
Age_Bin         0.7867

['Sex', 'Fare', 'Pclass', 'Age', 'IsCabin', 'Title_Bin']

In [44]:
# Remove the columns that are not fed to the final model.
# Rebinding plus errors="ignore" keeps the cell idempotent: running it a second
# time on the already reduced frame is a no-op instead of a KeyError.
BIG = BIG.drop(
    columns=["Fare_Bin", "Fare_Log_Bin", "Age_Bin",
             "SibSp", "Parch", "Fare", "Family",
             "IsMarige", "IsCabin", "IsAlong"],
    errors="ignore",
)

In [49]:
# One-hot encode the nominal columns (first level dropped) on the combined
# frame so train and test end up with identical columns.
X = pd.get_dummies(BIG, columns=["Sex", "Embarked", "Title_Bin", "Ticket_L_Bin", "Family_Bin"], drop_first=True)

# Fit the scaler on the training rows only and apply that same mapping to the
# test rows. Fitting a second scaler on the test rows would map the two splits
# with different min/max values, so a threshold learned on X_TRAIN would mean
# something else on X_TEST.
feature_scaler = MinMaxScaler().fit(X.iloc[:split_index])
X_TRAIN = feature_scaler.transform(X.iloc[:split_index])
X_TEST = feature_scaler.transform(X.iloc[split_index:])

In [58]:
# Randomised hyper-parameter search for the XGBoost classifier.
# Both the estimator and the search are seeded so that the sampled candidates,
# the best score and the best parameters are the same on every run.
xgb = XGBClassifier(random_state=42)
p = {"n_estimators": range(20, 140, 7),
     "max_depth": range(3, 16, 2),
     "learning_rate": [0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2],
     "gamma": [0, 0.5, 1.0, 1.5, 2.0],
     "reg_alpha": [0, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0, 1.1],
     "reg_lambda": [0, 0.1, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0, 1.1]}
# 700 sampled parameter sets, each scored by 5-fold cross-validated accuracy.
grid = RandomizedSearchCV(xgb, p, cv=5, n_jobs=-1, n_iter=700, scoring="accuracy",
                          random_state=42)
st = datetime.now()
grid.fit(X_TRAIN, target)
print(datetime.now()-st)  # wall-clock duration of the search
print(grid.best_score_)
print(grid.best_params_)
# Best parameter set, refitted on the full training data by the search.
best_xgb = grid.best_estimator_

0:01:45.202516
0.8406503044378884
{'reg_lambda': 0.5, 'reg_alpha': 0.2, 'n_estimators': 132, 'max_depth': 3, 'learning_rate': 0.15, 'gamma': 0}


In [59]:
# Predict on the test features and write a two-column CSV
# (PassengerId, Survived) with no extra index column.
submission = pd.DataFrame({"PassengerId": Id, "Survived": best_xgb.predict(X_TEST)})
submission.to_csv("suka_sub_ebal.csv", index=False)