In [83]:
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidytext)
library(caret)
library(glmnet)
library(xgboost)
library(doMC)
library(stringr)
library(Matrix)

In [84]:
# Load the training and test tables from the input directory.
input_paths <- c(train = "../input/train.csv", test = "../input/test.csv")
df_train <- read_csv(input_paths[["train"]])
df_test <- read_csv(input_paths[["test"]])

Parsed with column specification:
cols(
  PassengerId = col_double(),
  Survived = col_double(),
  Pclass = col_double(),
  Name = col_character(),
  Sex = col_character(),
  Age = col_double(),
  SibSp = col_double(),
  Parch = col_double(),
  Ticket = col_character(),
  Fare = col_double(),
  Cabin = col_character(),
  Embarked = col_character()
)
Parsed with column specification:
cols(
  PassengerId = col_double(),
  Pclass = col_double(),
  Name = col_character(),
  Sex = col_character(),
  Age = col_double(),
  SibSp = col_double(),
  Parch = col_double(),
  Ticket = col_character(),
  Fare = col_double(),
  Cabin = col_character(),
  Embarked = col_character()
)


In [98]:
# Inspect the data: per-column summary of the training frame, followed by
# the (rows, columns) dimensions of the training and test frames.
summary(df_train)
dim(df_train)
dim(df_test)

  PassengerId       Survived      Pclass      Name               Sex     
 Min.   :  1.0   Min.   :0.0000   1:216   Length:891         female:314  
 1st Qu.:223.5   1st Qu.:0.0000   2:184   Class :character   male  :577  
 Median :446.0   Median :0.0000   3:491   Mode  :character               
 Mean   :446.0   Mean   :0.3838                                          
 3rd Qu.:668.5   3rd Qu.:1.0000                                          
 Max.   :891.0   Max.   :1.0000                                          
                                                                         
      Age            SibSp           Parch           Ticket         
 Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
 1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
 Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
 Mean   :29.70   Mean   :0.523   Mean   :0.3816                     
 3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000                

In [86]:
# Derive the first character of the cabin code; a missing cabin becomes the
# literal string "NA" so that it forms its own category.
df_train$Cabin_letter <- ifelse(is.na(df_train$Cabin), "NA", str_sub(df_train$Cabin, 1, 1))

# Recode missing categorical values to the string "NA".
# Pclass is parsed as a double, and replace_na() requires the replacement to be
# type-compatible with the column, so Pclass is cast to character first.
df_train <- df_train %>%
    mutate(Pclass = as.character(Pclass)) %>%
    replace_na(list(Pclass = "NA", Sex = "NA", Cabin = "NA", Embarked = "NA"))

In [87]:
# Convert the named columns of a data frame to factors.
#
# Args:
#   df:       data frame (or tibble) to modify.
#   var_list: list (or vector) of column names to convert.
#
# Returns:
#   `df` with each listed column replaced by factor(<column>); all other
#   columns are left untouched.
factorize <- function(df, var_list) {
    to_factor <- function(frame, column) {
        frame[[column]] <- factor(frame[[column]])
        frame
    }
    # Fold over the column names, threading the updated frame through.
    Reduce(to_factor, var_list, df)
}

# Turn the categorical predictors into factors.
categorical_columns <- list("Pclass", "Sex", "Cabin", "Embarked")
df_train <- factorize(df_train, categorical_columns)


In [88]:
# Add a cross-validated glmnet prediction for a categorical predictor.
#
# The right-hand side `varName` is expanded into a sparse dummy matrix, a
# glmnet classifier is fitted with 5-fold cross-validation through caret, and
# the saved hold-out probability of class "Y" for every row is stored in a new
# column named "<varName>_lmfit".
#
# Args:
#   varName: character scalar; column name (or formula right-hand side) in `df`.
#   y_train: two-valued outcome with one entry per row of `df`; its two sorted
#            values are relabelled "N" and "Y".
#   df:      data frame containing the variables referenced by `varName`.
#   alpha:   glmnet alpha value(s) for the tuning grid.
#   lambda:  glmnet lambda value(s) for the tuning grid.
#
# Returns:
#   `df` with the additional "<varName>_lmfit" column.
categorical_to_lmfit <- function(varName, y_train, df, alpha=0, lambda=0) {

    stopifnot(is.character(varName), length(varName) == 1L,
              length(y_train) == nrow(df))

    equation <- as.formula(paste("~", varName))
    used_vars <- all.vars(equation)
    stopifnot(all(used_vars %in% names(df)))
    # The model matrix drops rows with missing values, which would misalign
    # the predictions with the rows of `df`; fail loudly instead.
    if (any(vapply(df[used_vars], anyNA, logical(1)))) {
        stop("Missing values in '", varName, "'; recode them before encoding.",
             call. = FALSE)
    }

    y_train <- factor(y_train, labels = c("N", "Y"))
    trCon <- trainControl(method = "cv", number = 5, returnData = FALSE,
                          savePredictions = "final", classProbs = TRUE)
    glmnet_grid <- expand.grid(alpha = alpha, lambda = lambda)
    dummy <- sparse.model.matrix(equation, data = df)
    lm_fit <- train(dummy, y_train, method = "glmnet", trControl = trCon, tuneGrid = glmnet_grid)

    # Saved predictions come back in fold order; rowIndex restores row order.
    held_out <- arrange(lm_fit$pred, rowIndex)
    stopifnot(nrow(held_out) == nrow(df))

    varName_lmfit <- paste0(varName, "_lmfit")
    df[[varName_lmfit]] <- held_out$Y
    df
}


In [89]:
# Add a "<column>_lmfit" feature for each categorical predictor, one after
# another, always using the current Survived column as the outcome.
df_train <- Reduce(
    function(frame, column) categorical_to_lmfit(column, frame$Survived, frame),
    c("Pclass", "Sex", "Cabin_letter", "Embarked"),
    df_train
)

In [99]:
# Frequency of each level of the categorical columns.
table(df_train$Pclass)
table(df_train$Sex)
table(df_train$Embarked)
table(df_train$Cabin_letter)


  1   2   3 
216 184 491 


female   male 
   314    577 


  C  NA   Q   S 
168   2  77 644 


  A   B   C   D   E   F   G  NA   T 
 15  47  59  33  32  13   4 687   1 

In [7]:
# Per-column summary of the training frame.
summary(df_train)

  PassengerId       Survived      Pclass      Name               Sex     
 Min.   :  1.0   Min.   :0.0000   1:216   Length:891         female:314  
 1st Qu.:223.5   1st Qu.:0.0000   2:184   Class :character   male  :577  
 Median :446.0   Median :0.0000   3:491   Mode  :character               
 Mean   :446.0   Mean   :0.3838                                          
 3rd Qu.:668.5   3rd Qu.:1.0000                                          
 Max.   :891.0   Max.   :1.0000                                          
                                                                         
      Age            SibSp           Parch           Ticket         
 Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
 1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
 Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
 Mean   :29.70   Mean   :0.523   Mean   :0.3816                     
 3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000                

In [101]:
# Number of rows sharing each distinct Ticket value.
table(df_train$Ticket)


            110152             110413             110465             110564 
                 3                  3                  2                  1 
            110813             111240             111320             111361 
                 1                  1                  1                  2 
            111369             111426             111427             111428 
                 1                  1                  1                  1 
            112050             112052             112053             112058 
                 1                  1                  1                  1 
            112059             112277             112379             113028 
                 1                  1                  1                  1 
            113043             113050             113051             113055 
                 1                  1                  1                  1 
            113056             113059             113501             113503

In [134]:
# Build a sparse PassengerId-by-token count matrix from the Name column.
# Names are tokenised by unnest_tokens(), then counted per (passenger, token).
# bind_tf_idf() adds tf-idf columns, but the matrix is cast on the raw count
# `n`, so those columns do not influence the result.
train_names <- df_train %>%
    select(PassengerId, Name) %>%
    unnest_tokens(name, Name) %>%
    count(PassengerId, name) %>%
    bind_tf_idf(name, PassengerId, n) %>%
    cast_sparse(PassengerId, name, n)

# The matrix is later paired row-by-row with df_train$Survived, so put its
# rows in df_train order explicitly. This errors if a passenger has no tokens.
train_names <- train_names[as.character(df_train$PassengerId), , drop = FALSE]


In [124]:
# Alternative without caret (not run):
# text_lm_fit <- glmnet(train_names, factor(df_train$Survived), family = "binomial", alpha = 0)

# glmnet classifier (alpha = 0) on the name-token matrix, tuned over lambda
# in [0, 0.2] with 5-fold cross-validation. The hold-out predictions of the
# selected model are kept (savePredictions = "final").
text_lm_fit <- train(train_names, factor(df_train$Survived, labels = c("N", "Y")), method = "glmnet",
                     trControl = trainControl(method = "cv", number = 5, returnData = FALSE,
                                              savePredictions = "final", classProbs = TRUE),
                     tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 0.2, by = 0.001)))

In [122]:
# Scratch: print the sequence 0, 0.01, ..., 0.5 (presumably a candidate lambda
# grid -- TODO confirm), then show the predictions saved by the text model.
seq(0, 0.5, by = 0.01)
text_lm_fit$pred

alpha,lambda,pred,obs,rowIndex,N,Y,Resample
1,0.018,N,Y,858,0.7155298,0.2844702,Fold5
1,0.018,N,N,72,0.7452514,0.2547486,Fold2
1,0.018,N,N,842,0.8711009,0.1288991,Fold5
1,0.018,Y,Y,843,0.2182641,0.7817359,Fold5
1,0.018,N,N,869,0.8740628,0.1259372,Fold5
1,0.018,Y,Y,23,0.2044801,0.7955199,Fold4
1,0.018,N,N,884,0.8711009,0.1288991,Fold5
1,0.018,N,N,885,0.8711009,0.1288991,Fold5
1,0.018,Y,Y,888,0.2182641,0.7817359,Fold5
1,0.018,Y,Y,880,0.1966898,0.8033102,Fold5


In [None]:
# Store the text model's saved class-"Y" probability for every row; sorting by
# rowIndex puts the predictions back in the row order of the training data.
df_train[["text_fitted"]] <- text_lm_fit$pred %>%
    arrange(rowIndex) %>%
    pull(Y)
# predict(text_lm_fit, train_names, type = "prob")

In [None]:
# Preview the first rows and the column structure of the training frame.
head(df_train)
str(df_train)

In [None]:
# Assemble the numeric design matrix for the second-stage model: raw numeric
# columns plus the cross-validated probability features. The categorical
# features are named "<column>_lmfit" by categorical_to_lmfit(); the text
# feature is stored as text_fitted.
X_train <- df_train %>%
    select(Age, Fare, SibSp, Parch,
           Pclass_lmfit, Sex_lmfit, Cabin_letter_lmfit, Embarked_lmfit,
           text_fitted) %>%
    as.matrix()

In [None]:
# Boosted-tree classifier (caret "xgbTree") on X_train, evaluated with 5-fold
# cross-validation. Only the number of boosting rounds is tuned; every other
# hyper-parameter is held at a single fixed value.
xgb_model <- train(X_train, factor(df_train$Survived, labels = c("N", "Y")), method = "xgbTree",
                   trControl = trainControl(method = "cv", number = 5, returnData = FALSE,
                                            savePredictions = "final", classProbs = TRUE),
                   tuneGrid = expand.grid(nrounds = c(100, 500, 1000, 2000),
                                          max_depth = 6, eta = 0.3, gamma = 0, colsample_bytree = 1,
                                          min_child_weight = 1, subsample = 1))

In [None]:
# Scratch expression; evaluates to 2 (presumably a check that the kernel
# responds -- safe to delete).
1 + 1

In [None]:
# Number of missing values in each column of the training frame.
lapply(df_train, function(column) sum(is.na(column)))

In [None]:
# Register a multicore foreach backend, leaving one core free.
# detectCores() may return 1 or NA, so the worker count is clamped to >= 1.
# NOTE(review): this cell sits after the train() calls, so it presumably only
# affects code run afterwards -- consider moving it to the top of the notebook.
registerDoMC(cores = max(1L, detectCores() - 1L, na.rm = TRUE))