In [139]:
# https://www.kaggle.com/c/titanic

In [140]:
# Labelled passengers: this is the data the models are fitted on.
train <- read.csv(file = "train.csv", header = TRUE, stringsAsFactors = FALSE)

# Passengers used to check how the model behaves on unseen data.
test <- read.csv(file = "test.csv", header = TRUE, stringsAsFactors = FALSE)

In [141]:
# Flag every row with its origin (TRUE = training data, FALSE = test data)
# so the two sets can be told apart after they are combined.
train[["isTrain"]] <- TRUE
test[["isTrain"]] <- FALSE

In [142]:
# Give the test rows a placeholder $Survived column, filled with NA.
test$Survived <- NA

In [143]:
# Combine both data sets into one by stacking the test rows under the
# training rows (a vertical join).
full <- rbind(train, test)

In [144]:
# A few $Embarked entries are empty strings; clean them by setting them to "S".
full$Embarked[full$Embarked %in% ""] <- "S"

In [145]:
# $Fare has 1 missing value; fill it with the median of the known fares.
full$Fare <- replace(
  full$Fare,
  is.na(full$Fare),
  median(full$Fare, na.rm = TRUE)
)

# The median is enough here because only a single value is missing;
# fitting a model for it would be overkill.

# For the model-based approach (build a model & predict), see the next cell.

In [146]:
# 263 values are missing in $Age, so they have to be filled in.
# table(is.na(full$Age))
# Simple alternative (median imputation), kept for reference:
# full$Age[which(is.na(full$Age))] = median(full$Age, na.rm=T)

# Predict the missing values with lm(), fitted on the non-outlier ages only.
# The boxplot whisker extremes are themselves non-outlier observations, so
# the bounds below are inclusive.
ageStats <- boxplot.stats(full$Age)$stats
lowerWhisker <- ageStats[1]
upperWhisker <- ageStats[5]

# Both bounds must hold at the same time (previously the second assignment
# replaced the first, so the lower bound was never applied). Rows with a
# missing Age are excluded explicitly so the row index never contains NA.
outlierFilter <- !is.na(full$Age) &
  full$Age >= lowerWhisker &
  full$Age <= upperWhisker

ageEquation <- "Age ~ Pclass + Sex + Fare + SibSp + Parch + Embarked"
ageModel <- lm(
  formula = as.formula(ageEquation),
  data = full[outlierFilter, ]
)

# Predictor columns of the rows whose Age has to be imputed.
ageIsMissing <- is.na(full$Age)
ageRow <- full[
  ageIsMissing,
  c("Pclass", "Sex", "Fare", "SibSp", "Parch", "Embarked")
]

agePredictResult <- predict(ageModel, newdata = ageRow)

# Round each prediction up to the next half year.
agePredictResult <- ceiling(agePredictResult / 0.5) * 0.5

# A linear model is unbounded and can extrapolate to zero or negative ages
# (e.g. -1.5); floor the imputed values at the smallest positive half-year.
agePredictResult <- pmax(agePredictResult, 0.5)

full[ageIsMissing, "Age"] <- agePredictResult

In [147]:
# Treat the class, sex and port-of-embarkation columns as categorical
# variables by converting each of them to a factor.
full[c("Pclass", "Sex", "Embarked")] <- lapply(
  full[c("Pclass", "Sex", "Embarked")],
  as.factor
)

In [148]:
# Split the cleaned rows back into the two original sets using the
# isTrain flag.
train <- full[full$isTrain, ]
test <- full[!full$isTrain, ]

In [149]:
# The outcome is stored as a factor in the training set.
train[["Survived"]] <- as.factor(train[["Survived"]])

# Model formula: survival explained by the cleaned passenger attributes.
formula <- as.formula(
  "Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked"
)

In [150]:
# --- Model & Predict

In [151]:
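# Using lm()
# lm() fits a linear regression, so it is not suited to the two-class
# Survived outcome; it is kept commented out below for reference only.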

# model = lm(formula, train)

# result = predict(model, test)
# result = predict.lm(model, newdata=test)

In [152]:
# Fit a random forest on the training rows and predict the test rows.
library(randomForest)

# All tuning arguments are left at their defaults. Candidates that were
# considered: ntree = 500, mtry = 3, nodesize = nrow(test) * 0.01.
model <- randomForest(formula, data = train)

result <- predict(model, newdata = test)

# Show the predictions; every value is expected to be either 0 or 1.
result

In [153]:
# help(randomForest)

# Increase the accuracy
# "If you want to increase the accuracy of your model, increase the number of trees."

# From:
# https://www.keboola.com/blog/random-forest-regression

In [159]:
# Build the output table: one row per test passenger, holding the
# passenger id and the predicted $Survived value.
PassengerId <- test[["PassengerId"]]
outputDF <- data.frame(PassengerId)

outputDF[["Survived"]] <- result

# Show the table.
outputDF

PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
901,0


In [163]:
# Save the predictions to predict.csv, without the row-name column.
write.csv(x = outputDF, file = "predict.csv", row.names = FALSE)