# Import
Import the libraries that we are going to use throughout this project.

In [1]:
#import needed libraries
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Read Train and Test Data
Read the train data into TrainDF and the test data into TestDF,
then show the first five rows of each of them to check that the data loaded correctly,
then show some summary statistics for each data frame.

In [2]:
# Load the training set from train.csv (path is relative to the working directory).
TrainDF = pd.read_csv("train.csv")
# Preview the first five rows to confirm the file was parsed as expected.
TrainDF.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summary statistics of the training frame; the non-null counts per column
# also reveal which columns have missing values.
TrainDF.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Load the test set from test.csv (path is relative to the working directory).
TestDF = pd.read_csv("test.csv")
# Preview the first five rows to confirm the file was parsed as expected.
TestDF.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Summary statistics of the test frame; the non-null counts per column
# also reveal which columns have missing values.
TestDF.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


# check null vals
Check whether there are null values in the train or test data.

In [6]:
# Count the missing values in each column of the training data.
TrainDF.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Count the missing values in each column of the test data.
TestDF.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Fill null vals
After checking for null values in the train and test data, we found null values in the Age, Fare, Cabin, and Embarked columns. Only Age and Fare are among the features used for the model below, so we fill the empty cells in the Age and Fare columns of the train and test data with the median value of each column.

In [8]:
# Fill the missing Age and Fare values in both data sets.
# The median of each column is computed once from the training data and
# reused for the test data, so the test set is preprocessed with statistics
# learned from the training set only instead of from the test set itself.
for column in ("Age", "Fare"):
    train_median = TrainDF[column].median()
    TrainDF[column] = TrainDF[column].fillna(train_median)
    TestDF[column] = TestDF[column].fillna(train_median)

# Encoding
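Replace every male with 0 and every female with 1 in the Sex column in both the train and test data. Also replace the characters in the Embarked column with integer values in the train and test data, so that both columns are numeric. Note that Embarked is encoded here but is not included in the feature list used for the model below.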

In [9]:
# Integer codes for the two categorical columns.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

# Apply the same encoding to the training and the test frame:
# first the Sex column, then the Embarked column.
TrainDF = TrainDF.replace({'Sex': sex_codes}).replace({'Embarked': embarked_codes})
TestDF = TestDF.replace({'Sex': sex_codes}).replace({'Embarked': embarked_codes})

# Train, Validate, and predict
Get the target and features and define an appropriate model. Split the train data into training and validation data for both the target and features. Make predictions on the validation data and calculate the Mean Absolute Error on the validation data.
Fit the model on the full training data and finally predict the test data.

In [10]:
# Target to predict: the Survived column of the training data.
y = TrainDF["Survived"]
# Feature columns shared by the training and the test matrix.
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
# Build both feature matrices the same way from their source frames.
X_train, X_test = (pd.get_dummies(frame[features]) for frame in (TrainDF, TestDF))
# Random forest classifier; the fixed random_state keeps results repeatable.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)

**Evaluation**
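Evaluate by using Mean Absolute Error and Cross Validation (K-fold). Because both the true labels and the predictions are 0 or 1, the Mean Absolute Error equals the fraction of validation passengers that were misclassified.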

In [11]:
# Hold out part of the training data for validation, for both the features
# and the target; the fixed random_state makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X_train,
    y,
    random_state=0,
)
# Fit on the training split, then predict the held-out validation rows.
val_predictions = model.fit(train_X, train_y).predict(val_X)
# Mean absolute error between the true and predicted validation labels.
mae = mean_absolute_error(val_y, val_predictions)
print("Mean Absolute Error: ", mae)

Mean Absolute Error:  0.17488789237668162


In [12]:
# 10-fold cross-validation with shuffling; random_state fixes the fold assignment.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
# Accuracy of the model on each of the folds.
score = cross_val_score(
    model,
    X_train,
    y,
    cv=k_fold,
    n_jobs=1,
    scoring='accuracy',
)
print(score)
# Mean fold accuracy expressed as a percentage, rounded to two decimals.
mean_accuracy_pct = round(score.mean() * 100, 2)
print("Random Forest Score =", mean_accuracy_pct)

[0.81111111 0.82022472 0.82022472 0.79775281 0.85393258 0.79775281
 0.83146067 0.85393258 0.83146067 0.78651685]
Random Forest Score = 82.04


**fit and predict**

In [13]:
# Refit the random forest on the full training set, then predict the
# label of every row in the test feature matrix.
predictions = model.fit(X_train, y).predict(X_test)

# Produce predicted data
Make a CSV file of the predicted data which contains only two columns: the first one for PassengerId and the second called Survived. The Survived column has two values, 0 or 1: 0 indicates that the person did not survive and 1 indicates that the person survived.

In [14]:
# Pair each test passenger id with its predicted label.
submission_columns = {'PassengerId': TestDF["PassengerId"], 'Survived': predictions}
output = pd.DataFrame(submission_columns)
# Write the two columns to disk without the data frame index.
output.to_csv('predictions.csv', index=False)
print("New prediction CSV File Is Now Ready")

New prediction CSV File Is Now Ready
