# Importing the Libraries

In [None]:
import numpy as np
np.set_printoptions(threshold=np.inf)
import pandas as pd
import matplotlib.pyplot as plt

# Importing Dataset

In [None]:
# Load the training and test splits from CSV files in the working directory.
titanic_train, titanic_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Analysing Data

In [None]:
# Show the dtype of every column in the training frame.
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
# Summary statistics for the numeric columns of the training frame.
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Summary statistics for the numeric columns of the test frame.
titanic_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


# Finding Missing Values

In [None]:
# Count missing values per column in the training frame.
titanic_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Count missing values per column in the test frame.
titanic_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Dropping redundant features

In [None]:
# Remove the columns that are not used as model features.
redundant_cols = ['Name', 'Cabin', 'Ticket']
titanic_train = titanic_train.drop(columns=redundant_cols)

In [None]:
# Remove the columns that are not used as model features.
redundant_cols = ['Name', 'Cabin', 'Ticket']
titanic_test = titanic_test.drop(columns=redundant_cols)

# Converting Dataframes to multidimensional arrays

In [None]:
# Feature matrix for the training set: every column except the one at
# position 1 (the target), i.e. column 0 followed by columns 2 onward.
feature_frame_parts = (titanic_train.iloc[:, :1], titanic_train.iloc[:, 2:])
X_1, X_2 = (part.values for part in feature_frame_parts)
X_train = np.concatenate([X_1, X_2], axis=1)

# Survived column (position 1) stored as a 1-D target vector
y_train = titanic_train.iloc[:, 1].values

In [None]:
# Feature matrix for the test set: all columns of the test frame
X_test = titanic_test.values

# Handling missing data in training and test set

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values in column 3 (Age) of the training matrix with the
# median of that column; the fitted imputer is kept in `imputer_age`.
imputer_age = SimpleImputer(missing_values=np.nan, strategy='median')
age_column = X_train[:, 3:4]
X_train[:, 3:4] = imputer_age.fit_transform(age_column)

In [None]:
# Fill missing values in the last column (Embarked) of the training matrix
# with that column's most frequent value.
imputer_embarked = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
embarked_column = X_train[:, -1:]
X_train[:, -1:] = imputer_embarked.fit_transform(embarked_column)

In [None]:
# Fill missing age values in the test set with the median learned from the
# TRAINING set. The imputer is deliberately not re-fitted here: fitting on
# test data would leak test statistics into preprocessing and make the test
# set be treated differently from the data the model was trained on.
X_test[:,3:4] = imputer_age.transform(X_test[:,3:4])

In [None]:
# Fill missing fare values in the test set with the mean fare.
# The mean is learned from the training set's fare column (second-to-last
# column, same layout as the test matrix at this point) so that no test-set
# statistics are used during preprocessing.
imputer_fare = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer_fare.fit(X_train[:,-2:-1])
X_test[:,-2:-1] = imputer_fare.transform(X_test[:,-2:-1])

# Encoding Categorical Data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the column at index 2 (gender) of the training matrix;
# every other column is passed through unchanged.
gender_encoder_spec = [('encoder', OneHotEncoder(), [2])]
ct_gender = ColumnTransformer(transformers=gender_encoder_spec, remainder='passthrough')
X_train = np.array(ct_gender.fit_transform(X_train))

In [None]:
# One-hot encode the last column (Embarked) of the training matrix;
# every other column is passed through unchanged.
embarked_encoder_spec = [('encoder', OneHotEncoder(), [-1])]
ct_embarked = ColumnTransformer(transformers=embarked_encoder_spec, remainder='passthrough')
X_train = np.array(ct_embarked.fit_transform(X_train))

In [None]:
# OneHotEncoding gender in test set.
# Use transform (not fit_transform): the encoder was already fitted on the
# training set, and re-fitting on test data could produce a different set or
# order of one-hot columns than the model was trained on.
X_test = np.array(ct_gender.transform(X_test))

In [None]:
# OneHotEncoding Embarked in test set.
# Use transform (not fit_transform): the encoder was already fitted on the
# training set, and re-fitting on test data could produce a different set or
# order of one-hot columns than the model was trained on.
X_test = np.array(ct_embarked.transform(X_test))

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Bringing features on same scale for both sets.
# Scaled columns: index 5 and every column from index 7 onward (index 6 is
# left unscaled, as before). StandardScaler standardises each column
# independently, so fitting all selected columns at once gives the same
# training values as fitting them in separate calls.
scaled_cols = [5] + list(range(7, X_train.shape[1]))

# Learn mean/std from the training set only ...
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
# ... and apply those same statistics to the test set. Re-fitting on the test
# set would standardise train and test with different parameters and leak
# test-set statistics into preprocessing.
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])


In [None]:
# Cast both feature matrices to float64 arrays
X_train, X_test = (
    feature_matrix.astype('float64') for feature_matrix in (X_train, X_test)
)

# Training Random Forest on training set

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 entropy-criterion trees; random_state is fixed so the
# fit is reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# KNN
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
# classifier.fit(X_train,y_train)

# Logistic Regression
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression(random_state = 0)
# classifier.fit(X_train,y_train)

# Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

# Kernel SVM
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'rbf', random_state = 0)
# classifier.fit(X_train, y_train)

# Predicting on the Test Set with Random Forest

In [None]:
# Predict labels for the test feature matrix.
# NOTE: despite its name, `y_test` holds the model's predictions, not
# ground-truth labels.
y_test = classifier.predict(X_test)

# Accuracy (on the training set)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted model on the data it was trained on, as a percentage
# rounded to two decimals. This is a training score, not a held-out estimate.
# The result is stored under its own name so the fitted `classifier` is not
# overwritten with a float and the prediction cell stays re-runnable.
train_accuracy = round(classifier.score(X_train, y_train) * 100, 2)
train_accuracy

97.76

# Creating CSV File

In [None]:
# Pair each test passenger id with its predicted label.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": y_test,
}
validator = pd.DataFrame(submission_columns)

In [None]:
# Write the predictions to a CSV file without the DataFrame index column.
validator.to_csv('submission_randomForest.csv', index=False)