In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pickle


sns.set()

In [4]:
test_path = "test.csv"
train_path = "train.csv"

x_test = pd.read_csv(test_path)
x_train = pd.read_csv(train_path)

In [5]:
x_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
x_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
categorical = []
numerical = []

for column in x_train.columns:
  if x_train[column].dtype == "object":
    categorical.append(column)
  else:
    numerical.append(column)

In [9]:
values = x_train["Survived"].value_counts()

In [10]:
def extract_first(x):
  if str(x)[0:3] != "nan":
    return str(x)[0]

x_train["Cabin_first"] = x_train.Cabin.apply(lambda x: extract_first(x))

In [11]:
x_train = x_train.drop("Cabin_first", axis = 1)

In [12]:
x_train["Ticket_first"] = x_train["Ticket"].apply(lambda x: str(x)[0])

In [13]:
x_train = x_train.drop("Ticket_first", axis = 1)

In [14]:
x_train["Title"] = x_train['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [15]:
x_train = x_train.drop("Title", axis = 1)

In [16]:
data1 = x_train.copy()
data2 = x_test.copy()

In [17]:
def missing_val_table(data):
    missing_val = data.isnull().sum()
    missing_val_perc = 100 * data.isnull().sum() / len(data)
    table = pd.concat([missing_val, missing_val_perc], axis=1)
    table = table.rename(columns = {0:"Missing Values",1:"% of Total Values"})
    table = table.sort_values(by="% of Total Values",ascending=False)
    return table

missing_val_table(data1)

Unnamed: 0,Missing Values,% of Total Values
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0


In [18]:
data1["Age"] = data1["Age"].fillna(data1["Age"].mean())
data2["Age"] = data2["Age"].fillna(data2["Age"].mean())

data1["Embarked"] = data1["Embarked"].fillna(data1["Embarked"].mode()[0])
data2["Embarked"] = data2["Embarked"].fillna(data2["Embarked"].mode()[0])

In [19]:
def assign_label_cabin(cabin):
    if cabin in ["D", "E", "B", "F", "C"]:
        return "Cabin_high"
    elif cabin in ["G", "A"]:
        return "Cabin_middle"
    else:
        return "Cabin_low"

data1["Cabin"] = data1["Cabin"].apply(lambda x: str(x)[0])
data2["Cabin"] = data2["Cabin"].apply(lambda x: str(x)[0])

data1["Cabin_first"] = data1["Cabin"].apply(lambda x: assign_label_cabin(x))
data2["Cabin_first"] = data2["Cabin"].apply(lambda x: assign_label_cabin(x))

data1 = data1.drop("Cabin", axis = 1)
data2 = data2.drop("Cabin", axis = 1)

In [20]:
def fare_bound(x):
  x = float(x)
  if (x > 384.247) & (x <= 512.329):
    return "Very High Fare"
  elif (x > 256.165) & (x <= 384.247):
    return "High Fare"
  elif (x > 128.082) & (x <= 256.165):
    return "Normal Fare"
  else:
    return "Low Fare"

data1["Fare_cat"] = data1["Fare"].apply(lambda x: fare_bound(x))
data2["Fare_cat"] = data2["Fare"].apply(lambda x: fare_bound(x))

data1 = data1.drop("Fare", axis = 1)
data2 = data2.drop("Fare", axis = 1)

In [21]:
def label_ticket(x):
    if x in ["F", "1", "P", "9"]:
        return "Ticket_high"
    elif x in ["S", "C", "2"]:
        return "Ticket_middle"
    else:
        return "Ticket_low"

data1["Ticket"] = data1["Ticket"].apply(lambda x: str(x)[0])
data2["Ticket"] = data2["Ticket"].apply(lambda x: str(x)[0])

data1["Ticket_cat"] = data1["Ticket"].apply(lambda x: label_ticket(x))
data2["Ticket_cat"] = data2["Ticket"].apply(lambda x: label_ticket(x))


data1 = data1.drop("Ticket", axis = 1)
data2 = data2.drop("Ticket", axis = 1)

In [22]:
def assign_label_title(title):
    if title in ["the Countess", "Mlle", "Lady", "Ms", "Sir", "Mme", "Mrs", "Miss", "Master"]:
        return "Title_high"
    elif title in ["Major", "Col", "Dr"]:
        return "Title_middle"
    else:
        return "Title_low"

data1["Title"] = data1['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
data2["Title"] = data2['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

data1["Title"] = data1["Title"].apply(lambda x: assign_label_title(x))
data2["Title"] = data2["Title"].apply(lambda x: assign_label_title(x))

data1 = data1.drop("Name", axis = 1)
data2 = data2.drop("Name", axis = 1)

In [23]:
data1["family_size"] = data1["SibSp"] + data1["Parch"]
data2["family_size"] = data2["SibSp"] + data2["Parch"]

In [24]:
def family_label(family_size):
    if family_size == 0:
        return "Alone"
    elif family_size <=3:
        return "Small_family"
    else:
        return "Big_family"

data1["family_size"] = data1["family_size"].apply(lambda x: family_label(x))
data2["family_size"] = data2["family_size"].apply(lambda x: family_label(x))

data1 = data1.drop("SibSp", axis=1)
data1 = data1.drop("Parch", axis =1)

data2 = data2.drop("Parch", axis =1)
data2 = data2.drop("SibSp", axis =1)

In [25]:
data1 = data1.drop("PassengerId", axis = 1)
data2 = data2.drop("PassengerId", axis = 1)

In [26]:
data1_new = data1.copy()
data2_new = data2.copy()

In [27]:

labelEncoder = LabelEncoder()
data1_new["Sex"] = labelEncoder.fit_transform(data1[["Sex"]].values.ravel())
data2_new["Sex"] = labelEncoder.fit_transform(data2_new[["Sex"]].values.ravel())

In [28]:
data1_new = pd.get_dummies(columns=["Pclass", "Embarked", "Ticket_cat", "Fare_cat","Cabin_first","Title", "family_size"], data=data1_new, drop_first=True)
data2_new = pd.get_dummies(columns=["Pclass", "Embarked", "Ticket_cat", "Fare_cat", "Cabin_first","Title", "family_size"], data=data2_new, drop_first=True)

In [29]:
train = data1_new.copy()
test = data2_new.copy() 

X = train.drop("Survived", axis = 1)
y = train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42, stratify=y)

### **Random Forest**

In [30]:
rf = RandomForestClassifier()

params = {'n_estimators': [100,300,500,700,1000],
          'max_depth': [3,5,7],
          'criterion':['entropy', 'gini'],
          'min_samples_leaf' : [1, 2, 3, 4, 5],
          'max_features':['auto'],
          'min_samples_split': [3, 5, 10],
          'max_leaf_nodes':[2,3,5,7],
          }

rf_cv = RandomizedSearchCV(rf, params, cv = 10, n_jobs=-1, verbose=2).fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
86 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/justin/miniconda3/envs/MachineLearning/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.


In [None]:
rf_cv.best_params_
best_rf_model = rf_cv.best_estimator_

In [None]:
rf_pred = rf_cv.predict(X_test)

In [None]:
Importance = pd.DataFrame({"Importance": best_rf_model.feature_importances_*100},
                         index = X_train.columns)
Importance.sort_values(by = "Importance", axis = 0,ascending = True)

In [None]:
last_model=RandomForestClassifier(max_depth=3, max_leaf_nodes=7, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=500).fit(X,y)

In [None]:
IDs = pd.read_csv(test_path)[["PassengerId"]].values
predictions = last_model.predict(test.values)

<a id="6"></a> <br>
# **Submission**

In [None]:
result_df = {'PassengerId': IDs.ravel(), 'Survived':predictions}
submission = pd.DataFrame(result_df)
display(submission.head())

In [None]:
submission.to_csv("titanic_sub.csv", index=False)