In [7]:
import pandas as pd

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/gender_submission.csv")
merged_data = pd.merge(test,submission,on="PassengerId")
# Add 'Survived' column to test set to align with train
# test["Survived"] = None  

# Stack them on top of each other
data = pd.concat([train, merged_data], ignore_index=True)

print(data.head())
print(data.tail())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [8]:
meanAge = data["Age"].mean().__round__(1)
# data.fillna({"Age": meanAge}, inplace=True)
data.fillna({"Age": meanAge, "Cabin": 0}, inplace=True)
data.dropna(subset=["Fare","Embarked","Cabin"], inplace=True)
print(len(data))

1306


In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["Sex_Encoded"] = le.fit_transform(data["Sex"])
data_encoded = pd.get_dummies(data,columns=["Embarked"])
print(data_encoded.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin  Sex_Encoded  Embarked_C  \
0      0         A/5 21171   7.2500     0            1       False   
1      0          PC 17599  71.2833   C85            0        True   
2      0  STON/O2. 3101282   7.9250     0            0       False   
3      0            113803  53.1000  C12

In [17]:
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaled = standardScaler.fit_transform(data_encoded[["Fare","Pclass","Age"]])

data_encoded[["Fare","Pclass","Age"]] = pd.DataFrame(standardScaled,columns=["Fare","Pclass","Age"])
data_encoded.dropna(inplace=True)

In [31]:
from sklearn.model_selection import train_test_split
X = data_encoded[["Fare","Pclass","Age","Sex_Encoded"]]
y = data_encoded["Survived"]
X_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

prediction = model.predict(x_test)

In [33]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix
print("Accuracy Score: ",accuracy_score(y_test,prediction)*100)
print("Precision Score: ",precision_score(y_test,prediction)*100)
print("Recall Score: ",recall_score(y_test,prediction)*100)
print("F1 Score: ",f1_score(y_test,prediction)*100)
print("Confusion Matrix: ", confusion_matrix(y_test,prediction))

Accuracy Score:  87.6923076923077
Precision Score:  84.26966292134831
Recall Score:  80.64516129032258
F1 Score:  82.41758241758241
Confusion Matrix:  [[153  14]
 [ 18  75]]


In [34]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
import numpy as np

mse = mean_squared_error(y_test,prediction)
print("MAE: on average off by: ",mean_absolute_error(y_test,prediction)*100)
print("MSE: Squared mistake value: ",mse*100)
print("RMSE: Final Realistic Error: ",np.sqrt(mse)*100)


MAE: on average off by:  12.307692307692308
MSE: Squared mistake value:  12.307692307692308
RMSE: Final Realistic Error:  35.082320772281165
