In [51]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import data_prep as dpr

In [52]:
# Load the raw train / test splits from the working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# y is the target variable ['Survived'], kept as a single-column DataFrame.
y = df_train['Survived'].to_frame()

# Training features: every column except the target.
X_train = df_train.drop(columns='Survived')

# X stacks the feature rows of 'train' on top of 'test' (original row labels are
# kept, so labels repeat). This is where we'll deal with the NaN values.
X = pd.concat([X_train, df_test])

y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB


In [53]:
# Preview the first five rows of the combined train+test feature frame.
X.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [54]:
# Number of distinct values per column (NaN is not counted), to gauge cardinality.
unique_values = X.nunique(axis=0)

print(unique_values)

PassengerId    1309
Pclass            3
Name           1307
Sex               2
Age              98
SibSp             7
Parch             8
Ticket          929
Fare            281
Cabin           186
Embarked          3
dtype: int64


In [55]:
# Data-quality report from the project-local `data_prep` helper. Judging by the
# recorded output it lists the NaN count / percentage per column and reports on
# constant-valued columns and duplicated records.
# NOTE(review): whether it mutates X is not visible from here -- confirm in data_prep.
dpr.data_faults(X)

             NaN Count  Percentage [%]
Column Name                           
Cabin             1014       77.463713
Age                263       20.091673
Embarked             2        0.152788
Fare                 1        0.076394

The DataFrame does not contain columns with uniformed values.

The DataFrame does not contain any dupplicated records.


In [56]:
# Columns that are not used as features: a row identifier, two free-text /
# high-cardinality columns, and Cabin (about 77% NaN in the data_faults report).
cols_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]

# df_test keeps PassengerId: the submission cell reads df_test.PassengerId, and
# dropping it here makes that cell fail on a fresh Restart & Run All.
df_test = df_test.drop(columns=[c for c in cols_to_drop if c != "PassengerId"], errors="ignore")

# Non-in-place drops with errors="ignore" keep this cell idempotent: running it a
# second time is a no-op instead of raising KeyError on already-removed columns.
X_train = X_train.drop(columns=cols_to_drop, errors="ignore")
X = X.drop(columns=cols_to_drop, errors="ignore")
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [43]:
# Partition the feature columns by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical.
catCols = X.select_dtypes(include='object').columns.tolist()
numCols = X.select_dtypes(exclude='object').columns.tolist()

print(f"There are {len(numCols)} numerical features:\n", numCols)
print(f"There are {len(catCols)} categorical features:\n", catCols)

There are 6 numerical features:
 ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
There are 5 categorical features:
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [44]:
# Unique values of all columns treated as categorical (including the discrete
# numeric ones: Pclass, SibSp, Parch).
col_cat = ["Pclass", "SibSp", "Parch", "Sex", "Embarked"]
unique_values = dpr.unique_values(X.loc[:, col_cat])

   Pclass  SibSp  Parch     Sex Embarked
0     1.0    0.0      0  female        C
1     2.0    1.0      1    male        Q
2     3.0    2.0      2     NaN        S
3     NaN    3.0      3     NaN      NaN
4     NaN    4.0      4     NaN      NaN
5     NaN    5.0      5     NaN      NaN
6     NaN    8.0      6     NaN      NaN
7     NaN    NaN      9     NaN      NaN


In [45]:
# 'Survived' values of the female passengers in the training set.
women = df_train.loc[df_train["Sex"] == "female", "Survived"]

# Share of those rows that survived. The value is a fraction (e.g. 0.74), even
# though the printed label says "%".
rate_women = women.sum() / len(women)

print("% of women who survived:", rate_women)


% of women who survived: 0.7420382165605095


In [49]:
# Baseline model: fit a random forest on a few features and write the predictions
# for the test split to 'submission.csv'.
# NOTE: this cell rebinds X and y, which earlier cells used for the combined
# feature frame and the target DataFrame.
y = df_train["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]

# One-hot encode the string column(s). Train and test are encoded separately, so
# the test matrix is forced onto the training columns (same set, same order):
# a dummy column absent from the test split is filled with 0, and a category
# that appears only in the test split is dropped.
X = pd.get_dummies(df_train[features])
X_test = pd.get_dummies(df_test[features]).reindex(columns=X.columns, fill_value=0)

# random_state is fixed so repeated runs produce the same fitted model.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# df_test must still contain PassengerId at this point; it is the key column of
# the submission file.
output = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
