In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

## Data Analysis and Preprocessing:

In [5]:
# Survival Rate in Training Data
survived_stat = train_data['Survived'].value_counts()
print(survived_stat)
died = survived_stat[0]
survived = survived_stat[1]

print(f"{round((died / (died + survived)) * 100, 2)}% Passengers Died")
print(f"{round((survived / (died + survived)) * 100, 2)}% Passengers Survived")

0    549
1    342
Name: Survived, dtype: int64
61.62% Passengers Died
38.38% Passengers Survived


In [6]:
dependents = [feature for feature in list(train_data.columns) if feature not in ['Survived', 'PassengerId', 'Name']]
print(dependents)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [7]:
import matplotlib.pyplot as plt

# plt.style.use('seaborn')

# fig, axs = plt.subplots(len(dependents))

for index, feature in enumerate(dependents):
    print("\n", feature)
    
    print(train_data[feature].value_counts())
    feature_stat = dict(train_data[feature].value_counts())
    values = []
    rates = []
    for feature_val in feature_stat:
        values.append(feature_val)
        value = train_data.loc[train_data[feature] == feature_val]["Survived"]
        rates.append(sum(value) / len(value))
    
    print(pd.DataFrame(sorted(zip(values, rates))))
#     axs[index].plot(values, rates)
    


 Pclass
3    491
1    216
2    184
Name: Pclass, dtype: int64
   0         1
0  1  0.629630
1  2  0.472826
2  3  0.242363

 Sex
male      577
female    314
Name: Sex, dtype: int64
        0         1
0  female  0.742038
1    male  0.188908

 Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64
        0    1
0    0.42  1.0
1    0.67  1.0
2    0.75  1.0
3    0.83  1.0
4    0.92  1.0
..    ...  ...
83  70.00  0.0
84  70.50  0.0
85  71.00  0.0
86  74.00  0.0
87  80.00  1.0

[88 rows x 2 columns]

 SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64
   0         1
0  0  0.345395
1  1  0.535885
2  2  0.464286
3  3  0.250000
4  4  0.166667
5  5  0.000000
6  8  0.000000

 Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64
   0         1
0  0  0.343658
1  1  0.550847
2  2  0.500000
3  3  0.

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


### Handling NaN fields:

In [10]:
values = {"Age":-1, "Cabin": 'NA', "Embarked": 'NA', "Fare":-1}

In [11]:
# train = train_data.dropna(subset=['Age'])
train = train_data.fillna(value=values)

In [12]:
test = test_data.fillna(value=values)

In [13]:
#Adding Alone to train set
train['T_partner']=train["SibSp"]+train["Parch"]
train['Alone']=np.where(train['T_partner']>0, 0, 1)

test['T_partner']=test["SibSp"]+test["Parch"]
test['Alone']=np.where(test['T_partner']>0, 0, 1)

In [14]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,T_partner,Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,-1.0,1,2,W./C. 6607,23.4500,,S,3,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0,1


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
 12  T_partner    891 non-null    int64  
 13  Alone        891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [16]:
# test = test_data.fillna(value=values)

### Analysing attribute values for identifying categorical (and type of categorical) and numerical variables:

In [17]:
# Categorical Variables to be considered: Sex, Ticket, Cabin, Embarked
# All are nominal categorical variables, cabin may be considered ordinal, however some passengers have multiple cabin values,
# using dummies

### Preparing input data:

In [18]:
# Extracting independent output variable y
y = train["Survived"]

# Setting features to be used
features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "Alone", "Ticket"]

# features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Embarked"]


In [19]:
temp = train.copy()[features]
temp_test = test.copy()[features]

In [20]:
temp.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Alone,Ticket
0,3,male,22.0,7.25,S,0,A/5 21171
1,1,female,38.0,71.2833,C,0,PC 17599
2,3,female,26.0,7.925,S,1,STON/O2. 3101282
3,1,female,35.0,53.1,S,0,113803
4,3,male,35.0,8.05,S,1,373450


In [21]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(temp)

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [22]:
X_train = encoder.transform(temp)
X_test = encoder.transform(temp_test)

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
# scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
X_train.shape

(891, 1029)

## Training:

In [25]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

# model = RandomForestClassifier()
model = RandomForestClassifier(n_estimators=100, criterion='gini', ccp_alpha=0)
# model = AdaBoostClassifier(n_estimators=200, learning_rate=0.5)



In [26]:
scores = cross_val_score(model, X_train, y, cv=10)
print(scores)

[0.85555556 0.82022472 0.79775281 0.82022472 0.8988764  0.79775281
 0.78651685 0.76404494 0.84269663 0.80898876]


In [27]:
model.fit(X_train, y)
model.score(X_train, y)

0.9988776655443322

## Predictions:

In [28]:
predictions = model.predict(X_test)

In [29]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [30]:
output[['Survived']].value_counts()

Survived
0           292
1           126
dtype: int64