In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import Dataset:

In [2]:
# Load the training split; later cells read its `Survived` column as the label.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
# Show the first rows as the cell output for a quick look at the columns.
train.head()

In [3]:
# Load the test split; its `PassengerId` column is reused when writing the submission.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Show the first rows as the cell output.
test.head()

In [4]:
# Load the sample submission file shipped with the competition data.
# NOTE(review): `gender` is not referenced by any later cell visible here.
gender = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
gender.head()

Analyse the target variable:

In [5]:
# Share of each outcome (0 = died, 1 = survived) among all training passengers,
# as a percentage rounded to two decimals.
total_passengers = len(train)
outcome_counts = train['Survived'].value_counts()
not_survived = round(outcome_counts[0] / total_passengers * 100, 2)
survived = round(outcome_counts[1] / total_passengers * 100, 2)

# Printed text is Russian: "% of passengers died" / "% of passengers survived".
print(f'{not_survived} % пассажиров погибло')
print(f'{survived} % пассажиров выжило')

# Survival flags of the female and the male passengers, selected in one step.
female = train.loc[train['Sex'] == 'female', 'Survived']
male = train.loc[train['Sex'] == 'male', 'Survived']

# Printed text is Russian: "% of women survived" / "% of men survived".
female_survival_pct = round(sum(female) / len(female) * 100, 2)
male_survival_pct = round(sum(male) / len(male) * 100, 2)
print(female_survival_pct, '% женщин выжило')
print(male_survival_pct, '% мужчин выжило')

Variable information:

In [6]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

Survival assessment:

In [7]:
# One bar per value of `Survived`, showing how many training rows carry it.
sns.countplot(data=train, x='Survived')
plt.show()

Check numerical features:

In [8]:
# One row per numeric feature: density histogram with a KDE curve on the left,
# box plot on the right.
#
# `sns.distplot` is deprecated in seaborn, and the global warnings filter hides
# the notice. `histplot(kde=True, stat='density')` is the documented replacement
# that keeps the density scale. The box plot gets the series via `x=` explicitly,
# because a positional Series is interpreted differently across seaborn versions.
numeric_features = ['Age', 'Fare']
fig, axes = plt.subplots(len(numeric_features), 2, figsize=(14, 8))
for row, feature in enumerate(numeric_features):
    sns.histplot(train[feature], kde=True, stat='density', ax=axes[row, 0])
    sns.boxplot(x=train[feature], ax=axes[row, 1])
plt.show()

Plot the other variables against the target variable:

In [9]:
# For each categorical feature draw two panels side by side:
#   left  - number of survivors per category (sum of the 0/1 `Survived` flag),
#   right - row counts per category, split by `Survived`.
#
# The panels are generated in a loop instead of ten hand-numbered subplot calls,
# and the figure gets an explicit size: a 5x2 grid on the default-size figure
# leaves each panel too small to read.
categorical_features = ['Sex', 'Pclass', 'Embarked', 'Parch', 'SibSp']
fig, axes = plt.subplots(len(categorical_features), 2, figsize=(14, 22))
for (ax_survivors, ax_counts), feature in zip(axes, categorical_features):
    sns.barplot(x=feature, y='Survived', data=train, estimator=np.sum, ax=ax_survivors)
    sns.countplot(x=feature, data=train, hue='Survived', ax=ax_counts)
# Keep tick labels and legends of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

**Preparing data**

Check null values in the dataset:

In [10]:
# Per-column count of missing values in the training frame, plus the same
# figure as a percentage of all rows (two decimals).
missing_counts = train.isnull().sum()
null = pd.DataFrame({
    'null': missing_counts,
    'null_percentage': round(missing_counts / len(train) * 100, 2),
})
null

Impute the null values:

In [11]:
# Imputation statistics come from the training frame only and are applied to
# both frames, so the test frame is filled with train-derived values.
age_median = train['Age'].median()
fare_median = train['Fare'].median()
embarked_mode = train['Embarked'].mode()[0]

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`.
# That chained in-place form acts on a temporary object under pandas
# Copy-on-Write and leaves the frame unchanged, and the global warnings filter
# would hide the notice.
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Fare'] = test['Fare'].fillna(fare_median)

# Cabin is null in more than 77% of rows; imputing it would distort the dataset,
# so it is dropped along with Ticket and Name. PassengerId is removed from the
# training frame only: the test frame keeps it for the submission file.
train = train.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])
test = test.drop(columns=['Cabin', 'Ticket', 'Name'])

# Confirm the remaining columns and their non-null counts for both frames.
train.info()
print('*'*50)
test.info()

**Feature Engineering**

In [12]:
# family_size: siblings/spouses + parents/children + the passenger themself.
train['family_size'] = train['SibSp'] + train['Parch'] + 1
test['family_size'] = test['SibSp'] + test['Parch'] + 1

# single: 1 when the passenger travels alone (family_size of 1), otherwise 0.
# Each frame derives the flag from its OWN family_size. Computing the test
# column from the training frame would index-align train passengers' values
# onto unrelated test rows.
train['single'] = (train['family_size'] <= 1).astype(int)
test['single'] = (test['family_size'] <= 1).astype(int)

train.head()

Label encoding:

In [13]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

# Fit the encoder on the training column only and reuse that mapping for the
# test column, so a given category gets the same integer code in both frames.
# Re-fitting on the test set would silently produce a different mapping whenever
# the two frames do not contain exactly the same categories. With this form a
# category unseen in training makes `transform` raise instead.
for column in ['Sex', 'Embarked']:
    label.fit(train[column])
    train[column] = label.transform(train[column])
    test[column] = label.transform(test[column])

train.head()

In [14]:
# Show the first rows of the test frame after imputation, feature engineering and encoding.
test.head()

In [15]:
# Annotated heatmap of the pairwise correlations between the training columns.
correlations = train.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlations, annot=True, ax=ax)
plt.show()

Removing highly correlated features - SibSp, Parch:

In [16]:
# Remove the two raw family-count columns from both frames.
correlated_columns = ['SibSp', 'Parch']
train = train.drop(columns=correlated_columns)
test = test.drop(columns=correlated_columns)

**Model Data**

In [17]:
# Label vector and predictor matrix from the training frame.
y = train['Survived']
X = train.drop(columns=['Survived'])
# The test predictors exclude the identifier column, which is not a feature.
X_test = test.drop(columns=['PassengerId'])

X.head()

In [18]:
# Show the first rows of the test predictors for comparison with the X preview.
X_test.head()

In [19]:
from sklearn.linear_model import LogisticRegression

# The features are not scaled (e.g. Age and Fare next to 0/1 flags), so the
# solver may need more than the default 100 iterations to converge. The
# ConvergenceWarning that would signal this is hidden by the global warnings
# filter, so the iteration budget is raised explicitly.
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
# Predicted `Survived` class for every row of the test predictors.
prediction = model.predict(X_test)

In [20]:
# Pair each test passenger id with its predicted class and write the
# two-column submission file without the index.
output = test[['PassengerId']].assign(Survived=prediction)
output.to_csv('submission.csv', index=False)