In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [34]:
# Load the three Titanic CSV files in one pass: training set, test set, and
# the gender_submission file (used further down as a stand-in for test labels).
train_data, test_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [35]:
# Peek at the first five rows of the training frame
train_data.head(n=5)

In [36]:
# Column names, dtypes and non-null counts of the training frame as loaded
train_data.info()

In [37]:
# Show any rows whose PassengerId repeats an earlier row (an empty frame means no duplicates)
is_repeat = train_data.duplicated(subset=['PassengerId'])
train_data.loc[is_repeat]

In [38]:
# Count missing values per training column (largest first) and draw them as a bar chart
total = train_data.isna().sum().sort_values(ascending=False)
total.plot.bar(
    figsize=(12, 8),
    fontsize=20,
    xlabel="Columns",
    ylabel="Count",
    title="Total Missing Values",
)

So there are three columns with missing values: Age, Cabin, and Embarked.

For Age, I am going to fill the gaps with the closest non-NaN value.
For Cabin, because of the large number of missing values, it is better to drop this feature.
For Embarked, missing values can be replaced with the most frequent value.

In [39]:
# Summary of the Embarked column (count, unique values, most frequent value and its frequency)
train_data.loc[:, 'Embarked'].describe()

In [40]:
# imputing age
# Fill each missing Age with the nearest known value by row position.
# 'nearest' interpolation cannot fill gaps that lie before the first or after
# the last known value, so any NaN still left falls back to the column median
# (the same fallback the test-set preparation uses).
age_nearest = train_data['Age'].interpolate(method='nearest')
train_data['Age'] = age_nearest.fillna(train_data['Age'].median())

In [41]:
# imputing embarked
# Replace missing ports with 'S'. The result is assigned back to the column
# instead of calling fillna(inplace=True) on a column selection: the in-place
# form works on an intermediate object and is not guaranteed to update
# train_data (it warns in recent pandas and is a no-op under Copy-on-Write).
train_data['Embarked'] = train_data['Embarked'].fillna('S')

In [42]:
# Dropping columns that are not used for prediction
unused_columns = ['Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_columns)

Now there is categorical data in the Embarked and Sex columns, which has to be converted to numerical values.

In [43]:
# Converting the categorical columns to integer codes
from sklearn.preprocessing import LabelEncoder

# One independent encoder per column; each is fitted on the training values only.
sex_encoder = LabelEncoder()
sex = sex_encoder.fit_transform(train_data['Sex'])

embarked_encoder = LabelEncoder()
embark = embarked_encoder.fit_transform(train_data['Embarked'])

In [44]:
# Dropping the original string-valued columns
train_data = train_data.drop(columns=['Sex', 'Embarked'])

In [45]:
# Re-attach Sex and Embarked, now holding the integer codes computed earlier
train_data = train_data.assign(Sex=sex, Embarked=embark)

In [46]:
# Checking the processed training data (dtypes and non-null counts after
# imputation, column drops and label encoding)
train_data.info()

In [47]:
# Peek at the first five rows of the test frame
test_data.head(n=5)

In [48]:
# Attach a Survived column to the test frame, taken (index-aligned) from validation_data.
# NOTE(review): validation_data is read from gender_submission.csv, which looks
# like a sample submission rather than ground truth - confirm before trusting
# any accuracy computed against it.
test_data = test_data.assign(Survived=validation_data['Survived'])

In [49]:
# Dropping the same columns that were removed from the training set
test_data = test_data.drop(columns=['Cabin', 'Name', 'Ticket'])

In [50]:
# Count missing values per test column (largest first) and draw them as a bar chart
total = test_data.isna().sum().sort_values(ascending=False)
total.plot.bar(
    figsize=(12, 8),
    fontsize=20,
    xlabel="Columns",
    ylabel="Count",
    title="Total Missing Values",
)

In [51]:
# handling missing values
# Nearest-neighbour interpolation (by row position) only makes sense for
# numeric data, so it is applied to the numeric columns only. Passing the whole
# frame would also include the string columns (Sex, Embarked), which recent
# pandas versions deprecate for interpolate().
# Gaps before the first or after the last known value are not filled here.
numeric_cols = test_data.select_dtypes(include='number').columns
test_data[numeric_cols] = test_data[numeric_cols].interpolate(method='nearest', axis=0)

In [52]:
# Converting the test set's categorical columns to integer codes.
# Fresh encoders are fitted on the test values themselves.
# NOTE(review): the codes only match the training codes if both sets contain
# the same categories - confirm.
sex = LabelEncoder().fit(test_data['Sex']).transform(test_data['Sex'])
embark = LabelEncoder().fit(test_data['Embarked']).transform(test_data['Embarked'])

In [53]:
# Dropping the original string-valued columns
test_data = test_data.drop(columns=['Sex', 'Embarked'])

In [54]:
# Re-attach Sex and Embarked, now holding the integer codes computed earlier
test_data = test_data.assign(Sex=sex, Embarked=embark)

In [55]:
# Checking the processed test data (dtypes and non-null counts); any column
# whose non-null count is below the row count still needs imputing
test_data.info()

In [56]:
# Any Age values still missing in test_data are replaced by the column median
age_median = test_data['Age'].median()
test_data['Age'] = test_data['Age'].fillna(age_median)

In [57]:
# Checking again after the median fill of Age
test_data.info()

**Preparing for model**

In [58]:
# Separating train_data into features (trainX) and label (trainY).
# Work on a copy so train_data itself keeps its Survived column.
trainX = train_data.copy()
trainY = trainX.pop('Survived')

**Preparing for testing**

In [59]:
# Separating test_data into features (testX) and label (testY).
# Work on a copy so test_data itself keeps its Survived column.
testX = test_data.copy()
testY = testX.pop('Survived')

**Scaling the data so that all features have the same range**

In [60]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training set only, then apply that
# same mapping to the test set. Re-fitting the scaler on the test set would
# scale the two sets differently, so the model would see test features on a
# different scale from the one it was trained on.
# NOTE(review): trainX/testX still include PassengerId, which is an identifier
# rather than a real feature; test ids presumably fall outside the training
# range and will scale outside [0, 1] - consider dropping it upstream.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(trainX)
test_X = scaler.transform(testX)

**Using Logistic regression for this problem**

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with default settings, trained on the scaled
# training features and scored on the scaled test features.
logr = LogisticRegression()
logr.fit(train_X, trainY)

y_pred = logr.predict(test_X)
accuracy_score(testY, y_pred)

In [62]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LogisticRegression.
# warm_start takes real booleans: the strings 'False' and 'True' are both
# truthy, and scikit-learn versions that validate parameter types reject them,
# which makes every candidate fit fail.
# Not every penalty/solver pair is supported; unsupported candidates fail to
# fit and are skipped by the search (with GridSearchCV's default error
# handling they get a NaN score and a warning).
# NOTE(review): newer scikit-learn releases spell the unpenalised option as
# None rather than 'none' - confirm against the installed version.
parameters = {'penalty': ('l1', 'l2', 'elasticnet', 'none'),
              'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
              'warm_start': (False, True)}
lr = LogisticRegression()

# Cross-validated search on the training set; predict() uses the best
# estimator refitted on all of the training data.
clf = GridSearchCV(lr, parameters)
clf.fit(train_X, trainY)
y_pred = clf.predict(test_X)
accuracy_score(testY, y_pred)

In [63]:
# Results
# Build the submission file: one row per test passenger with the predicted
# Survived value (y_pred holds the most recent predictions).
# The predictions are wrapped in a new Series aligned to test_data's index
# instead of being written through `Series.values[:]`, which depends on
# `.values` being a writable view of the Series and fails when pandas returns
# a read-only array (Copy-on-Write).
PassengerId = test_data['PassengerId']
results = pd.Series(y_pred, index=test_data.index, name='Survived')

df = PassengerId.to_frame()
df['Survived'] = results
df.to_csv('submission.csv', index=False)