In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
filename = "/kaggle/input/titanic/train.csv"
df = pd.read_csv(filename)
df.head()

In [3]:
df.isnull().sum()

In [4]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [5]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=df,palette='RdBu_r')

In [6]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=df,palette='RdBu_r')

In [7]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=df,palette='rainbow')

In [8]:
sns.displot(df['Age'].dropna(),kde=False,color='darkred',bins=30)

In [9]:
df['Age'].hist(bins=30,color='darkred',alpha=0.7)

In [10]:
sns.countplot(x='SibSp',data=df)

In [11]:
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=df,palette='winter')

In [12]:
def impute_age(cols):
    """Return the age from an (Age, Pclass) pair, filling a missing age by class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (age, passenger class). When called through
        ``df[['Age', 'Pclass']].apply(impute_age, axis=1)`` this is a row
        Series labelled ``'Age'`` / ``'Pclass'``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Read the two fields by position. A label-indexed Series must go through
    # .iloc: integer keys in Series[...] are label lookups, and the positional
    # fallback for non-integer indexes is deprecated in pandas.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per class.
    # NOTE(review): presumably read off the Age-by-Pclass box plot — confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [13]:
df['Age'] = df[['Age','Pclass']].apply(impute_age,axis=1)

In [14]:
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [15]:
df.drop('Cabin',axis=1,inplace=True)

In [16]:
df.head()

In [17]:
df.dropna(inplace=True)

In [18]:
df.info()

In [19]:
sex = pd.get_dummies(df['Sex'],drop_first=False)
embark = pd.get_dummies(df['Embarked'],drop_first=False)

In [20]:
df.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

In [21]:
df = pd.concat([df,sex,embark],axis=1)

In [22]:
df.head()

In [23]:
sex


In [24]:
embark

In [25]:
# Annotated heatmap of the pairwise column correlations of df.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, linewidths=0.2, cmap='RdYlGn')
# Enlarge the figure after plotting, then render it.
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()

In [26]:
from sklearn.model_selection import train_test_split

# The feature matrix must exclude the label column: leaving 'Survived' in X
# lets the model read the answer directly and makes the test score meaningless.
# 30% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Survived', axis=1),
    df['Survived'],
    test_size=0.30,
    random_state=101,
)

In [27]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter= 5000)
logmodel.fit(X_train,y_train)

In [28]:
predictions = logmodel.predict(X_test)

In [29]:
logmodel.score(X_test,y_test)

In [30]:
# Read the training CSV a second time into a separate frame, df2,
# and preview its first five rows.
filename = "/kaggle/input/titanic/train.csv"
df2 = pd.read_csv(filepath_or_buffer=filename)
df2.head(n=5)

In [31]:
# Title-case df2's column names (e.g. 'SibSp' becomes 'Sibsp').
# The mapping is built from df2's own columns so the rename does not depend
# on whatever columns the separately transformed `df` happens to have.
cols = {col: col.title() for col in df2.columns}
df2 = df2.rename(columns=cols)

In [32]:
# Fill missing embarkation values with 'S' in the existing 'Embarked' column;
# column labels are case-sensitive, so a lower-case key would add a new column
# and leave the original NaNs untouched.
df2['Embarked'] = df2['Embarked'].fillna('S')
# Fill missing ages with df2's own median age, so the result does not depend
# on the ages already imputed in the other frame.
df2['Age'] = df2['Age'].fillna(df2['Age'].median())

In [33]:
# 1 when Age is below 16, otherwise 0.
df2['Child'] = df2['Age'].lt(16).astype(int)
# Rows of df2 whose Survived value is 1.
surv = df2.loc[df2['Survived'].eq(1)]

In [34]:
# Set the seaborn style before creating the figure: style settings are applied
# when axes are created, so setting it afterwards does not restyle them.
sns.set_style('dark')
fig, ([ax1,ax2],[ax3,ax4]) = plt.subplots(2,2,figsize=(12,14))

# Nothing is drawn in the top-left panel; hide it instead of showing an empty frame.
ax1.axis('off')

# Every panel overlays two count plots: white bars for all rows of df2,
# coloured bars for the survivors in surv. All labels are set through the
# panel's own Axes so they cannot land on whichever axes pyplot treats as current.
sns.countplot(x='Sex', color='white', ax=ax2, data=df2, order=['male','female'])
sns.countplot(x='Sex', ax=ax2, data=surv,  order=['male','female'])
ax2.set_title('Gender')
ax2.set_xlabel('')
ax2.set_ylabel('')

sns.countplot(x='Pclass', color='white', ax=ax3, data=df2,  order=[1, 2, 3])
sns.countplot(x='Pclass', ax=ax3, data=surv,  order=[1, 2, 3])
ax3.set_title('Passenger Class')
ax3.set_ylabel('Number Survived')
ax3.set_xlabel('Class')

sns.countplot(x='Child', color='white', ax=ax4, data=df2, order=[1,0])
sns.countplot(x='Child', ax=ax4, data=surv, order=[1,0])
# Pin the existing tick positions, then relabel them in plotting order (1, then 0).
ax4.set_xticks(ax4.get_xticks())
ax4.set_xticklabels(['Child (<16 yrs)','Not Child'])
ax4.set_title('Children')
ax4.set_ylabel('')

In [35]:
df2.info()

In [36]:
sns.set_style('whitegrid')
fig, axis = plt.subplots(figsize=(8,8))
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=df2, ax=axis, ci=None)
axis.set_title('Survival Rate by Passenger Class and Gender')
loc, labels = plt.xticks()
plt.xticks(loc, ['First Class','Second Class', 'Third Class'])
axis.set_ylabel('Percent Survived')
axis.set_xlabel('')