In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
def wrangle(filepath):
    """Read a passenger CSV and return a cleaned, model-ready DataFrame.

    Steps performed:
      1. Read the CSV with ``pd.read_csv``.
      2. Drop the ``Cabin``, ``Ticket``, ``Name`` and ``Embarked`` columns.
      3. Encode ``Sex`` as an integer: 1 when the value starts with "m",
         0 otherwise (a missing value also becomes 0).
      4. Fill missing ``Age`` and ``Fare`` values with the previous row's
         value, falling back to the next row's value for leading gaps.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. ``Age``/``Fare`` can still contain NaN only if
        the whole column is missing in the input.

    Raises
    ------
    KeyError
        If one of the dropped columns, ``Sex``, ``Age`` or ``Fare`` is
        absent from the file.
    """
    df = pd.read_csv(filepath)

    # Removing unnecessary columns (reassign instead of mutating in place)
    df = df.drop(columns=["Cabin", "Ticket", "Name", "Embarked"])

    # Change Sex from male to 1 and female to 0. The comparison already
    # yields booleans, so casting to int is all that is needed.
    df["Sex"] = (df["Sex"].str[0] == "m").astype(int)

    # Forward fill the columns that have NaN values; a forward fill cannot
    # reach a gap in the very first row(s), so back fill whatever remains.
    for column in ("Age", "Fare"):
        df[column] = df[column].ffill().bfill()

    return df


In [3]:
# Load and clean the labelled training data.
train = wrangle("/kaggle/input/titanic/train.csv")
# DataFrame.info() prints its report itself and returns None, so calling it
# directly avoids the stray "None" line that print(train.info()) emits.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB
None


In [4]:
# Run the competition test file through the same cleaning function.
test = wrangle("/kaggle/input/titanic/test.csv")
test.info()
# Keep the passenger ids in their own variable for the submission frame.
test_ids = test["PassengerId"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 23.0 KB


In [5]:
# Separate the label column from the feature matrix.
target = "Survived"
y = train[target]
# NOTE(review): only the target is dropped, so PassengerId stays in X and is
# fed to the model as a feature — confirm this is intended.
X = train.drop(columns=[target])
print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (891, 7)
y shape:  (891,)


In [6]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Majority-class baseline: the share of the most frequent label.
# Computed over all of y (not just the training split).
acc_baseline = (
    y
    .value_counts(normalize=True)
    .max()
)
print("Baseline Accuracy", round(acc_baseline, 4))

Baseline Accuracy 0.6162


In [8]:
# Logistic regression with a raised iteration cap (max_iter=1000).
model = LogisticRegression(max_iter=1000)
# Fit on the training split; the fitted estimator is the cell's displayed value.
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [9]:
# Mean accuracy on the rows the model was fitted on.
acc_train = model.score(X=X_train, y=y_train)
acc_train

0.797752808988764

In [10]:
# Mean accuracy on the held-out validation rows.
acc_val = model.score(X=X_val, y=y_val)
acc_val

0.7932960893854749

In [11]:
# Predict on the cleaned test frame and pair each prediction with the
# passenger id saved earlier.
predictions = model.predict(X=test)
df = pd.DataFrame(
    {
        "PassengerId": test_ids.values,
        "Survived": predictions,
    }
)
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [12]:
# Write the submission file to the working directory, without the row index.
df.to_csv(path_or_buf="predictions.csv", index=False)