In [1]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries.
# NOTE(review): the single entry below embeds a signed URL (X-Goog-Signature) with
# X-Goog-Date=20240513T124844Z and X-Goog-Expires=259200, so it is presumably only
# valid for a limited time after that date — confirm before relying on it.
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240513%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240513T124844Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D02f8d67f65378d155e29c8e916d15a6be01d600734404867997b8236c68276b6f6fca4e2c46c3d05fa411f14e75d0b09b1026222ccc7a492f6330e46f952eb01bd6f8b13f15517ceced5dc8d09146ff68b05eaef260de10171793530024394df3a5755d6bee2a2a92e297880b6c7c48c27a4a214ddf8caac05fc598b58e3a429e55490111e7288bb0f40f38c0c0df721e5d99b1eaa924bd3ecbeb54ba16e195e88dd2661581bbdd2d75f8baa4cc9942f93d483e35b739941b5f5a1c77500c8529e140c5c18ad81d02c28067d8b430bda373285e4f823897ba3f6f4c1e8272abcce2e0c5c14f17387592051713aca10d1a17c80fca31c059d41cc631a278bd8c7'

# Directory the downloaded datasets are extracted under (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining entries are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping.strip():
        # An empty mapping (or a stray comma) has nothing to download.
        continue
    # Split on the first ':' only, so the URL part is always kept in one piece.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    # The URL path (query string excluded) decides which archive format to expect.
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; when it is absent the progress bar
            # simply stays empty instead of raising on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            # Stream the body in CHUNK_SIZE pieces until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                # Cap at 50 so a body longer than advertised cannot overflow the bar.
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every byte is on disk before extraction: tarfile reopens
            # the file by name and would otherwise miss still-buffered data.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to a name other than `tarfile` so the module is not shadowed.
                # NOTE(review): extractall() trusts the member paths in the archive;
                # only use this with archives from a trusted source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading titanic, 34877 bytes compressed
Downloaded and uncompressed: titanic
Data source import complete.


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [4]:
# Read the training and test CSV files into DataFrames.
train_data, test_data = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

In [5]:
# (rows, columns) of the training frame.
train_data.shape

(891, 12)

In [6]:
# (rows, columns) of the test frame.
test_data.shape

(418, 11)

In [7]:
# Count the missing values in each column of the training frame.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Summary statistics of the Age column; describe() counts only the non-missing values.
train_data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [9]:
# Split each data set into per-sex subsets.
# .copy() makes every subset an explicit, independent frame: the Age column of
# these subsets is assigned to later, and writing into a bare boolean-mask
# slice triggers pandas' SettingWithCopyWarning.
df_male_train = train_data[train_data['Sex']=='male'].copy()
df_female_train = train_data[train_data['Sex']=='female'].copy()
df_male_test = test_data[test_data['Sex']=='male'].copy()
df_female_test = test_data[test_data['Sex']=='female'].copy()

In [12]:
# Mean age of each per-sex subset, truncated to a whole number by int().
avg_age_men_train, avg_age_women_train, avg_age_men_test, avg_age_women_test = (
    int(sex_frame['Age'].mean())
    for sex_frame in (df_male_train, df_female_train, df_male_test, df_female_test)
)

In [13]:
# Replace the missing ages in every per-sex subset with that subset's own
# (truncated) mean age.
for sex_frame, fill_age in (
    (df_male_train, avg_age_men_train),
    (df_female_train, avg_age_women_train),
    (df_male_test, avg_age_men_test),
    (df_female_test, avg_age_women_test),
):
    sex_frame.loc[:, 'Age'] = sex_frame['Age'].fillna(fill_age)

In [14]:
# Write the per-sex subsets (now with filled ages) back into the full frames;
# DataFrame.update modifies its target in place, matching rows by index.
for full_frame, sex_frame in (
    (train_data, df_male_train),
    (train_data, df_female_train),
    (test_data, df_male_test),
    (test_data, df_female_test),
):
    full_frame.update(sex_frame)

In [15]:
# Re-check the per-column missing-value counts of the training frame.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
# Per-column missing-value counts of the test frame.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [17]:
# Fill the missing test-set fares with the test-set mean fare, truncated to an integer.
fare_mean = int(test_data['Fare'].mean())
missing_fare = test_data['Fare'].isnull()
test_data.loc[missing_fare, 'Fare'] = fare_mean

In [18]:
# Remove the Cabin and PassengerId columns from the training frame, in place.
for column in ('Cabin', 'PassengerId'):
    train_data.drop(columns=column, inplace=True)

In [19]:
# Remove the Cabin and PassengerId columns from the test frame, in place.
for column in ('Cabin', 'PassengerId'):
    test_data.drop(columns=column, inplace=True)

In [21]:

# Fill the missing Embarked values with the most frequent value in the training data.
mode_embarked = train_data['Embarked'].mode()[0]
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves train_data unchanged once Copy-on-Write is enabled.
train_data['Embarked'] = train_data['Embarked'].fillna(mode_embarked)

In [22]:
# One-hot encode the Embarked column of both data sets.
train_embarked_encoded = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
# get_dummies only creates columns for the categories present in the data it is
# given, so the test encoding is aligned to the training columns: a category
# missing from the test set becomes an all-False column, and one unseen in
# training is dropped. Both frames then expose the same features in the same order.
test_embarked_encoded = pd.get_dummies(test_data['Embarked'], prefix='Embarked').reindex(
    columns=train_embarked_encoded.columns, fill_value=False
)

In [23]:
# Display the one-hot encoded Embarked columns built from the training frame.
train_embarked_encoded

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
886,False,False,True
887,False,False,True
888,False,False,True
889,True,False,False


In [24]:
# Append the one-hot Embarked columns to each frame, side by side.
train_data_encoded = pd.concat(
    [train_data, train_embarked_encoded],
    axis='columns',
)
test_data_encoded = pd.concat(
    [test_data, test_embarked_encoded],
    axis='columns',
)

In [25]:
# The original Embarked column is dropped in place from both frames, which
# already carry its one-hot encoded columns.
for encoded_frame in (train_data_encoded, test_data_encoded):
    encoded_frame.drop(columns='Embarked', inplace=True)

In [26]:
# Encode Sex numerically (male -> 0, female -> 1) in both frames.
# Series.map turns any value that is not a key of the mapping into NaN.
sex_codes = {'male': 0, 'female': 1}
for encoded_frame in (train_data_encoded, test_data_encoded):
    encoded_frame['Sex'] = encoded_frame['Sex'].map(sex_codes)

In [27]:
# Drop the Name and Ticket columns from both frames, in place (test frame first).
for encoded_frame in (test_data_encoded, train_data_encoded):
    for column in ('Name', 'Ticket'):
        encoded_frame.drop(columns=column, inplace=True)

In [28]:
# Display the encoded training frame (target column Survived still included).
train_data_encoded

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,False,False,True
1,1,1,1,38.0,1,0,71.2833,True,False,False
2,1,3,1,26.0,0,0,7.9250,False,False,True
3,1,1,1,35.0,1,0,53.1000,False,False,True
4,0,3,0,35.0,0,0,8.0500,False,False,True
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,False,False,True
887,1,1,1,19.0,0,0,30.0000,False,False,True
888,0,3,1,27.0,1,2,23.4500,False,False,True
889,1,1,0,26.0,0,0,30.0000,True,False,False


In [29]:
# Display the encoded test frame.
test_data_encoded

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,False,True,False
1,3,1,47.0,1,0,7.0000,False,False,True
2,2,0,62.0,0,0,9.6875,False,True,False
3,3,0,27.0,0,0,8.6625,False,False,True
4,3,1,22.0,1,1,12.2875,False,False,True
...,...,...,...,...,...,...,...,...,...
413,3,0,30.0,0,0,8.0500,False,False,True
414,1,1,39.0,0,0,108.9000,True,False,False
415,3,0,38.5,0,0,7.2500,False,False,True
416,3,0,30.0,0,0,8.0500,False,False,True


In [30]:
# Separate the target (Survived) from the feature columns.
y_train = train_data_encoded['Survived']
X_train = train_data_encoded.drop(columns='Survived')

In [31]:
# Fit a 100-tree random forest; random_state is fixed so repeated runs are reproducible.
forest_params = dict(n_estimators=100, random_state=42)
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [32]:
# The encoded test frame is used directly as the feature matrix
# (X_test is another name for the same object, not a copy).
X_test = test_data_encoded
y_pred = model.predict(test_data_encoded)

In [33]:
# Per-column missing-value counts of the training feature matrix.
X_train.isna().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [35]:
# Print the mean of the Age column of the test features.
mean_test_age = X_test['Age'].mean()
print(mean_test_age)

30.216507177033492
