In [5]:
!pip install patool
import pandas as pd
import numpy as np
import patoolib
import seaborn as sns
import matplotlib.pyplot as plt



In [6]:
# Extracts the contents of 'titanik.rar' into the 'titanik' folder
patoolib.extract_archive("titanik.rar", outdir="titanik")

PatoolError: file `titanik.rar' was not found

In [None]:
# Walk through the 'titanik' directory and print full paths of all files inside it
import os
for dirnames, _, filenames in os.walk("titanik"):
    for filename in filenames:
        print(os.path.join(dirnames, filename))

In [None]:
# Load the training dataset from the extracted folder
train_df = pd.read_csv("titanik/titanik/train.csv")

# Load the test dataset from the extracted folder
test_df = pd.read_csv("titanik/titanik/test.csv")


In [None]:
# Display a concise summary of the DataFrame, including column names, non-null counts, and data types
train_df.info()


In [None]:
# Combine train and test DataFrames into one, adding keys to distinguish between them
combined = pd.concat([train_df, test_df], axis=0, keys=["train", "test"])
# Convert the 'Sex' column into dummy/indicator variables and drop the first category to avoid multicollinearity
combined = pd.get_dummies(combined, columns=["Sex"], drop_first=True)

In [None]:
# Fill missing values in numeric columns with the column-wise mean
combined.fillna(combined.mean(numeric_only=True), inplace=True)

In [None]:
features = ["Pclass", "Sex_male", "SibSp", "Parch"]

In [None]:
# Extract the 'train' portion of the combined DataFrame using the key
train_data = combined.xs("train")

# Extract the 'test' portion of the combined DataFrame using the key
test_data = combined.xs("test")


In [None]:
# Check if the 'Survived' column exists in the train_data DataFrame
if "Survived" in train_data.columns:
    # Select the 'Survived' values for female passengers (Sex_male == 0
    women = train_data.loc[train_data["Sex_male"] == 0, "Survived"]
    # Select the 'Survived' values for male passengers (Sex_male == 1)
    men = train_data.loc[train_data["Sex_male"] == 1, "Survived"]

    # Calculate and print the survival rate for women and men
    print("% of women who survived:", sum(women)/len(women))
    print("% of men who survived:", sum(men)/len(men))


In [None]:
# Import the RandomForestClassifier from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Select feature and target columns for training
x_train = train_data[features]
y_train = train_data["Survived"]

# Create a Random Forest model with 100 trees and maximum depth of 5
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(x_train, y_train)


In [None]:
# Prepare test features, predict with model, and print first 10 predictions
x_test = test_data[features]
predictions = model.predict(x_test)
print("Predictions on test set (first 10):", predictions[:10])