In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv("data/train.csv")
data.sample(random_state=20250325, frac=1)
data, data_test = data[:841], data[841:]

In [3]:
data.shape, data_test.shape

((841, 12), (50, 12))

In [4]:
X, y = data.drop("Survived", axis=1), data["Survived"]

In [5]:
X.shape, y.shape

((841, 11), (841,))

## Pierwsze podejście: bez nazwisk

### Pipeline

In [6]:
def preprocess(data):
    data = data.copy()
    data["Cabin_letter"] = data["Cabin"].str.slice(0, 1)
    data.loc[data["Cabin_letter"].isna(), "Cabin_letter"] = ""

    cat_type = pd.CategoricalDtype(sorted(data["Cabin_letter"].unique()), ordered=True)
    data["Cabin_letter"] = data["Cabin_letter"].astype(cat_type)

    data["Name"] = data["Name"].str.split(",").str[0]

    data = data.drop(["PassengerId", "Ticket", "Cabin", "Name"], axis=1)

    data["Sex"] = (data["Sex"] == "male")

    return data

In [7]:
X = preprocess(X)

In [8]:
X.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_letter
0,3,True,22.0,1,0,7.25,S,
1,1,False,38.0,1,0,71.2833,C,C


In [9]:
cat_columns = ["Pclass", "SibSp", "Parch", "Embarked", "Cabin_letter"]
num_columns = ["Age", "Fare"]

In [10]:
pipeline = ColumnTransformer([
        ("cats", OneHotEncoder(handle_unknown="ignore"), cat_columns),
        ("nums", make_pipeline(
                SimpleImputer(strategy="mean"),
                StandardScaler()
            ),
         num_columns
        )
    ]
)

In [11]:
X.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,841.0,671.0,841.0,841.0,841.0
mean,2.309156,29.67611,0.516052,0.374554,32.629567
std,0.836389,14.564454,1.060118,0.795086,50.652015
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.25,0.0,0.0,7.925
50%,3.0,28.0,0.0,0.0,14.4583
75%,3.0,38.0,1.0,0.0,31.275
max,3.0,80.0,8.0,6.0,512.3292


In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Pclass        841 non-null    int64   
 1   Sex           841 non-null    bool    
 2   Age           671 non-null    float64 
 3   SibSp         841 non-null    int64   
 4   Parch         841 non-null    int64   
 5   Fare          841 non-null    float64 
 6   Embarked      839 non-null    object  
 7   Cabin_letter  841 non-null    category
dtypes: bool(1), category(1), float64(2), int64(3), object(1)
memory usage: 41.6+ KB


In [13]:
X_p = pipeline.fit_transform(X)

### Ewaluacja modeli

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
knn = KNeighborsClassifier()

In [16]:
knn.fit(X_p, y)

In [17]:
X_test, y_test = data_test.drop("Survived", axis=1), data_test["Survived"]

In [18]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
841,842,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5,,S
842,843,1,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0,,C
843,844,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C
844,845,3,"Culumovic, Mr. Jeso",male,17.0,0,0,315090,8.6625,,S
845,846,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S


In [19]:
X_test = preprocess(X_test)

In [20]:
X_test = pipeline.transform(X_test)

In [21]:
y_pred = knn.predict(X_test)

In [22]:
(y_pred == y_test).mean()

np.float64(0.76)

In [23]:
for n in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_p, y)
    print(f"Nejghbours: {n} --> Accuracy: {(knn.predict(X_test) == y_test).mean()*100}%")

Nejghbours: 1 --> Accuracy: 72.0%
Nejghbours: 2 --> Accuracy: 66.0%
Nejghbours: 3 --> Accuracy: 72.0%
Nejghbours: 4 --> Accuracy: 80.0%
Nejghbours: 5 --> Accuracy: 76.0%
Nejghbours: 6 --> Accuracy: 74.0%
Nejghbours: 7 --> Accuracy: 80.0%
Nejghbours: 8 --> Accuracy: 74.0%
Nejghbours: 9 --> Accuracy: 76.0%


In [24]:
max_score = 0
for n in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=n, weights="distance")
    knn.fit(X_p, y)
    score = (knn.predict(X_test) == y_test).mean()
    if score > max_score: max_score = score
    print(f"Nejghbours: {n} --> Accuracy: {score*100}%")
print("Max accuracy:", max_score)

Nejghbours: 1 --> Accuracy: 72.0%
Nejghbours: 2 --> Accuracy: 74.0%
Nejghbours: 3 --> Accuracy: 72.0%
Nejghbours: 4 --> Accuracy: 78.0%
Nejghbours: 5 --> Accuracy: 76.0%
Nejghbours: 6 --> Accuracy: 78.0%
Nejghbours: 7 --> Accuracy: 74.0%
Nejghbours: 8 --> Accuracy: 72.0%
Nejghbours: 9 --> Accuracy: 72.0%
Max accuracy: 0.78


In [25]:
model = LogisticRegression()

In [26]:
model.fit(X_p, y)
(model.predict(X_test) == y_test).mean()

np.float64(0.76)

## Drugie podejście: uwzględniamy nazwiska
Sprobujmy poszukac zaleznosci miedzy tymi samymi czlonkami rodziny, czyli zostawmy nazwisko -> przez oneHotEncoding
### Pipeline

In [27]:
def preprocess(data):
    data = data.copy()
    data["Cabin_letter"] = data["Cabin"].str.slice(0, 1)
    data.loc[data["Cabin_letter"].isna(), "Cabin_letter"] = ""

    cat_type = pd.CategoricalDtype(sorted(data["Cabin_letter"].unique()), ordered=True)
    data["Cabin_letter"] = data["Cabin_letter"].astype(cat_type)

    data["Name"] = data["Name"].str.split(",").str[0]

    data = data.drop(["PassengerId", "Ticket", "Cabin"], axis=1)

    data["Sex"] = (data["Sex"] == "male")

    return data

cat_columns = ["Pclass", "SibSp", "Parch", "Embarked", "Cabin_letter", "Name"]
num_columns = ["Age", "Fare"]

pipeline = ColumnTransformer(
    [
        ("cats", OneHotEncoder(handle_unknown="ignore"), cat_columns),
        ("nums", 
             make_pipeline(
                     SimpleImputer(strategy="mean"),
                     StandardScaler()
                    ),
            num_columns)
    ]
)

In [28]:
X_train, y = data.drop("Survived", axis=1), data["Survived"]
X_train = preprocess(X_train)

In [29]:
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_letter
0,3,Braund,True,22.0,1,0,7.25,S,
1,1,Cumings,False,38.0,1,0,71.2833,C,C
2,3,Heikkinen,False,26.0,0,0,7.925,S,
3,1,Futrelle,False,35.0,1,0,53.1,S,C
4,3,Allen,True,35.0,0,0,8.05,S,


In [30]:
X_train = pipeline.fit_transform(X_train)

In [31]:
X_test, y_test = data_test.drop("Survived", axis=1), data_test["Survived"]
X_test = preprocess(X_test)
X_test = pipeline.transform(X_test)

### Testowanie modeli

In [32]:
for n in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y)
    print(f"Nejghbours: {n} --> Accuracy: {(knn.predict(X_test) == y_test).mean():.2f}%")

Nejghbours: 1 --> Accuracy: 0.74%
Nejghbours: 2 --> Accuracy: 0.70%
Nejghbours: 3 --> Accuracy: 0.74%
Nejghbours: 4 --> Accuracy: 0.80%
Nejghbours: 5 --> Accuracy: 0.76%
Nejghbours: 6 --> Accuracy: 0.74%
Nejghbours: 7 --> Accuracy: 0.80%
Nejghbours: 8 --> Accuracy: 0.74%
Nejghbours: 9 --> Accuracy: 0.76%
Nejghbours: 10 --> Accuracy: 0.74%
Nejghbours: 11 --> Accuracy: 0.74%
Nejghbours: 12 --> Accuracy: 0.76%
Nejghbours: 13 --> Accuracy: 0.74%
Nejghbours: 14 --> Accuracy: 0.74%
Nejghbours: 15 --> Accuracy: 0.74%
Nejghbours: 16 --> Accuracy: 0.74%
Nejghbours: 17 --> Accuracy: 0.74%
Nejghbours: 18 --> Accuracy: 0.74%
Nejghbours: 19 --> Accuracy: 0.74%


In [33]:
for n in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=n, weights="distance")
    knn.fit(X_train, y)
    print(f"Nejghbours: {n} --> Accuracy: {(knn.predict(X_test) == y_test).mean():.2f}%")

Nejghbours: 1 --> Accuracy: 0.74%
Nejghbours: 2 --> Accuracy: 0.76%
Nejghbours: 3 --> Accuracy: 0.74%
Nejghbours: 4 --> Accuracy: 0.80%
Nejghbours: 5 --> Accuracy: 0.76%
Nejghbours: 6 --> Accuracy: 0.76%
Nejghbours: 7 --> Accuracy: 0.78%
Nejghbours: 8 --> Accuracy: 0.76%
Nejghbours: 9 --> Accuracy: 0.76%
Nejghbours: 10 --> Accuracy: 0.76%
Nejghbours: 11 --> Accuracy: 0.74%
Nejghbours: 12 --> Accuracy: 0.76%
Nejghbours: 13 --> Accuracy: 0.74%
Nejghbours: 14 --> Accuracy: 0.74%
Nejghbours: 15 --> Accuracy: 0.74%
Nejghbours: 16 --> Accuracy: 0.74%
Nejghbours: 17 --> Accuracy: 0.76%
Nejghbours: 18 --> Accuracy: 0.72%
Nejghbours: 19 --> Accuracy: 0.74%


In [34]:
model = LogisticRegression()
model.fit(X_train, y)
(model.predict(X_test) == y_test).mean()

np.float64(0.78)

In [35]:
for depth in range(1, 11):
    for leaf_nodes in [10, 20, 30, 40, 50, 100, 200, 400, 500, 700, 1000]:
        if leaf_nodes > 2**depth:
            break
        model = DecisionTreeClassifier(max_depth=depth, max_leaf_nodes=leaf_nodes)
        model.fit(X_train, y)
        print(f"{depth:02d}x{leaf_nodes} --> Accuracy:{(model.predict(X_test) == y_test).mean():.2f}")

04x10 --> Accuracy:0.80
05x10 --> Accuracy:0.80
05x20 --> Accuracy:0.76
05x30 --> Accuracy:0.76
06x10 --> Accuracy:0.80
06x20 --> Accuracy:0.78
06x30 --> Accuracy:0.74
06x40 --> Accuracy:0.74
06x50 --> Accuracy:0.74
07x10 --> Accuracy:0.80
07x20 --> Accuracy:0.78
07x30 --> Accuracy:0.78
07x40 --> Accuracy:0.78
07x50 --> Accuracy:0.76
07x100 --> Accuracy:0.76
08x10 --> Accuracy:0.80
08x20 --> Accuracy:0.78
08x30 --> Accuracy:0.80
08x40 --> Accuracy:0.78
08x50 --> Accuracy:0.78
08x100 --> Accuracy:0.80
08x200 --> Accuracy:0.78
09x10 --> Accuracy:0.80
09x20 --> Accuracy:0.78
09x30 --> Accuracy:0.80
09x40 --> Accuracy:0.78
09x50 --> Accuracy:0.78
09x100 --> Accuracy:0.76
09x200 --> Accuracy:0.78
09x400 --> Accuracy:0.74
09x500 --> Accuracy:0.74
10x10 --> Accuracy:0.80
10x20 --> Accuracy:0.78
10x30 --> Accuracy:0.80
10x40 --> Accuracy:0.78
10x50 --> Accuracy:0.78
10x100 --> Accuracy:0.76
10x200 --> Accuracy:0.76
10x400 --> Accuracy:0.74
10x500 --> Accuracy:0.74
10x700 --> Accuracy:0.78
10x1

In [36]:
for leaf_nodes in range(30, 51):
    model = DecisionTreeClassifier(max_depth=8, max_leaf_nodes=leaf_nodes)
    model.fit(X_train, y)
    print(f"{8}x{leaf_nodes} --> Accuracy:{(model.predict(X_test) == y_test).mean():.2f}%")

8x30 --> Accuracy:0.80%
8x31 --> Accuracy:0.78%
8x32 --> Accuracy:0.78%
8x33 --> Accuracy:0.78%
8x34 --> Accuracy:0.78%
8x35 --> Accuracy:0.78%
8x36 --> Accuracy:0.78%
8x37 --> Accuracy:0.78%
8x38 --> Accuracy:0.78%
8x39 --> Accuracy:0.78%
8x40 --> Accuracy:0.78%
8x41 --> Accuracy:0.78%
8x42 --> Accuracy:0.78%
8x43 --> Accuracy:0.78%
8x44 --> Accuracy:0.78%
8x45 --> Accuracy:0.78%
8x46 --> Accuracy:0.78%
8x47 --> Accuracy:0.78%
8x48 --> Accuracy:0.78%
8x49 --> Accuracy:0.78%
8x50 --> Accuracy:0.78%


In [37]:
model = RandomForestClassifier()
model.fit(X_train, y)
(model.predict(X_test) == y_test).mean()

np.float64(0.76)

## Wsnioski
Należałoby się mocniej przyjrzeć zależnościom między ważnymi cechami, część zapewno odrzucić a może utworzyć nowe?