In [171]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [503]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [504]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [505]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [506]:
def data_preprocessing(dataset, name):
    dataset = dataset.drop(["Name", "Ticket", "Cabin"], axis="columns")
    dataset = dataset
    dataset.columns = dataset.columns.str.lower()
    dataset["age"] = dataset["age"].fillna(dataset["age"].mean())
    if name == "df_train":
        dataset = dataset.dropna()
        dataset = dataset.iloc[:, [0,2,3,4,5,6,7,8,1]]
    else:
        dataset["fare"] = dataset["fare"].fillna(dataset["fare"].mean())
    return dataset

In [507]:
#df_train1 mit sibsp und parch als getrennte columns
#df_train2 mit alone column, wo sibsp und parch zusammengefasst wurden

In [508]:
df_train1 = data_preprocessing(df_train, "df_train")
df_train2 = data_preprocessing(df_train, "df_train")
df_test1 = data_preprocessing(df_test, "df_test")
df_test1

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,embarked
0,892,3,male,34.50000,0,0,7.8292,Q
1,893,3,female,47.00000,1,0,7.0000,S
2,894,2,male,62.00000,0,0,9.6875,Q
3,895,3,male,27.00000,0,0,8.6625,S
4,896,3,female,22.00000,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
413,1305,3,male,30.27259,0,0,8.0500,S
414,1306,1,female,39.00000,0,0,108.9000,C
415,1307,3,male,38.50000,0,0,7.2500,S
416,1308,3,male,30.27259,0,0,8.0500,S


In [509]:
df_train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  889 non-null    int64  
 1   pclass       889 non-null    int64  
 2   sex          889 non-null    object 
 3   age          889 non-null    float64
 4   sibsp        889 non-null    int64  
 5   parch        889 non-null    int64  
 6   fare         889 non-null    float64
 7   embarked     889 non-null    object 
 8   survived     889 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB


In [510]:
df_train2["alone"] = np.where((df_train2["sibsp"] >= 1) | (df_train2["parch"] >= 1),0,1)
df_train2 = df_train2.drop(["sibsp", "parch"], axis="columns")
df_train2 = df_train2.iloc[:, [0,1,2,3,4,5,7,6]]
df_test2 = df_test1.copy()
df_test2
df_test2["alone"] = np.where((df_test2["sibsp"] >= 1) | (df_test2["parch"] >= 1),0,1)
df_test2 = df_test2.drop(["sibsp", "parch"], axis="columns")
df_test2

Unnamed: 0,passengerid,pclass,sex,age,fare,embarked,alone
0,892,3,male,34.50000,7.8292,Q,1
1,893,3,female,47.00000,7.0000,S,0
2,894,2,male,62.00000,9.6875,Q,1
3,895,3,male,27.00000,8.6625,S,1
4,896,3,female,22.00000,12.2875,S,0
...,...,...,...,...,...,...,...
413,1305,3,male,30.27259,8.0500,S,1
414,1306,1,female,39.00000,108.9000,C,1
415,1307,3,male,38.50000,7.2500,S,1
416,1308,3,male,30.27259,8.0500,S,1


In [511]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [512]:
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot", OneHotEncoder(), ["sex", "embarked"]),
        ("scaler", StandardScaler(), ["age", "fare"])
    ],
    remainder="passthrough")

In [513]:
from sklearn.model_selection import train_test_split

In [514]:
X1 = df_train1.iloc[:, :-1]
y1 = df_train1.iloc[:, -1]

X2 = df_train2.iloc[:, :-1]
y2 = df_train2.iloc[:, -1]

In [515]:
X_train1, x_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)
X_train2, x_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [516]:
X_train1

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,embarked
708,709,1,female,22.000000,0,0,151.5500,S
240,241,3,female,29.699118,1,0,14.4542,C
382,383,3,male,32.000000,0,0,7.9250,S
792,793,3,female,29.699118,8,2,69.5500,S
683,684,3,male,14.000000,5,2,46.9000,S
...,...,...,...,...,...,...,...,...
107,108,3,male,29.699118,0,0,7.7750,S
271,272,3,male,25.000000,0,0,0.0000,S
862,863,1,female,48.000000,0,0,25.9292,S
436,437,3,female,21.000000,2,2,34.3750,S


In [517]:
X_trans1= preprocessor.fit_transform(X_train1)
X_trans2= preprocessor.fit_transform(X_train2)
x_test_trans1= preprocessor.fit_transform(x_test1)
x_test_trans2= preprocessor.fit_transform(x_test2)
df_test1 = preprocessor.fit_transform(df_test1)
df_test2 = preprocessor.fit_transform(df_test2)

In [518]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [519]:
pipe = Pipeline([  # Platzhalter für den Preprocessor
    ('model', 'passthrough')  # Platzhalter für das Modell
])

In [520]:
param_grid = [
    {"model": [RandomForestClassifier()],
     "model__n_estimators": [10, 50, 100],
     "model__max_depth": [None, 10, 20]},
    
    {"model": [KNeighborsClassifier()],
     "model__n_neighbors": [3, 5, 8]},
    
    {"model": [SVC()],
     "model__C": [1, 5, 10],
     "model__kernel": ["linear", "poly"],
     "model__degree": [3, 5, 8]},
    
    {"model": [LogisticRegression(solver='liblinear')],
     "model__penalty": ["l1", "l2"]}
]

# Erstellen des GridSearchCV-Objekts
grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="accuracy")

In [521]:
grid_search.fit(X_trans1, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('model', 'passthrough')]),
             n_jobs=-1,
             param_grid=[{'model': [RandomForestClassifier(max_depth=10,
                                                           n_estimators=50)],
                          'model__max_depth': [None, 10, 20],
                          'model__n_estimators': [10, 50, 100]},
                         {'model': [KNeighborsClassifier()],
                          'model__n_neighbors': [3, 5, 8]},
                         {'model': [SVC()], 'model__C': [1, 5, 10],
                          'model__degree': [3, 5, 8],
                          'model__kernel': ['linear', 'poly']},
                         {'model': [LogisticRegression(solver='liblinear')],
                          'model__penalty': ['l1', 'l2']}],
             scoring='accuracy')

In [254]:
print("Beste Parameter:", grid_search.best_params_)
print("Bester Score:", grid_search.best_score_)

Beste Parameter: {'model': RandomForestClassifier(max_depth=10), 'model__max_depth': 10, 'model__n_estimators': 100}
Bester Score: 0.841091303063134


In [522]:
best_estimator = grid_search.best_estimator_
predictions = best_estimator.predict(x_test_trans1)

In [523]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7696629213483146

In [524]:
solution = best_estimator.predict(df_test1)

In [525]:
solution = pd.DataFrame({"PassengerId": df_test["PassengerId"], "Survived": solution})

In [526]:
solution

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [527]:
solution = solution.set_index("PassengerId")
solution

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [533]:
solution.to_csv("NEW_solution.csv")

In [535]:
grid_search.fit(X_trans2, y_train2)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('model', 'passthrough')]),
             n_jobs=-1,
             param_grid=[{'model': [RandomForestClassifier(max_depth=10,
                                                           n_estimators=50)],
                          'model__max_depth': [None, 10, 20],
                          'model__n_estimators': [10, 50, 100]},
                         {'model': [KNeighborsClassifier()],
                          'model__n_neighbors': [3, 5, 8]},
                         {'model': [SVC()], 'model__C': [1, 5, 10],
                          'model__degree': [3, 5, 8],
                          'model__kernel': ['linear', 'poly']},
                         {'model': [LogisticRegression(solver='liblinear')],
                          'model__penalty': ['l1', 'l2']}],
             scoring='accuracy')

In [536]:
print("Beste Parameter:", grid_search.best_params_)
print("Bester Score:", grid_search.best_score_)

Beste Parameter: {'model': RandomForestClassifier(max_depth=10, n_estimators=50), 'model__max_depth': 10, 'model__n_estimators': 50}
Bester Score: 0.8242194425293017


In [538]:
best_estimator2 = grid_search.best_estimator_
predictions2 = best_estimator2.predict(x_test_trans2)

In [539]:
accuracy_score(y_test2, predictions2)

0.7865168539325843

In [550]:
solution2 = best_estimator2.predict(df_test2)

In [551]:
solution2 = pd.DataFrame({"PassengerId": df_test["PassengerId"], "Survived": solution2})

In [552]:
solution2 = solution2.set_index("PassengerId")
solution2.to_csv("NEW_solution2.csv")