In [2]:
import pandas as pd    #load the data, statistics
import seaborn as sns   #visualize the data

from sklearn.preprocessing import LabelEncoder #ordinal
from sklearn.preprocessing import OneHotEncoder #nominal, categorical
from sklearn.compose import ColumnTransformer


# --- Feature configuration -------------------------------------------------
# Raw columns that are not used as model inputs; dropped from train and test alike.
UNUSED_COLUMNS = ['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
# Model inputs: sex, passenger class, and the two age indicators.
FEATURE_COLUMNS = ['Sex', 'Pclass', '<7 yrs', '>60 yrs']
# Inputs that are one-hot encoded; the age indicators are passed through as-is.
CATEGORICAL_COLUMNS = ['Sex', 'Pclass']


def add_age_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with two 0/1 age-indicator columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``'Age'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``'<7 yrs'`` (1 when Age is strictly below 7) and
        ``'>60 yrs'`` (1 when Age is strictly above 60). ``df`` itself is
        not modified.

    Notes
    -----
    Rows whose Age is missing get 0 in both columns, because comparisons
    against NaN evaluate to False.
    """
    age = df['Age']
    return df.assign(**{
        '<7 yrs': (age < 7).astype(int),
        '>60 yrs': (age > 60).astype(int),
    })


# --- Training set ----------------------------------------------------------
# `train_df` keeps every raw column plus the two age indicators.
train_df = add_age_flags(pd.read_csv('input/train.csv'))

# Remaining columns: PassengerId, Survived, Pclass, Sex and the age indicators.
selected_columns = train_df.drop(columns=UNUSED_COLUMNS)
print(selected_columns)


# --- Preprocessing ---------------------------------------------------------
# One-hot encode the categorical inputs; every other column handed to the
# transformer (the age indicators) is passed through unchanged.
onehot_encoder = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', onehot_encoder, CATEGORICAL_COLUMNS)
    ], remainder='passthrough')


# Define the features (X) and target (y)
X = selected_columns[FEATURE_COLUMNS]
y = selected_columns['Survived']

# Optional manual check of the preprocessor (disabled; uncomment to run).
# NOTE(review): fit_transform may return a scipy sparse matrix depending on
# ColumnTransformer's sparse_threshold; if so, call .toarray() on it before
# building the DataFrame -- confirm before re-enabling.
#
# X_transformed = preprocessor.fit_transform(X)
#
# # Get the feature names for the transformed data
# onehot_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(['Sex', 'Pclass'])
# all_feature_names = list(onehot_feature_names) + ['<7 yrs', '>60 yrs']
#
# # Convert the transformed data back into a DataFrame
# X_transformed_df = pd.DataFrame(X_transformed, columns=all_feature_names)
# print(X_transformed_df)


# --- Test set --------------------------------------------------------------
# Same feature engineering as the training set, via the shared helper, so the
# two frames cannot drift apart. PassengerId is kept for the submission files.
test_df = add_age_flags(pd.read_csv('input/test.csv')).drop(columns=UNUSED_COLUMNS)
print(test_df)

X_test = test_df[FEATURE_COLUMNS]


     PassengerId  Survived  Pclass     Sex  <7 yrs  >60 yrs
0              1         0       3    male       0        0
1              2         1       1  female       0        0
2              3         1       3  female       0        0
3              4         1       1  female       0        0
4              5         0       3    male       0        0
..           ...       ...     ...     ...     ...      ...
886          887         0       2    male       0        0
887          888         1       1  female       0        0
888          889         0       3  female       0        0
889          890         1       1    male       0        0
890          891         0       3    male       0        0

[891 rows x 6 columns]
     PassengerId  Pclass     Sex  <7 yrs  >60 yrs
0            892       3    male       0        0
1            893       3  female       0        0
2            894       2    male       0        1
3            895       3    male       0        0
4     

In [45]:
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Logistic regression: encode the inputs with the shared `preprocessor`, then
# fit a default LogisticRegression on the training features.
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression())])

pipeline_lr.fit(X, y)

# Predicted survival (0/1) for every row of the test set.
predictions = pipeline_lr.predict(X_test)
print(predictions)

# Build the submission frame: PassengerId alongside the predicted label.
# `.assign` returns a new frame, so `test_df` is left untouched.
lr_submission = test_df[['PassengerId']].assign(Survived=predictions)

print(lr_submission)

# Save the predictions to a CSV file. `to_csv` does not create missing
# directories, so make sure the output folder exists first.
lr_submission_path = Path('output/lr_submission1.csv')
lr_submission_path.parent.mkdir(parents=True, exist_ok=True)
lr_submission.to_csv(lr_submission_path, index=False)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896

In [4]:
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Decision tree

# Fixed seed for the tree. scikit-learn permutes the candidate features at
# every split, so without a seed ties between equally good splits are broken
# at random and the predictions can differ from run to run.
DT_RANDOM_STATE = 42

# Fresh, unfitted preprocessor for this pipeline: one-hot encode Sex and
# Pclass, pass the remaining columns through unchanged.
# Note: this rebinds the notebook-level `preprocessor` name.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Sex', 'Pclass'])
    ], remainder='passthrough')

# Create a pipeline that first preprocesses the data and then fits a decision tree model
pipeline_dt = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', DecisionTreeClassifier(random_state=DT_RANDOM_STATE))])

pipeline_dt.fit(X, y)

# Predicted survival (0/1) for every row of the test set.
predictions = pipeline_dt.predict(X_test)
print(predictions)

# Build the submission frame: PassengerId alongside the predicted label.
# `.assign` returns a new frame, so `test_df` is left untouched.
dt_submission = test_df[['PassengerId']].assign(Survived=predictions)

print(dt_submission)

# Save the predictions to a CSV file. `to_csv` does not create missing
# directories, so make sure the output folder exists first.
dt_submission_path = Path('output/dt_submission1.csv')
dt_submission_path.parent.mkdir(parents=True, exist_ok=True)
dt_submission.to_csv(dt_submission_path, index=False)


[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 0 1 0 1 0 0 1 0 0 0]
     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896