In [1]:
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import sklearn
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv("inputs/train.csv")
test_data = pd.read_csv("inputs/test.csv")
train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:

## Preprocessing ###############################################################
################################################################################
features = ["Sex", "Age", 'Fare']
X_train = train_data[features]
Y_train = train_data['Survived']
X_train_cat = X_train.select_dtypes(include=[object])
X_train_num = X_train.select_dtypes(include=[np.number])
X_test = test_data[features]
female = train_data[train_data['Sex']=='female']
male   = train_data[train_data['Sex']=='male']

#
female_Survived = female['Survived']
male_Survived = male['Survived']
# print(sum(female_Survived)/len(female_Survived))
# print(sum(male_Survived)/len(male_Survived))

#= One Hot Encoding ===========================================================
from sklearn.preprocessing import OneHotEncoder
# cat_encoder = OneHotEncoder(handle_unknown='ignore')
# print(X_train_cat.head())
# cat_encoder.fit(X_train_cat)

# X_train_cat_prep = pd.DataFrame(cat_encoder.transform(X_train_cat).toarray(), columns=cat_encoder.get_feature_names_out())
# print(X_train_cat_prep.head())

#= Replace the missing value with median =======================================
from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="median")
# X_train_num_prep = imputer.fit_transform(X_train_num)
# X_train_num_prep = pd.DataFrame(X_train_num_prep, columns=imputer.get_feature_names_out())
# print(X_train_num_prep.head())

#= Combine the two =============================================================
# X_train_prep = pd.concat([X_train_cat_prep, X_train_num_prep], axis=1)
# print(X_train_prep.head())


#= Pipeline ====================================================================
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler

num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
							 StandardScaler())

cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# X_train_num_prep = num_pipeline.fit_transform(X_train)
# X_train_num_prep = pd.DataFrame(X_train_num_prep, columns=num_pipeline.get_feature_names_out())
# X_train_cat_prep = cat_pipeline.fit_transform(X_train).toarray()
# X_train_cat_prep = pd.DataFrame(X_train_cat_prep, columns=cat_pipeline.get_feature_names_out())
# X_train_num_prep.head()
# X_train_cat_prep.head()

preprocess_pipeline = make_column_transformer(
	(num_pipeline, make_column_selector(dtype_include=np.number)),
	(cat_pipeline, make_column_selector(dtype_include=object))
)

X_train_prep = preprocess_pipeline.fit_transform(X_train)
X_train_prep = pd.DataFrame(X_train_prep, columns=preprocess_pipeline.get_feature_names_out())
X_train_prep.head()

Unnamed: 0,pipeline-1__Age,pipeline-1__Fare,pipeline-2__Sex_female,pipeline-2__Sex_male
0,-0.565736,-0.502445,0.0,1.0
1,0.663861,0.786845,1.0,0.0
2,-0.258337,-0.488854,1.0,0.0
3,0.433312,0.42073,1.0,0.0
4,0.433312,-0.486337,0.0,1.0


In [4]:

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score 


print(cross_val_score(RandomForestClassifier(n_estimators=200), 
					  X_train_prep, Y_train, cv=5, scoring="accuracy"))
print(cross_val_score(VotingClassifier(estimators=[('lr', LogisticRegression()), 
												   ('sgd', SGDClassifier()),
												   ('svc', SVC()),
												   ('knn', KNeighborsClassifier()),
												   ('dt', DecisionTreeClassifier()),
												   ], 
												   voting='hard'), 
												   X_train_prep, Y_train, cv=5, scoring="accuracy"))

model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_prep, Y_train)

X_test_prep = preprocess_pipeline.transform(X_test)
X_test_prep = pd.DataFrame(X_test_prep, columns=preprocess_pipeline.get_feature_names_out())
X_test_prep.head()

Y_test_pred = model.predict(X_test_prep)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': Y_test_pred})
print(output.head())
output.to_csv('submission.csv', index=False)

[0.75977654 0.78089888 0.80337079 0.76966292 0.80337079]
[0.81005587 0.81460674 0.76966292 0.75842697 0.79213483]
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         1
