In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearnex import patch_sklearn,config_context
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder,FunctionTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.cluster import KMeans

In [4]:
train_df = pd.read_csv('train.csv')

In [5]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
def percent_missing(df):
    missing = df.isna().sum().sort_values(ascending=False)
    missing = missing[missing > 0]
    return missing * 100 /len(df)

In [7]:
percent_missing(train_df)

CryoSleep       2.496261
ShoppingMall    2.392730
VIP             2.335212
HomePlanet      2.312205
Name            2.300702
Cabin           2.289198
VRDeck          2.162660
FoodCourt       2.105142
Spa             2.105142
Destination     2.093639
RoomService     2.082135
Age             2.059128
dtype: float64

In [8]:
X = train_df.drop("Transported",axis=1)
y = train_df["Transported"]

In [9]:
X["CryoSleep"] = X["CryoSleep"].astype("str")
X["VIP"] = X["VIP"].astype("str")


In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)

In [11]:
cat_columns = X.select_dtypes(include=['object',"category"]).columns
num_columns = X.select_dtypes(exclude=['object',"category"]).columns

In [12]:
numeric_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy="constant",fill_value=0)),
    ('scaler',StandardScaler())
])
categorical_transformer = Pipeline([
    ("imputer",SimpleImputer(strategy="constant",fill_value="Missing")),
    ("encoder",OneHotEncoder(handle_unknown="ignore",sparse_output=False,min_frequency=10,max_categories=10)),
])


In [13]:
preprocessor = ColumnTransformer([
    ("num",numeric_transformer,num_columns),
    ("cat",categorical_transformer,cat_columns),
], remainder="passthrough")

In [14]:
X_train_preprocessed = preprocessor.fit_transform(X_train)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier,StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

In [16]:
models ={
    "LogisticRegression": LogisticRegression(max_iter=10000,solver="saga"),
    "RandomForest":RandomForestClassifier(random_state=42,n_jobs=-2),
    "AdaBoost":AdaBoostClassifier(random_state=42),
    "GradientBoosting":GradientBoostingClassifier(random_state=42),
    "KNN":KNeighborsClassifier(),
    "SVC":SVC(random_state=42),
} 

In [17]:
param_grids = {
    "LogisticRegression":{
        "penalty" : ['elasticnet'],
        "l1_ratio" : [0.05,0.1,0.2,0.4,0.8],
        "C":[0.1,0.8,1,2,10],
    },
    "RandomForest":{
        "n_estimators":[68,100,150,200,300,350,400,450,500,600],
        # "min_samples_split":[2,4,6,8,10],
        "min_samples_split":[2,4,6,10,12,15],
        # "min_samples_leaf":[1,2,4,6,8,10],
        "min_samples_leaf":[2,4,5,6,7],
    },
    "AdaBoost":{
        "n_estimators":[100,400,500,600,700,750,800],
        "learning_rate":[0.001,0.01,0.05,0.1],
        # "algorithm":["SAMME.R","SAMME"],
    },
    "GradientBoosting":{
        "n_estimators":[80,100,120,140,150,160,170,200,300,400,500,600],
        # "learning_rate":[0.001,0.01,0.03,0.05,0.08,0.1,1],
        "learning_rate":[0.03,0.05,0.07,0.08,0.09,0.1,1],
        # "max_depth":[2,3,4,5,6,10],
        "max_depth":[3,4,5,6,7,8],
    },
    "KNN":{
        "n_neighbors":np.arange(2,50),
    },
    "SVC":{
        "C":[0.01,0.1,1,7,10,15],
        # "kernel":['linear','poly','rbf'],
        "gamma":["scale","auto"],
    }
}

In [18]:
cv = KFold(n_splits=5, random_state=42, shuffle=True)

In [30]:
grids = {}
for model_name,model in models.items():
    grids[model_name] = GridSearchCV(model,param_grids[model_name],cv=cv,n_jobs=-2,scoring='accuracy')
    grids[model_name].fit(X_train_preprocessed,y_train)
    print(f"{model_name} best score : {grids[model_name].best_score_}")
    print(f"{model_name} best params : {grids[model_name].best_params_}")

KeyboardInterrupt: 

In [19]:
def custom_features(df):
    df_out = df.copy()
    df_mean_encode = pd.concat([X_train,y_train],axis=1)
    df_out["Deck"] = df_out["Cabin"].apply(lambda x: str(x)[:1])
    df_out["Side"] = df_out["Cabin"].apply(lambda x: str(x)[-1:])
    df_out["CabinNumber"] = df_out["Cabin"].apply(lambda x: str(x)[2:-2])
    df_out["CabinNumber"] = df_out["CabinNumber"].apply(lambda x:int(x) if x.isdigit() else np.nan)
    df_out["CabinNum_0_300"] = (df_out["CabinNumber"] < 300).astype(str)
    df_out["CabinNum_300_600"] = ((df_out["CabinNumber"] > 300) & (df_out["CabinNumber"] < 600)).astype(str)
    df_out["CabinNum_600_900"] = ((df_out["CabinNumber"] > 600) & (df_out["CabinNumber"] < 900)).astype(str)
    df_out["CabinNum_900_1200"] = ((df_out["CabinNumber"] > 900) & (df_out["CabinNumber"] < 1200)).astype(str)
    df_out["CabinNum_1200_1500"] = ((df_out["CabinNumber"] > 1200) & (df_out["CabinNumber"] < 1500)).astype(str)
    df_out["CabinNum_1500_1800"] = ((df_out["CabinNumber"] > 1500) & (df_out["CabinNumber"] < 1800)).astype(str)
    
    df_out["DeckSideMeanEncoded"] = df_out["Deck"].apply(str)+"-"+df_out["Side"].apply(str)
    df_out["DeckSideMeanEncoded"] = df_out["DeckSideMeanEncoded"].apply(lambda x:"Missing" if "n" in x else x)
    df_mean_encode["Deck"] = df_mean_encode["Cabin"].apply(lambda x: str(x)[:1])
    df_mean_encode["Side"] = df_mean_encode["Cabin"].apply(lambda x: str(x)[-1:])
    df_mean_encode["DeckSideMeanEncoded"] = df_mean_encode["Deck"].apply(str)+"-"+df_mean_encode["Side"].apply(str)
    df_mean_encode["DeckSideMeanEncoded"] = df_mean_encode["DeckSideMeanEncoded"].apply(lambda x:"Missing" if "n" in x else x)
    dt = df_mean_encode.groupby("DeckSideMeanEncoded")["Transported"].mean()
    df_out["DeckSideMeanEncoded"] = df_out.merge(dt,on="DeckSideMeanEncoded",how="left")["Transported"].values
    
    df_out["DestinationHomePlanetMeanEncoded"] = df_out["Destination"].apply(str)+"-"+df_out["HomePlanet"].apply(str)
    df_out["DestinationHomePlanetMeanEncoded"] = df_out["DestinationHomePlanetMeanEncoded"].apply(lambda x:"Missing" if "nan" in x else x)
    df_mean_encode["DestinationHomePlanetMeanEncoded"] = df_mean_encode["Destination"].apply(str)+"-"+df_mean_encode["HomePlanet"].apply(str)
    df_mean_encode["DestinationHomePlanetMeanEncoded"] = df_mean_encode["DestinationHomePlanetMeanEncoded"].apply(lambda x:"Missing" if "nan" in x else x)
    dt = df_mean_encode.groupby("DestinationHomePlanetMeanEncoded")["Transported"].mean()
    df_out["DestinationHomePlanetMeanEncoded"] = df_out.merge(dt,on="DestinationHomePlanetMeanEncoded",how="left")["Transported"].values
    df_out.drop("Cabin",axis=1,inplace=True)
    return df_out

In [20]:
feature_transformer = FunctionTransformer(custom_features)

In [21]:
new_cat_columns = pd.Index(["Deck","Side","CabinNum_1500_1800","CabinNum_1200_1500","CabinNum_900_1200","CabinNum_600_900","CabinNum_300_600","CabinNum_0_300"])
new_num_columns = pd.Index(["DeckSideMeanEncoded","DestinationHomePlanetMeanEncoded","CabinNumber"])
cat_columns = X.drop("Cabin",axis=1).select_dtypes(include=['object',"category"]).columns.append(new_cat_columns)
num_columns = X.select_dtypes(exclude=['object',"category"]).columns.append(new_num_columns)

In [22]:
preprocessor = ColumnTransformer([
    ("num",numeric_transformer,num_columns),
    ("cat",categorical_transformer,cat_columns),
], remainder="passthrough")

In [23]:
preprocessor_fe = Pipeline([
    ("feature_transformer",feature_transformer),
    ("preprocessor",preprocessor)
])

In [24]:
X_train_preprocessed_fe = preprocessor_fe.fit_transform(X_train)
X_val_preprocessed_fe = preprocessor_fe.transform(X_val)
X_full_preprocessed_fe = preprocessor_fe.fit_transform(X)

In [25]:
inertais = []
for k in range(2,70):
    kmeans = KMeans(n_clusters=k,random_state=42)
    kmeans.fit(X_train_preprocessed_fe,y_train)
    inertais.append(kmeans.inertia_)
print(pd.Series(inertais).diff)

<bound method Series.diff of 0     77968.511262
1     71969.840321
2     67241.047332
3     63861.312871
4     60760.584644
          ...     
63    24887.478422
64    25170.421023
65    24935.057513
66    24560.195129
67    24693.546935
Length: 68, dtype: float64>


In [26]:
def get_kmeans_clusters(df):
    kmeans = KMeans(n_clusters=6,random_state=42)
    kmeans.fit(X_train_preprocessed_fe,y_train)
    clusters = kmeans.predict(df)
    clusters = np.reshape(clusters,(-1,1))
    df_out = np.append(df,clusters,axis=1)
    return df_out

In [27]:
X_train_preprocessed_fe_cluster = get_kmeans_clusters(X_train_preprocessed_fe)
X_val_preprocessed_fe_cluster = get_kmeans_clusters(X_val_preprocessed_fe)
X_full_preprocessed_fe_cluster = get_kmeans_clusters(X_full_preprocessed_fe)

In [40]:
grids = {}
for model_name,model in models.items():
    grids[model_name] = GridSearchCV(model,param_grids[model_name],cv=cv,n_jobs=-2,scoring='accuracy')
    grids[model_name].fit(X_train_preprocessed_fe_cluster,y_train)
    print(f"{model_name} best score : {grids[model_name].best_score_}")
    print(f"{model_name} best params : {grids[model_name].best_params_}")

LogisticRegression best score : 0.7976690854361802
LogisticRegression best params : {'C': 0.1, 'l1_ratio': 0.4, 'penalty': 'elasticnet'}
RandomForest best score : 0.8103224738684969
RandomForest best params : {'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 500}




AdaBoost best score : 0.7983879926971436
AdaBoost best params : {'learning_rate': 0.1, 'n_estimators': 750}
GradientBoosting best score : 0.8100360488029418
GradientBoosting best params : {'learning_rate': 0.08, 'max_depth': 5, 'n_estimators': 100}
KNN best score : 0.7832888714190401
KNN best params : {'n_neighbors': 9}
SVC best score : 0.8081664761648625
SVC best params : {'C': 1, 'gamma': 'scale'}


In [66]:
tuned_models = {
    "LogisticRegression": LogisticRegression(max_iter=10000,solver="saga",C=0.1,l1_ratio=0.4,penalty="elasticnet",random_state=42),
    "RandomForest":RandomForestClassifier(random_state=42,n_jobs=-2,n_estimators=600,min_samples_leaf=6,min_samples_split=2),    
    "AdaBoost":AdaBoostClassifier(random_state=42,n_estimators=750,learning_rate=0.1),
    "GradientBoosting":GradientBoostingClassifier(random_state=42,n_estimators=150,learning_rate=0.08,max_depth=5),
    "KNN":KNeighborsClassifier(n_neighbors=9),
    "SVC":SVC(random_state=42,gamma="scale",C=1),
    "StackingClassifier":StackingClassifier(estimators=[
        ('LogisticRegression', LogisticRegression(C=2, l1_ratio=0.8, max_iter=10000, penalty='elasticnet',random_state=42, solver='saga')), ('RandomForest', RandomForestClassifier(min_samples_leaf=6, min_samples_split=2,n_estimators=600, n_jobs=-2, random_state=42)), ('AdaBoost', AdaBoostClassifier(learning_rate=0.1, n_estimators=750, random_state=42)), ('GradientBoosting', GradientBoostingClassifier(learning_rate=0.08, max_depth=5, n_estimators=100,random_state=42)), ('KNN', KNeighborsClassifier(n_neighbors=19)), ('SVC', SVC(C=7, random_state=42))
    ],n_jobs=-2),
    "StackingClassifier2":StackingClassifier(estimators=[
         ('RandomForest', RandomForestClassifier(min_samples_leaf=6, min_samples_split=2,n_estimators=600, n_jobs=-2, random_state=42)),('GradientBoosting', GradientBoostingClassifier(learning_rate=0.08, max_depth=5, n_estimators=150,random_state=42)),('SVC', SVC(C=7, random_state=42))
    ],n_jobs=-2),
    "VotingClassifier" : VotingClassifier([
        ('SVC', SVC(C=1, random_state=42)) , ('RandomForest',RandomForestClassifier(min_samples_leaf=6, min_samples_split=2,n_estimators=600, n_jobs=-2, random_state=42)), ('GradientBoosting',GradientBoostingClassifier(learning_rate=0.08, max_depth=5,n_estimators=150,random_state=42))
    ],n_jobs=-2),
    "VotingClassifier2" : VotingClassifier([
        ('SVC', SVC(C=1, random_state=42)) ,('GradientBoosting',GradientBoostingClassifier(learning_rate=0.08, max_depth=5,n_estimators=150,random_state=42))
    ],n_jobs=-2)
}

In [67]:
grids = {}
for model_name,model in tuned_models.items():
    grids[model_name] = model
    grids[model_name].fit(X_train_preprocessed_fe_cluster,y_train)
    pred = grids[model_name].predict(X_val_preprocessed_fe_cluster)
    print(model_name)
    print(confusion_matrix(y_val,pred))
    print(classification_report(y_val,pred))

LogisticRegression
[[659 202]
 [170 708]]
              precision    recall  f1-score   support

       False       0.79      0.77      0.78       861
        True       0.78      0.81      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739

RandomForest
[[685 176]
 [181 697]]
              precision    recall  f1-score   support

       False       0.79      0.80      0.79       861
        True       0.80      0.79      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739





AdaBoost
[[638 223]
 [154 724]]
              precision    recall  f1-score   support

       False       0.81      0.74      0.77       861
        True       0.76      0.82      0.79       878

    accuracy                           0.78      1739
   macro avg       0.79      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739

GradientBoosting
[[674 187]
 [151 727]]
              precision    recall  f1-score   support

       False       0.82      0.78      0.80       861
        True       0.80      0.83      0.81       878

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739

KNN
[[684 177]
 [223 655]]
              precision    recall  f1-score   support

       False       0.75      0.79      0.77       861
        True       0.79      0.75      0.77       878

    accuracy                           0.77      1739
   macro avg       0.77      

In [70]:
test_df = pd.read_csv("test.csv")

In [71]:
test_df_preprocessed_fe = preprocessor_fe.transform(test_df)
test_df_preprocessed_fe_cluster =get_kmeans_clusters(test_df_preprocessed_fe)

In [72]:
# grids["VotingClassifier2"].fit(X_full_preprocessed_fe_cluster,y)
# test_df["Transported"] = grids["VotingClassifier2"].predict(test_df_preprocessed_fe_cluster)

In [33]:
# grids["StackingClassifier2"].fit(X_full_preprocessed_fe_cluster,y)
# test_df["Transported"] = grids["StackingClassifier2"].predict(test_df_preprocessed_fe_cluster)

In [55]:
grids["GradientBoosting"].fit(X_full_preprocessed_fe_cluster,y)
test_df["Transported"] = grids["GradientBoosting"].predict(test_df_preprocessed_fe_cluster)

In [73]:
submission = pd.DataFrame(test_df[["PassengerId","Transported"]])

In [74]:
# submission.to_csv("VotingClassifier_clustered_X_full_no_rf.csv",index=False)

In [None]:
submission.to_csv("XGBoost_clustered_X_full.csv",index=False)