In [54]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [89]:
df = pd.read_csv("test.csv")

In [90]:
df = df.drop("Name", axis=1)

In [None]:
def estimate_model(my_model, x_train, x_test, y_train, y_test):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

In [51]:
def predict_by_catboost(column):
    mydf = df.copy().drop("PassengerId", axis=1)
    mydf = mydf.drop("Transported", axis=1)
    data_x, data_y = mydf[mydf[column].isna() == False].drop(column, axis=1), mydf[mydf[column].isna() == False][column]
    #print(data_x, data_y)
    vectorizer = TfidfVectorizer()
    data_x = vectorizer.fit_transform(data_x).toarray()
    print(data_x)
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y)
    model = CatBoostClassifier()
    param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 5, 7]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_result = grid_search.fit(x_train, y_train)
    estimate_model(grid_result)
    need_predict = mydf[mydf[column].isna()].drop(column, axis=1)
    pred = grid_search.best_estimator_.predict(need_predict)
    pred = vectorizer.fit_transform(pred).toarray()
    print(pred)
    mydf[mydf[column].isna() == False][column] = pred
    return mydf




**Predict Crysleep**

In [52]:
newdf = predict_by_catboost("CryoSleep")

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


ValueError: Found input variables with inconsistent numbers of samples: [11, 8476]

**Prepare data**

In [82]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor, StackingClassifier, StackingRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

def predict_nulls(df):
    # Create new columns based on Cabin
    df['Deck'], df['Num'], df['Side'] = df['Cabin'].str.split('/', 2).str
    df = df.drop(['Cabin'], axis=1)

    # Encode categorical variables
    le = preprocessing.LabelEncoder()
    df['HomePlanet'] = df['HomePlanet'].astype(str)
    df['HomePlanet'] = le.fit_transform(df['HomePlanet'])

    df['Destination'] = df['Destination'].astype(str)
    df['Destination'] = le.fit_transform(df['Destination'])

    df['CryoSleep'] = df['CryoSleep'].astype(str)
    df['CryoSleep'] = le.fit_transform(df['CryoSleep'])

    df['VIP'] = df['VIP'].astype(str)
    df['VIP'] = le.fit_transform(df['VIP'])

    df['Deck'] = df['Deck'].astype(str)
    df['Deck'] = le.fit_transform(df['Deck'])

    df['Side'] = df['Side'].astype(str)
    df['Side'] = le.fit_transform(df['Side'])

    # Define the base models and the meta learner for the stacking classifiers/regressors
    base_models = [
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('hgbc', HistGradientBoostingClassifier())
    ]
    meta_learner = LogisticRegression()

    base_models_reg = [
        ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
        ('knn', KNeighborsRegressor()),
        ('hgbr', HistGradientBoostingRegressor())
    ]
    meta_learner_reg = LinearRegression()

    # Loop over columns to fill null values
    for column in df.columns:
        print(col)
        # Do not try to predict nulls for 'Transported' or 'PassengerId'
        if column not in ['Transported', 'PassengerId']:
            # Check if column has null values
            if df[column].isnull().sum() > 0:
                # Split data into sets with known and unknown column values
                known = df[df[column].notnull()]
                unknown = df[df[column].isnull()]

                # Split features and labels
                y_known = known[column]
                X_known = known.drop([column], axis=1)

                # Check if the column is a float (and therefore a regression problem)
                if df[column].dtype == 'float64':
                    # Initialize and fit a StackingRegressor
                    stacking_reg = StackingRegressor(estimators=base_models_reg, final_estimator=meta_learner_reg)
                    stacking_reg.fit(X_known, y_known)

                    # Predict missing values and fill in the original dataframe
                    df.loc[(df[column].isnull()), column] = stacking_reg.predict(unknown.drop([column], axis=1))

                else:  # The problem is a classification one
                    stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_learner)
                    stacking_clf.fit(X_known, y_known)

                    # Predict missing values and fill in the original dataframe
                    df.loc[(df[column].isnull()), column] = stacking_clf.predict((unknown.drop([column], axis=1))

    return df


In [93]:
# Assuming your DataFrame is named 'df'
predicted_df = predict_nulls(df)


In [94]:
predicted_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
0,0013_01,0,1,2,27.000000,0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,0018_01,0,0,2,19.000000,0,0.0,9.0,0.0,2823.0,0.0,0,1,0
2,0019_01,1,1,0,31.000000,0,0.0,0.0,0.0,0.0,0.0,0,1,0
3,0021_01,1,0,2,38.000000,0,0.0,6652.0,0.0,181.0,585.0,0,1,0
4,0023_01,0,0,2,20.000000,0,10.0,0.0,635.0,0.0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,0,1,2,34.000000,0,0.0,0.0,0.0,0.0,0.0,0,1,0
4273,9269_01,0,0,2,42.000000,0,0.0,847.0,17.0,10.0,144.0,0,1,0
4274,9271_01,2,1,0,25.030415,0,0.0,0.0,0.0,0.0,0.0,0,1,0
4275,9273_01,1,0,3,41.120857,0,0.0,2680.0,0.0,0.0,523.0,0,1,0


In [95]:
predicted_df.to_csv("features/Test_data_gpt3predcited.csv")