In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Read the Data

In [2]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Print each column's dtype and non-null count for the training frame.
train.info()

In [None]:
# Summary statistics of the training frame (numeric columns by default).
train.describe()

# Plot the Data

In [None]:

# Count plot of each categorical feature, with bars split by `Transported`.
# The four panels are laid out on a 2x2 grid so no grid row is left empty,
# and the explicit Axes objects are passed to seaborn instead of relying on
# pyplot's "current axes" state.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

for ax, feature in zip(axes.flat, ['VIP', 'CryoSleep', 'HomePlanet', 'Destination']):
    sns.countplot(data=train, x=feature, hue='Transported', ax=ax)
    ax.set_title(f" {feature} vs Transported")

fig.tight_layout()
plt.show()

In [None]:
# Distinct values present in Destination (NaN shows up as a value if any are missing).
train['Destination'].unique()

In [None]:
# Number of missing Destination entries.
train['Destination'].isna().sum()

In [None]:
# Distinct values present in HomePlanet (NaN shows up as a value if any are missing).
train['HomePlanet'].unique()

In [None]:
# Number of missing HomePlanet entries.
train['HomePlanet'].isna().sum()

In [None]:
# Number of missing CryoSleep entries.
train['CryoSleep'].isna().sum()

In [None]:
# Number of distinct Cabin values; unique() keeps NaN, so missing counts as one value.
len(train['Cabin'].unique())

In [None]:
# Number of missing Cabin entries.
train['Cabin'].isna().sum()

In [None]:
# Distinct values of the Transported target column.
train["Transported"].unique()

In [None]:
# Number of rows whose Transported target is missing.
train["Transported"].isna().sum()

# Dealing with missing values

In [16]:
# Remove the Name and Cabin columns from both frames; they are not used as
# model features further down.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it a second time, when
# the columns are already gone, no longer raises KeyError.
train = train.drop(columns=["Name", "Cabin"], errors="ignore")
test = test.drop(columns=["Name", "Cabin"], errors="ignore")

In [17]:
# Categorical features: every object-dtype column except the PassengerId key.
categorical_columns = train.select_dtypes(include=['object']).columns.drop('PassengerId')
# Numerical features: every numeric dtype. The previous ["int32", "float64"]
# filter silently skipped int64 columns (pandas' default integer dtype), which
# would then never reach the imputer.
# NOTE(review): assumes the `Transported` target is boolean and is therefore
# matched by neither selector - confirm against train.info().
numerical_columns = train.select_dtypes(include='number').columns

In [None]:
# Replace missing categorical values with an explicit "unknown" label and cast
# every value to str so each column holds a single, encodable type.
# The result is assigned back to the frame: calling
# `frame[col].fillna(..., inplace=True)` acts on a temporary Series, which
# triggers a chained-assignment warning on pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is active.
for col in categorical_columns:
    train[col] = train[col].fillna("unknown").astype(str)
    test[col] = test[col].fillna("unknown").astype(str)

In [None]:
# Show the distinct values of every categorical column in the training frame.
for column_name in categorical_columns:
    distinct_values = train[column_name].unique()
    print(distinct_values)

### - Applying label encoder on categorical features

In [20]:
# Encode every categorical column as integer codes.
# - One encoder per column is kept in `label_encoders`, so the code -> label
#   mapping of each feature stays available (a single shared encoder only
#   remembers the last column it was fitted on).
# - Each encoder is fitted on the train and test values together, so a
#   category that appears only in the test frame no longer makes
#   `transform` raise ValueError. When test has no extra categories the codes
#   are the same as fitting on train alone.
# After the loop `le` is still bound to the encoder of the last column.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

In [None]:
# Show the distinct values of every categorical column in the training frame.
for column_name in categorical_columns:
    distinct_values = train[column_name].unique()
    print(distinct_values)

### - Apply KNN imputer to fill missing numerical values

In [22]:
# Fill missing numeric values with a 5-nearest-neighbour imputer.
# The imputer is fitted on the training frame only and then applied to both
# frames, so the test frame is imputed from neighbours found in train.
# NOTE(review): the fit happens before the train/validation split, so rows
# that later land in the validation set also influence the imputer.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train[numerical_columns])

train[numerical_columns] = imputer.transform(train[numerical_columns])
test[numerical_columns] = imputer.transform(test[numerical_columns])

# Splitting the train set into train/val sets to evaluate the models

In [23]:
# Target is the Transported column; features are everything else except the
# PassengerId key. 20% of the rows are held out for validation, with a fixed
# seed so the split is repeatable.
y = train['Transported']
X = train.drop(columns=['PassengerId', 'Transported'])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=98
)

In [None]:
# Preview the first rows of the training feature matrix.
X_train.head()

# Finding the best model

In [25]:
# Candidate classifiers to compare, keyed by the display name printed in the
# evaluation loop. Every estimator gets a fixed seed so the comparison gives
# the same numbers on each run (the tree ensembles are otherwise randomised).
MODEL_SEED = 98  # same value as the seed of the train/validation split
models = {
    "XGBoost": XGBClassifier(random_state=MODEL_SEED),
    # NOTE(review): max_iter is raised from the default of 100 because the
    # features are not scaled and the solver may otherwise stop before
    # converging - confirm whether a ConvergenceWarning was seen.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=MODEL_SEED),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Extra Trees": ExtraTreesClassifier(random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Light GBM": LGBMClassifier(random_state=MODEL_SEED)
}

In [None]:
# Fit each candidate on the training split and report its validation accuracy
# and per-class metrics under the model's display name.
for name, model in models.items():
    print(f'\n{name}')
    y_pred = model.fit(X_train, y_train).predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred)
    print(f'\nAccuracy: {val_accuracy:.3f}')
    print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")

In [None]:
# Exhaustive 5-fold grid search over LightGBM hyper-parameters, fitted on the
# training split only (the validation split stays untouched for the final check).
# Six parameters have three candidates each: 3**6 = 729 combinations, i.e.
# 3645 fits with cv=5 - expect this cell to run for a long time.
light_gbm_model = LGBMClassifier(random_state=98)  # seeded so the search is repeatable


lgbm_grid_params = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300, 500],
    'boosting_type': ['gbdt'],
    'subsample': [0.6, 0.8, 1.0],
    # LightGBM only performs row subsampling when subsample_freq (bagging_freq)
    # is > 0. With the default of 0 the three `subsample` candidates train
    # identical models, tripling the search cost for nothing.
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

lgbm_grid = GridSearchCV(
    estimator=light_gbm_model,
    param_grid=lgbm_grid_params,
    scoring='accuracy',  # explicit; matches the metric reported elsewhere in the notebook
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lgbm_grid.fit(X_train, y_train)

In [361]:
# Estimator refitted by the grid search using its best-scoring parameter set.
best_model = lgbm_grid.best_estimator_

In [None]:
# joblib is imported in this cell rather than with the imports at the top of
# the notebook, so later cells that call joblib depend on this cell having run.
import joblib  
# Persist the tuned estimator to the working directory.
joblib.dump(best_model, "best_model_lgbm.pkl")

In [None]:
# Horizontal bar chart of the tuned LightGBM model's importance score for each
# training feature, drawn through an explicit Axes object.
feature_importancity = best_model.feature_importances_
features = X_train.columns

fig, ax = plt.subplots()
ax.barh(features, feature_importancity, color="salmon")
ax.set_xlabel("Feature Importance")
ax.set_title("Light GBM Feature Importances")
plt.show()

In [None]:
# Score the tuned model on the held-out validation split.
y_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print(f"Best Light GBM model's accuracy: {val_accuracy:.3f}")
print("\nClassification Report:")
print(f"\n{val_report}")

# Training final model with the whole dataset.

In [365]:
# Parameter combination that scored best in the grid search.
best_params = lgbm_grid.best_params_

# Fresh, unfitted classifier configured with those parameters.
final_model = LGBMClassifier(**best_params)

In [None]:
# Fit on every labelled row (X, y), i.e. the training and validation portions together.
final_model.fit(X,y)

In [None]:
# Persist the final model; relies on the earlier cell that imports joblib having run.
joblib.dump(final_model, "final_model_lgbm.pkl")

In [371]:
# Test features: everything except the PassengerId key.
X_test_submission = test.drop(columns="PassengerId")

In [372]:
# Predict the target for every test row with the final model.
y_pred_submission =  final_model.predict(X_test_submission)

In [373]:
# Two-column submission frame: the PassengerId key plus the predicted target.
submission = test[["PassengerId"]].assign(Transported=y_pred_submission)

In [375]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)