In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [None]:
# Load the table the model is fitted on and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="atlantis_citizens_final.csv")
df.head(n=5)


In [None]:
# Target is the "Occupation" column; the features are every other column of df.
# NOTE(review): the submission cell uses "Citizen_ID" as a row identifier. If
# the training file carries that column as well, it is passed to the model as
# an ordinary feature here -- confirm that this is intended.
y = df["Occupation"]
X = df.drop("Occupation", axis=1)

In [None]:
# Assign every feature column to exactly one preprocessing branch.
# Selecting by dtype family (rather than the exact 'int64'/'float64'/'object'
# dtypes) matters because any column listed in neither group is silently
# discarded by ColumnTransformer's default remainder="drop".
# Numeric branch: integers and floats of any width, plus booleans.
num_columns = X.select_dtypes(include=["number", "bool"]).columns
# Categorical branch: everything that is not numeric/boolean (strings,
# pandas categoricals, ...), i.e. the exact complement of num_columns.
cat_columns = X.select_dtypes(exclude=["number", "bool"]).columns

In [None]:
# Numeric branch: fill missing values with the column median, then
# standardise each column.
transform_numerical = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps transform() from raising on
# categories that were not present when the encoder was fitted.
transform_categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch. No `remainder` is given, so
# columns that appear in neither group are not passed on to the model.
preprocessor = ColumnTransformer([
    ("num", transform_numerical, num_columns),
    ("cat", transform_categorical, cat_columns),
])

In [None]:
# End-to-end estimator: the preprocessing step followed by a logistic
# regression whose solver is allowed up to 1000 iterations.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# Fit the full pipeline on (X, y), then predict on the very same rows.
# These are in-sample predictions, not an estimate of performance on new data.
y_pred_lr = model.fit(X, y).predict(X)

In [None]:
# Predict on the test file and write the result as a two-column CSV.
test_df = pd.read_csv("test_atlantis_hidden.csv")
# The target column may be missing from an unlabelled test file; drop it only
# when it is present instead of raising KeyError. Any other extra column
# (e.g. the identifier) is harmless: the preprocessor picks its inputs by name.
X_test = test_df.drop(columns=["Occupation"], errors="ignore")
test_pred = model.predict(X_test)
# One row per test record: the identifier read from "Citizen_ID" next to the
# predicted "Occupation" label.
output = pd.DataFrame({
    "id": test_df["Citizen_ID"],
    "Occupation": test_pred
})

# index=False keeps the DataFrame's positional index out of the file.
output.to_csv("output.csv", index=False)