In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# --- Training data ---------------------------------------------------------
# Read the labelled rows. 'smoking' is the target; 'id' is excluded from the
# feature matrix along with it.
train_data = pd.read_csv('train.csv')
y = train_data['smoking']
X = train_data.drop(columns=['id', 'smoking'])

# Hold out 20% of the rows for validation, with a fixed seed for the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Feature scaling -------------------------------------------------------
# The scaler is fitted on the training portion only; the validation portion
# (and later the test set) is transformed with those same fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# --- Model: logistic regression --------------------------------------------
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the hold-out rows using column 1 of predict_proba and report ROC AUC.
val_preds = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, val_preds)
print(f'Validation ROC AUC: {roc_auc:.4f}')

# --- Test set and submission -----------------------------------------------
# Apply the same preprocessing to the test rows ('id' removed, then scaled).
test_data = pd.read_csv('test.csv')
X_test = scaler.transform(test_data.drop(columns='id'))
test_preds = model.predict_proba(X_test)[:, 1]

# Two-column frame (id, predicted probability) written without the index.
submission = test_data[['id']].assign(smoking=test_preds)
submission.to_csv('submission.csv', index=False)

Validation ROC AUC: 0.8319


In [2]:
# Display the first rows of the submission frame as a quick visual check of
# the 'id' and 'smoking' columns.
submission.head()

Unnamed: 0,id,smoking
0,159256,0.39457
1,159257,0.197513
2,159258,0.639809
3,159259,0.1269
4,159260,0.653465
