# In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# End-to-end pipeline: load the train/test CSVs, label-encode the categorical
# columns, mean-impute missing values, fit a random forest, print the F1 score
# on a held-out validation split and write the test predictions to
# 'submission.csv'.

# Read the train data
train_data = pd.read_csv('Train_Data.csv')

# Separate features (X) and target variable (y)
X_train = train_data.drop('Healthy', axis=1)
y_train = train_data['Healthy']

# Read the test data
test_data = pd.read_csv('Test_Data.csv')

# Preprocess the data

# Encode categorical variables
categorical_columns = [
    'Specific ailments',
    'Food preference',
    'Smoker?',
    'Living in?',
    'Any heriditary condition?',
    'Follow Diet',
    'Mental health management',
]
for column in categorical_columns:
    # astype(str) turns missing values into the literal label 'nan', so they
    # are encoded as their own category instead of being imputed later.
    train_labels = X_train[column].astype(str)
    test_labels = test_data[column].astype(str)

    # Fit a fresh encoder per column on the labels from BOTH files. Fitting on
    # the training labels alone makes transform() raise ValueError as soon as
    # the test file contains a category the training file never had. When the
    # test file has no new categories, the learned classes (and therefore the
    # integer codes) are the same as when fitting on the training labels only.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_labels, test_labels], ignore_index=True))
    X_train[column] = encoder.transform(train_labels)
    test_data[column] = encoder.transform(test_labels)

# Handle missing values in the train and test data
# NOTE(review): the imputer is fitted before the train/validation split, so
# validation rows contribute to the imputation means. Kept as-is so the
# validation score stays comparable with earlier runs -- confirm whether the
# split should happen first.
imputer = SimpleImputer(strategy='mean')
# Keep the original row index so X_train stays aligned with y_train even if
# the frames do not carry the default 0..n-1 index.
X_train = pd.DataFrame(imputer.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
test_data = pd.DataFrame(imputer.transform(test_data),
                         columns=test_data.columns, index=test_data.index)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model
f1 = f1_score(y_val, y_val_pred)
print("F1 score:", f1)

# Predict on the test data
y_test_pred = model.predict(test_data)

# Prepare submission
submission = pd.DataFrame({'predictions': y_test_pred})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)


# Output:
# F1 score: 0.8613381774144716
