# This script loads a dataset and performs model selection between logistic regression, a decision tree, and a random forest.

# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Model-selection script: load a CSV, hold out a test split, compare three
# classifiers by 5-fold cross-validated accuracy on the training split, then
# refit the winner on the full training split and report held-out accuracy.

# Single seed shared by the split and the randomized estimators so that the
# model chosen below is the same from run to run.
RANDOM_STATE = 42

# Load the dataset (replace 'dataset.csv' with your actual dataset file)
data = pd.read_csv('dataset.csv')

# Assume the target variable is named 'target'; every other column is
# treated as a feature.
X = data.drop('target', axis=1)
y = data['target']

# Hold out 20% of the rows for the final evaluation; the remaining 80% is
# used for cross-validation and for fitting the selected model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Initialize candidate models. The tree-based estimators are seeded because
# they are randomized; unseeded, the winner could flip between runs.
logreg_model = LogisticRegression()
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)

# Cross-validate each candidate on the training split only, so the test
# split plays no part in choosing the model.
logreg_scores = cross_val_score(logreg_model, X_train, y_train, cv=5, scoring='accuracy')
dt_scores = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='accuracy')
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

# Select the candidate with the highest mean cross-validation accuracy.
# Starting from -inf (rather than 0.0) means a candidate whose mean accuracy
# is exactly 0.0 can still be chosen. The strict '>' keeps the earliest
# candidate on ties and skips NaN means (any comparison with NaN is False).
best_model = None
best_score = float("-inf")
for candidate_model, candidate_scores in (
    (logreg_model, logreg_scores),
    (dt_model, dt_scores),
    (rf_model, rf_scores),
):
    mean_score = candidate_scores.mean()
    if mean_score > best_score:
        best_model = candidate_model
        best_score = mean_score

# Fail with a clear message instead of an AttributeError on None when no
# candidate produced a comparable score.
if best_model is None:
    raise RuntimeError("No candidate model produced a valid cross-validation score")

# Train the best model on the full training split and evaluate it on the
# held-out test set.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best Model:", best_model.__class__.__name__)
print("Test Accuracy:", accuracy)
