# Import Packages

In [2]:
import pickle

import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Load and process the data

In [4]:
# Read the raw CSV and discard the 'ID' column in one step; the identifier
# is not used as a predictor.
# Replace 'your_dataset.csv' with the path to your dataset file.
data = pd.read_csv('your_dataset.csv').drop(columns='ID')

# Target is the 'smoking' column; every remaining column is a feature.
y = data['smoking']
X = data.drop(columns='smoking')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing pipelines.
# Numeric columns: fill missing values (SimpleImputer default strategy), then standardize.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes a category that was absent from the fitted data
# encode as all zeros instead of raising at transform time (the default is 'error'),
# which would otherwise break .score() on the test split or a cross-validation fold.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Identify numerical and categorical features from the training frame's dtypes.
# 'number' covers every numeric width (int32, float32, ...), not only int64/float64;
# columns matched by neither selector are dropped by the column transformer's
# default remainder='drop', so a too-narrow selector silently loses features.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns

# Create preprocessor using make_column_transformer: each transformer is applied
# only to its own list of columns.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features)
)

# MVP Logistic Regression and Decision Tree Models

In [5]:
# Construct pipelines for logistic regression and decision tree models.
# make_pipeline does not copy its steps, so each pipeline gets its own clone of
# the (unfitted) preprocessor; otherwise fitting one pipeline would refit the
# transformer object that lives inside the other.
logistic_pipeline = make_pipeline(clone(preprocessor), LogisticRegression())

# random_state pins the tree's internal randomness so that accuracy figures
# are reproducible across Restart & Run All.
decision_tree_pipeline = make_pipeline(
    clone(preprocessor),
    DecisionTreeClassifier(random_state=42),
)

# Train logistic regression model
logistic_pipeline.fit(X_train, y_train)

# Train decision tree model
decision_tree_pipeline.fit(X_train, y_train)

In [6]:
# Persist both fitted pipelines to disk with pickle, one file per model.
for pickle_path, fitted_pipeline in (
    ('logistic_pipeline.pkl', logistic_pipeline),
    ('decision_tree_pipeline.pkl', decision_tree_pipeline),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_pipeline, f)

In [7]:
# Hold-out evaluation: .score() on the test split for each fitted pipeline.

# Logistic regression
logistic_accuracy = logistic_pipeline.score(X_test, y_test)
print("Logistic Regression Model Accuracy:", logistic_accuracy)

# Decision tree
decision_tree_accuracy = decision_tree_pipeline.score(X_test, y_test)
print("Decision Tree Model Accuracy:", decision_tree_accuracy)

Logistic Regression Model Accuracy: 0.7410898644402549
Decision Tree Model Accuracy: 0.7769099560104139


In [8]:
# 5-fold cross-validation on the training split, scored by accuracy.
# Both sets of fold scores are computed first, then the means are reported.
logistic_cv_scores = cross_val_score(
    logistic_pipeline,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
decision_tree_cv_scores = cross_val_score(
    decision_tree_pipeline,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)

# Report the mean accuracy across the five folds for each model.
print("Logistic Regression Cross-Validation Mean Accuracy:", logistic_cv_scores.mean())
print("Decision Tree Cross-Validation Mean Accuracy:", decision_tree_cv_scores.mean())

Logistic Regression Cross-Validation Mean Accuracy: 0.7469752475565516
Decision Tree Cross-Validation Mean Accuracy: 0.7666376076378695
