In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

class LogisticRegressionModel:
    def __init__(self, csv_file, target_column, test_size=0.2, random_state=42):
        self.data = pd.read_csv(csv_file)
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def preprocess_data(self):
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        
        ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
        X_encoded = ohe.fit_transform(X.select_dtypes(include=['object']))
        X = np.hstack((X.select_dtypes(exclude=['object']).values, X_encoded))
        
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_and_evaluate(self):
        X_train, X_test, y_train, y_test = self.preprocess_data()

        model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        results = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred, average='weighted')
        }
        
        return results

class LinearRegressionModel:
    def __init__(self, csv_file, target_column, test_size=0.2, random_state=42):
        self.data = pd.read_csv(csv_file)
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def preprocess_data(self):
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        
        ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
        X_encoded = ohe.fit_transform(X.select_dtypes(include=['object']))
        X = np.hstack((X.select_dtypes(exclude=['object']).values, X_encoded))
        
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_and_evaluate(self):
        X_train, X_test, y_train, y_test = self.preprocess_data()

        model = LinearRegression()
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        results = {
            "mean_squared_error": mean_squared_error(y_test, y_pred),
            "r2_score": r2_score(y_test, y_pred)
        }
        
        return results

def main():
    # Step 1: Get the CSV file path from the user
    csv_file = input("Enter the path to the CSV file: ").strip()
    
    # Step 2: Get the target column name from the user
    target_column = input("Enter the name of the target column: ").strip()
    
    # Step 3: Choose the model type
    model_type = input("Enter 'logistic' for Logistic Regression or 'linear' for Linear Regression: ").strip().lower()
    
    if model_type == 'logistic':
        model = LogisticRegressionModel(csv_file, target_column)
        results = model.train_and_evaluate()
        
        # Print results for Logistic Regression
        print(f"Accuracy: {results['accuracy']}")
        print(f"F1 Score: {results['f1_score']}")
    
    elif model_type == 'linear':
        model = LinearRegressionModel(csv_file, target_column)
        results = model.train_and_evaluate()
        
        # Print results for Linear Regression
        print(f"Mean Squared Error: {results['mean_squared_error']}")
        print(f"R2 Score: {results['r2_score']}")
    
    else:
        print("Invalid model type. Please enter 'logistic' or 'linear'.")

if __name__ == "__main__":
    main()


Enter the path to the CSV file:  covid.csv
Enter the name of the target column:  has_covid
Enter 'logistic' for Logistic Regression or 'linear' for Linear Regression:  linear


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

class LogisticRegressionModel:
    def __init__(self, csv_file, target_column, test_size=0.2, random_state=42):
        self.data = pd.read_csv(csv_file)
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def preprocess_data(self):
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        
        # Impute missing values for numeric columns
        numeric_imputer = SimpleImputer(strategy='mean')
        X_numeric = X.select_dtypes(exclude=['object'])
        X_numeric = numeric_imputer.fit_transform(X_numeric)
        
        # Impute missing values for categorical columns
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        X_categorical = X.select_dtypes(include=['object'])
        X_categorical = categorical_imputer.fit_transform(X_categorical)
        
        # One-hot encode categorical columns
        ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
        X_encoded = ohe.fit_transform(X_categorical)
        
        # Combine numeric and encoded categorical columns
        X = np.hstack((X_numeric, X_encoded))
        
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_and_evaluate(self):
        X_train, X_test, y_train, y_test = self.preprocess_data()

        model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        results = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred, average='weighted')
        }
        
        return results

class LinearRegressionModel:
    def __init__(self, csv_file, target_column, test_size=0.2, random_state=42):
        self.data = pd.read_csv(csv_file)
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def preprocess_data(self):
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        
        # Impute missing values for numeric columns
        numeric_imputer = SimpleImputer(strategy='mean')
        X_numeric = X.select_dtypes(exclude=['object'])
        X_numeric = numeric_imputer.fit_transform(X_numeric)
        
        # Impute missing values for categorical columns
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        X_categorical = X.select_dtypes(include=['object'])
        X_categorical = categorical_imputer.fit_transform(X_categorical)
        
        # One-hot encode categorical columns
        ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
        X_encoded = ohe.fit_transform(X_categorical)
        
        # Combine numeric and encoded categorical columns
        X = np.hstack((X_numeric, X_encoded))
        
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_and_evaluate(self):
        X_train, X_test, y_train, y_test = self.preprocess_data()

        model = LinearRegression()
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        results = {
            "mean_squared_error": mean_squared_error(y_test, y_pred),
            "r2_score": r2_score(y_test, y_pred)
        }
        
        return results

def main():
    # Step 1: Get the CSV file path from the user
    csv_file = input("Enter the path to the CSV file: ").strip()
    
    # Step 2: Get the target column name from the user
    target_column = input("Enter the name of the target column: ").strip()
    
    # Step 3: Choose the model type
    model_type = input("Enter 'logistic' for Logistic Regression or 'linear' for Linear Regression: ").strip().lower()
    
    if model_type == 'logistic':
        model = LogisticRegressionModel(csv_file, target_column)
        results = model.train_and_evaluate()
        
        # Print results for Logistic Regression
        print(f"Accuracy: {results['accuracy']}")
        print(f"F1 Score: {results['f1_score']}")
    
    elif model_type == 'linear':
        model = LinearRegressionModel(csv_file, target_column)
        results = model.train_and_evaluate()
        
        # Print results for Linear Regression
        print(f"Mean Squared Error: {results['mean_squared_error']}")
        print(f"R2 Score: {results['r2_score']}")
    
    else:
        print("Invalid model type. Please enter 'logistic' or 'linear'.")

if __name__ == "__main__":
    main()


Enter the path to the CSV file:  covid.csv
Enter the name of the target column:  has_covid
Enter 'logistic' for Logistic Regression or 'linear' for Linear Regression:  logistic


Accuracy: 0.45
F1 Score: 0.4403508771929824
