In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
def preprocess_data(data):
    """
    Split a dataframe into features and target and make the features numeric.

    Processing steps:
    1. Every column except 'y' becomes a feature; 'y' is returned as-is.
    2. In int64/float64 feature columns, 0 is treated as a missing value
       (replaced by NaN) and missing values are filled with the column mean.
       A column without a single observed value has no mean to fill with and
       is left entirely NaN.
    3. Object-dtype feature columns are label-encoded, each column with its
       own LabelEncoder.

    Parameters:
    data (pandas dataframe): The dataframe to preprocess; must contain a 'y' column

    Returns:
    tuple: (X, y) - the preprocessed feature dataframe and the 'y' column of data

    Raises:
    KeyError: If data has no 'y' column
    """
    # Separate the features (X) and the target variable (y).
    # drop() returns a new dataframe, so the edits below do not touch `data`.
    X = data.drop('y', axis=1)
    y = data['y']

    # Handle NaN or 0 values in numeric columns
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
    # Guard: fitting the imputer on a frame with no columns raises an error,
    # so skip the whole step when there is nothing numeric to impute.
    if len(numeric_cols) > 0:
        X[numeric_cols] = X[numeric_cols].replace(0, np.nan)

        # With its default settings the mean imputer drops columns that have
        # no observed value, which would make the assignment below fail on a
        # column-count mismatch. Impute only the columns that have a mean.
        has_observed = X[numeric_cols].notna().any()
        imputable_cols = has_observed[has_observed].index
        if len(imputable_cols) > 0:
            imputer = SimpleImputer(strategy='mean')
            X[imputable_cols] = imputer.fit_transform(X[imputable_cols])

    # Convert text data to numeric representations
    text_cols = X.select_dtypes(include=['object']).columns
    for col in text_cols:
        # A fresh encoder per column: the codes of one column are unrelated
        # to the codes of any other column.
        label_encoder = LabelEncoder()
        X[col] = label_encoder.fit_transform(X[col])

    return X, y

In [3]:
def encode_text_column_using_label_encoding(df, column_name):
    """
    Label-encode a single column of a dataframe.

    A new LabelEncoder is fitted on the column and the column is overwritten
    with the encoder's output. The dataframe passed in is modified directly,
    and that same object is returned.

    Parameters:
    df (pandas dataframe): The dataframe holding the column; modified in place
    column_name (str): The name of the column to encode

    Returns:
    pandas dataframe: The same dataframe object, with the column encoded
    """
    # Fit a fresh encoder on the column and overwrite the column with its codes
    df[column_name] = LabelEncoder().fit_transform(df[column_name])
    return df


In [None]:
# Load the dataset from the CSV file and show which columns it contains
data = pd.read_csv('data.csv')
column_names = list(data.columns)
print(column_names)

In [None]:
# Label-encode the target column 'y', then split the frame into features and target
data = encode_text_column_using_label_encoding(df=data, column_name="y")
X, y = preprocess_data(data=data)