In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

def process_data(file_path, target_column):
    """Load an Excel dataset and return a standardised train/test split.

    Pipeline: read the workbook, drop the ``Class`` label column (unless it
    is the requested target), mean-impute numeric gaps, label-encode
    object-typed columns, split 80/20 with a fixed seed, then standardise
    the features using statistics learned from the training rows only.

    Parameters
    ----------
    file_path : str or path-like
        Excel workbook readable by ``pandas.read_excel``.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Standardised feature matrices.
    y_train, y_test : pandas.Series
        Target values for the corresponding feature rows.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of the loaded data.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Drop the 'Class' label column only when it exists and is not the
    # column the caller asked to predict; otherwise the drop would either
    # raise or silently remove the target.
    if 'Class' in data.columns and target_column != 'Class':
        data = data.drop(columns=['Class'])

    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in dataset")

    # Display basic information about the dataset
    print("Dataset head:")
    print(data.head())
    print("\nDataset info:")
    # info() writes its report to stdout and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    data.info()

    # Handling missing values: fill numeric gaps with the column mean.
    # numeric_only=True keeps this working when object columns are present.
    data = data.fillna(data.mean(numeric_only=True))

    # Encoding categorical columns as integer codes.
    # NOTE(review): missing values in object columns are not imputed above;
    # confirm the encoder copes with them for your data.
    categorical_columns = data.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Separate features and target variable
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Split first so that the scaler never sees the test rows. The row
    # assignment is the same as splitting after scaling because the seed and
    # the number of rows are unchanged.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Feature scaling: learn mean/std on the training rows only, then apply
    # the same transform to the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print("\nData processing complete.")
    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")

    return X_train, X_test, y_train, y_test

# Example usage: runs the preprocessing pipeline on the Dry Bean workbook.
if __name__ == "__main__":
    # Replace these with your own dataset file path and target column name.
    file_path, target_column = 'Dry_Bean_Dataset.xlsx', 'Compactness'

    X_train, X_test, y_train, y_test = process_data(
        file_path=file_path,
        target_column=target_column,
    )


Dataset head:
    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272750  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   0.947849   
3      0.498616       30724     195.467062  0.782681  0.976696   0.903936   
4      0.333680       30417     195.896503  0.773098  0.990893   0.984877   

   Compactness  ShapeFactor1  ShapeFactor2  ShapeFactor3  ShapeFactor4  
0     0.913358      0.0