In [12]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

class DataPreprocessor:
    """NumPy-based helpers to load, impute, scale, one-hot encode and split a 2-D dataset.

    Every public method reports progress with ``print`` and, on failure,
    prints the error and returns a fallback value instead of raising
    (``None`` for loading, the unmodified input for transforms, four
    ``None`` values for splitting). None of the transforms modify the
    array passed in by the caller.
    """

    def __init__(self):
        # MinMaxScaler fitted by the most recent 'minmax' normalization.
        self.scaler = None
        # Per-column mean / standard deviation from the most recent
        # 'standard' normalization.
        self.mean = None
        self.std = None

    def load_dataset(self, file_path, delimiter=',', skip_header=0, usecols=None):
        """Load a delimited text file with ``np.genfromtxt``.

        Args:
            file_path: Path (or file-like object) handed to ``np.genfromtxt``.
            delimiter: Field separator.
            skip_header: Number of leading lines to skip, e.g. ``1`` for a
                file whose first line holds column names. A textual header
                that is not skipped makes NumPy infer a string dtype.
            usecols: Optional column selection forwarded to
                ``np.genfromtxt`` (useful to leave out non-numeric columns).

        Returns:
            The loaded array, or ``None`` if loading failed.
        """
        try:
            data = np.genfromtxt(
                file_path,
                delimiter=delimiter,
                dtype=None,
                encoding=None,
                skip_header=skip_header,
                usecols=usecols,
            )
            print("Dataset loaded successfully!")
            print(f"Shape of dataset: {data.shape}")
            return data
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return None

    @staticmethod
    def _column_modes(values):
        """Return the most frequent non-NaN value of each column (NaN if none)."""
        modes = np.full(values.shape[1], np.nan)
        for col_idx in range(values.shape[1]):
            column = values[:, col_idx]
            present = column[~np.isnan(column)]
            if present.size:
                candidates, counts = np.unique(present, return_counts=True)
                # np.unique sorts its output, so ties resolve to the smallest value.
                modes[col_idx] = candidates[np.argmax(counts)]
        return modes

    def handle_missing_values(self, data, strategy='mean'):
        """Replace or drop NaN entries of a 2-D numeric dataset.

        Args:
            data: 2-D array-like convertible to ``float64``.
            strategy: ``'mean'``, ``'median'`` or ``'mode'`` fill each NaN
                with that statistic of its column; ``'zero'`` fills NaN
                with 0.0; ``'drop'`` removes every row containing a NaN.

        Returns:
            A new ``float64`` array with missing values handled. If the
            conversion fails or the strategy is unknown, the error is
            printed and ``data`` is returned unchanged.
        """
        try:
            # Always work on a float64 copy so the caller's array is never
            # modified, even when it is already float64.
            values = np.array(data, dtype=np.float64)

            if strategy in ('mean', 'median', 'mode'):
                if strategy == 'mean':
                    fill = np.nanmean(values, axis=0)
                elif strategy == 'median':
                    fill = np.nanmedian(values, axis=0)
                else:
                    fill = self._column_modes(values)
                rows, cols = np.where(np.isnan(values))
                values[rows, cols] = np.take(fill, cols)
            elif strategy == 'zero':
                # Only NaN counts as missing; infinities are left untouched.
                values[np.isnan(values)] = 0.0
            elif strategy == 'drop':
                values = values[~np.isnan(values).any(axis=1)]
            else:
                raise ValueError("Invalid strategy for handling missing values")

            print(f"Missing values handled using '{strategy}' strategy")
            return values
        except Exception as e:
            print(f"Error handling missing values: {e}")
            return data

    def normalize_data(self, data, method='minmax'):
        """Scale the dataset column-wise.

        Args:
            data: Numeric array to scale.
            method: ``'minmax'`` fits a ``MinMaxScaler`` (kept in
                ``self.scaler``); ``'standard'`` subtracts the column mean
                and divides by the column standard deviation (kept in
                ``self.mean`` / ``self.std``).

        Returns:
            The scaled array, or ``data`` unchanged if scaling failed.
        """
        try:
            if method == 'minmax':
                self.scaler = MinMaxScaler()
                normalized_data = self.scaler.fit_transform(data)
            elif method == 'standard':
                self.mean = np.mean(data, axis=0)
                self.std = np.std(data, axis=0)
                # A constant column has std 0; divide by 1 instead so it
                # maps to 0.0 rather than NaN.
                safe_std = np.where(self.std == 0, 1.0, self.std)
                normalized_data = (data - self.mean) / safe_std
            else:
                raise ValueError("Invalid normalization method")

            print(f"Data normalized using '{method}' method")
            return normalized_data
        except Exception as e:
            print(f"Error normalizing data: {e}")
            return data

    def encode_categorical(self, data, column_index):
        """One-hot encode one column of a 2-D array.

        The column at ``column_index`` is removed and one indicator column
        per distinct value (in ``np.unique`` order) is appended on the right.

        Returns:
            The re-assembled array, or ``data`` unchanged if encoding failed.
        """
        try:
            categorical_col = data[:, column_index]
            unique_categories = np.unique(categorical_col)

            # Broadcast comparison: entry (i, j) is 1.0 where row i holds category j.
            encoded_cols = (
                categorical_col[:, None] == unique_categories[None, :]
            ).astype(np.float64)

            # Build the result under a new name so a failure below returns
            # the untouched input rather than a half-processed array.
            remaining = np.delete(data, column_index, axis=1)
            encoded_data = np.hstack((remaining, encoded_cols))

            print(f"Categorical column at index {column_index} encoded successfully")
            return encoded_data
        except Exception as e:
            print(f"Error encoding categorical data: {e}")
            return data

    def split_dataset(self, data, target_column_index, test_size=0.2, random_state=None):
        """Shuffle the rows and split them into train/test features and targets.

        Args:
            data: 2-D NumPy array; it is not modified.
            target_column_index: Column used as the target ``y``; all
                other columns form ``X``.
            test_size: Fraction of rows (between 0 and 1) placed in the
                test set.
            random_state: Optional seed for a reproducible shuffle. When
                ``None`` the global ``np.random`` state is used.

        Returns:
            ``(X_train, X_test, y_train, y_test)``, or four ``None`` values
            if the split failed.
        """
        try:
            if not 0.0 <= test_size <= 1.0:
                raise ValueError("test_size must be between 0 and 1")

            n_rows = len(data)
            if random_state is None:
                order = np.random.permutation(n_rows)
            else:
                order = np.random.default_rng(random_state).permutation(n_rows)
            # Index with a permutation instead of shuffling in place, so the
            # caller's array keeps its row order.
            shuffled = data[order]

            X = np.delete(shuffled, target_column_index, axis=1)
            y = shuffled[:, target_column_index]

            split_idx = int((1 - test_size) * n_rows)

            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]

            print(f"Dataset split into training ({len(X_train)} samples) and testing ({len(X_test)} samples) sets")
            return X_train, X_test, y_train, y_test
        except Exception as e:
            print(f"Error splitting dataset: {e}")
            return None, None, None, None

# Example usage
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = DataPreprocessor()

    # Load dataset (replace with your actual file path).
    # The pipeline below expects a CSV made of numerical values only.
    # NOTE(review): the recorded run of this cell failed with "could not
    # convert string to float: 'Date'", so this file appears to contain a
    # text header and/or a date column - confirm and clean it before use.
    data = preprocessor.load_dataset('dataset/HistoricalPrices.csv')

    if data is not None:
        # Handle missing values
        data = preprocessor.handle_missing_values(data, strategy='mean')

        # Normalize data
        data = preprocessor.normalize_data(data, method='standard')

        # If you have categorical data (uncomment and modify as needed)
        # data = preprocessor.encode_categorical(data, column_index=0)

        # Split dataset into features and target (assuming last column is target)
        X_train, X_test, y_train, y_test = preprocessor.split_dataset(
            data, target_column_index=-1, test_size=0.3)

        # split_dataset returns four None values when it fails; guard the
        # report so that case does not end in an AttributeError on `.shape`.
        if X_train is None:
            print("\nDataset could not be split; no shapes to report.")
        else:
            # Print shapes of resulting arrays
            print("\nFinal dataset shapes:")
            print(f"X_train: {X_train.shape}")
            print(f"X_test: {X_test.shape}")
            print(f"y_train: {y_train.shape}")
            print(f"y_test: {y_test.shape}")

Dataset loaded successfully!
Shape of dataset: (60, 5)
Error handling missing values: could not convert string to float: 'Date'
Error normalizing data: ufunc 'add' did not contain a loop with signature matching types (dtype('<U9'), dtype('<U9')) -> None
Dataset split into training (42 samples) and testing (18 samples) sets

Final dataset shapes:
X_train: (42, 4)
X_test: (18, 4)
y_train: (42,)
y_test: (18,)


In [2]:
def load_dataset(self, file_path, delimiter=',', skip_header=0, usecols=None):
    """Load a delimited text file with ``np.genfromtxt``.

    Args:
        self: Unused; kept so the function can be attached to a class.
        file_path: Path (or file-like object) handed to ``np.genfromtxt``.
        delimiter: Field separator.
        skip_header: Number of leading lines to skip, e.g. ``1`` for a file
            whose first line holds column names. A textual header that is
            not skipped makes NumPy infer a string dtype.
        usecols: Optional column selection forwarded to ``np.genfromtxt``
            (useful to leave out non-numeric columns).

    Returns:
        The loaded array, or ``None`` if loading failed (the error is printed).
    """
    try:
        data = np.genfromtxt(
            file_path,
            delimiter=delimiter,
            dtype=None,
            encoding=None,
            skip_header=skip_header,
            usecols=usecols,
        )
        print("Dataset loaded successfully!")
        print(f"Shape of dataset: {data.shape}")
        return data
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

In [3]:
def handle_missing_values(self, data, strategy='mean'):
    """Replace or drop NaN entries of a 2-D numeric dataset.

    Args:
        self: Unused; kept so the function can be attached to a class.
        data: 2-D array-like convertible to ``float64``.
        strategy: ``'mean'``, ``'median'`` or ``'mode'`` fill each NaN with
            that statistic of its column; ``'zero'`` fills NaN with 0.0;
            ``'drop'`` removes every row containing a NaN.

    Returns:
        A new ``float64`` array with missing values handled. If the
        conversion fails or the strategy is unknown, the error is printed
        and ``data`` is returned unchanged.
    """
    try:
        # Always work on a float64 copy so the caller's array is never
        # modified, even when it is already float64.
        values = np.array(data, dtype=np.float64)

        if strategy in ('mean', 'median', 'mode'):
            if strategy == 'mean':
                fill = np.nanmean(values, axis=0)
            elif strategy == 'median':
                fill = np.nanmedian(values, axis=0)
            else:
                # Most frequent non-NaN value per column; a column with no
                # observed values keeps NaN as its fill value.
                fill = np.full(values.shape[1], np.nan)
                for col_idx in range(values.shape[1]):
                    column = values[:, col_idx]
                    present = column[~np.isnan(column)]
                    if present.size:
                        candidates, counts = np.unique(present, return_counts=True)
                        # np.unique sorts, so ties resolve to the smallest value.
                        fill[col_idx] = candidates[np.argmax(counts)]
            rows, cols = np.where(np.isnan(values))
            values[rows, cols] = np.take(fill, cols)
        elif strategy == 'zero':
            # Only NaN counts as missing; infinities are left untouched.
            values[np.isnan(values)] = 0.0
        elif strategy == 'drop':
            values = values[~np.isnan(values).any(axis=1)]
        else:
            raise ValueError("Invalid strategy for handling missing values")

        print(f"Missing values handled using '{strategy}' strategy")
        return values
    except Exception as e:
        print(f"Error handling missing values: {e}")
        return data

In [4]:
def normalize_data(self, data, method='minmax'):
    """Scale the dataset column-wise.

    Args:
        self: Object that receives the fitted state (``scaler`` for
            'minmax'; ``mean`` and ``std`` for 'standard').
        data: Numeric array to scale.
        method: ``'minmax'`` fits a ``MinMaxScaler``; ``'standard'``
            subtracts the column mean and divides by the column standard
            deviation.

    Returns:
        The scaled array, or ``data`` unchanged if scaling failed (the
        error is printed).
    """
    try:
        if method == 'minmax':
            self.scaler = MinMaxScaler()
            normalized_data = self.scaler.fit_transform(data)
        elif method == 'standard':
            self.mean = np.mean(data, axis=0)
            self.std = np.std(data, axis=0)
            # A constant column has std 0; divide by 1 instead so it maps
            # to 0.0 rather than NaN.
            safe_std = np.where(self.std == 0, 1.0, self.std)
            normalized_data = (data - self.mean) / safe_std
        else:
            raise ValueError("Invalid normalization method")
        print(f"Data normalized using '{method}' method")
        return normalized_data
    except Exception as e:
        print(f"Error normalizing data: {e}")
        return data

In [5]:
def encode_categorical(self, data, column_index):
    """
    Replace one column of a 2-D array with its one-hot representation.

    The selected column is removed and one indicator column per distinct
    value (ordered as returned by ``np.unique``) is appended on the right.
    If anything goes wrong the error is printed and ``data`` is returned.
    """
    try:
        source_col = data[:, column_index]
        levels = np.unique(source_col)

        # Compare every row against every level at once; entry (i, j) is
        # 1.0 where row i holds level j and 0.0 elsewhere.
        indicators = (source_col[:, None] == levels[None, :]).astype(np.float64)

        # Drop the raw column, then glue the indicator block on the right.
        data = np.delete(data, column_index, axis=1)
        data = np.hstack((data, indicators))

        print(f"Categorical column at index {column_index} encoded successfully")
        return data
    except Exception as e:
        print(f"Error encoding categorical data: {e}")
        return data

In [6]:
def split_dataset(self, data, target_column_index, test_size=0.2, random_state=None):
    """Shuffle the rows and split them into train/test features and targets.

    Args:
        self: Unused; kept so the function can be attached to a class.
        data: 2-D NumPy array; it is not modified.
        target_column_index: Column used as the target ``y``; all other
            columns form ``X``.
        test_size: Fraction of rows (between 0 and 1) placed in the test set.
        random_state: Optional seed for a reproducible shuffle. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, or four ``None`` values if
        the split failed (the error is printed).
    """
    try:
        if not 0.0 <= test_size <= 1.0:
            raise ValueError("test_size must be between 0 and 1")

        n_rows = len(data)
        if random_state is None:
            order = np.random.permutation(n_rows)
        else:
            order = np.random.default_rng(random_state).permutation(n_rows)
        # Index with a permutation instead of shuffling in place, so the
        # caller's array keeps its row order.
        shuffled = data[order]

        # Split features and target
        X = np.delete(shuffled, target_column_index, axis=1)
        y = shuffled[:, target_column_index]

        # Calculate split index
        split_idx = int((1 - test_size) * n_rows)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        print(f"Dataset split into training ({len(X_train)} samples) and testing ({len(X_test)} samples) sets")
        return X_train, X_test, y_train, y_test
    except Exception as e:
        print(f"Error splitting dataset: {e}")
        return None, None, None, None

In [8]:
# Example usage
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = DataPreprocessor()

    # Load dataset (replace with your actual file path).
    # The pipeline below expects a CSV made of numerical values only.
    data = preprocessor.load_dataset('dataset/HistoricalPrices.csv')

    if data is not None:
        # Handle missing values
        data = preprocessor.handle_missing_values(data, strategy='mean')

        # Normalize data
        data = preprocessor.normalize_data(data, method='standard')

        # If you have categorical data (uncomment and modify as needed)
        # data = preprocessor.encode_categorical(data, column_index=0)

        # Split dataset into features and target (assuming last column is target)
        X_train, X_test, y_train, y_test = preprocessor.split_dataset(
            data, target_column_index=-1, test_size=0.3)

        # split_dataset returns four None values when it fails; guard the
        # report so that case does not end in an AttributeError on `.shape`.
        if X_train is None:
            print("\nDataset could not be split; no shapes to report.")
        else:
            # Print shapes of resulting arrays
            print("\nFinal dataset shapes:")
            print(f"X_train: {X_train.shape}")
            print(f"X_test: {X_test.shape}")
            print(f"y_train: {y_train.shape}")
            print(f"y_test: {y_test.shape}")

AttributeError: 'DataPreprocessor' object has no attribute 'load_dataset'