In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
class DataCleaner:
    def __init__(self, data): # parameters to put when I instantiate an object eg. DataCleaner(data)
        self.data = data      # this means value (data) that I put after (self, ) gets assigned to self.data attribute
                              # self.data in each method is the attribute 
                            
    def remove_duplicates(self):
        # Remove duplicate rows from the dataset
        self.data = self.data.drop_duplicates()    # self.data attribute taken from init function 

    def handle_missing_values(self, strategy='mean'):
        for column in self.data.columns:
            if self.data[column].dtype in [np.float64, np.int64]:
                if strategy == 'mean':
                    # Fill missing values with the mean of the column
                    self.data[column] = self.data[column].fillna(self.data[column].mean())
                elif strategy == 'median':
                    # Fill missing values with the median of the column
                    self.data[column] = self.data[column].fillna(self.data[column].median())
                elif strategy == 'mode':
                    # Fill missing values with the mode of the column
                    self.data[column] = self.data[column].fillna(self.data[column].mode().iloc[0])
                else:
                    raise ValueError("Invalid missing value strategy for numeric columns")

    def remove_outliers(self, column, threshold=3):
        # Remove outliers from the specified column based on z-scores
        z_scores = np.abs((self.data[column] - self.data[column].mean()) / self.data[column].std())
        self.data = self.data[z_scores < threshold]

    def convert_data_types(self, column_types):
        # Convert the data types of columns as specified
        self.data = self.data.astype(column_types)

    def clean_data(self, drop_duplicates=True, handle_missing=True, remove_outliers=True, convert_types=None, outlier_column=None):
        if drop_duplicates:
            # Call the remove_duplicates method to eliminate duplicate rows
            self.remove_duplicates()
        if handle_missing:
            # Handle missing values based on the specified strategy (default is 'mean' if not provided)
            self.handle_missing_values()
        if remove_outliers and outlier_column:
            # Remove outliers for the specified column (if both remove_outliers and outlier_column are provided)
            self.remove_outliers(column=outlier_column)  
        if convert_types:
            # Convert column data types as specified in the column_types parameter
            self.convert_data_types(convert_types)

    def get_cleaned_data(self):
        # Return the cleaned dataset
        return self.data

In [4]:
if __name__ == "__main__":
    # Example usage:
    # Load your dataset as a pandas DataFrame
    data = pd.read_csv('f500.csv')

    # Create an instance of DataCleaner
    cleaner = DataCleaner(data)

    # Clean the data
    cleaner.clean_data()l

    # Get the cleaned data
    cleaned_data = cleaner.get_cleaned_data()
    
    cleaned_data.to_csv('f500.csv')

SyntaxError: invalid syntax (<ipython-input-4-e0146b924e08>, line 10)