In [2]:
import pandas as pd
import os

class DataReader:
    """Load a tabular file into a pandas DataFrame and apply basic cleaning.

    The loaded frame is stored on ``self.data`` (``None`` until
    ``read_data`` succeeds). Every method reports problems by printing a
    message instead of raising, so callers can chain the steps in a
    notebook without wrapping each call in ``try``/``except``.
    """

    # Maps each imputation strategy name to a function that computes the
    # fill value from a Series that has at least one non-null entry.
    _IMPUTERS = {
        'mean': lambda series: series.mean(),
        'median': lambda series: series.median(),
        'mode': lambda series: series.mode().iloc[0],
    }

    def __init__(self, file_path):
        """Remember *file_path*; nothing is read until ``read_data`` is called."""
        self.file_path = file_path
        self.data = None

    def read_data(self):
        """Read ``self.file_path`` into ``self.data`` based on its extension.

        Supported extensions (case-insensitive): ``csv``, ``xls``, ``xlsx``
        and ``json``. On any failure a message is printed and ``self.data``
        is left as it was.
        """
        try:
            # splitext keeps the leading dot; drop it and normalise case once.
            file_extension = os.path.splitext(self.file_path)[1][1:].lower()
            if file_extension == 'csv':
                self.data = pd.read_csv(self.file_path)
            elif file_extension in ('xls', 'xlsx'):
                self.data = pd.read_excel(self.file_path)
            elif file_extension == 'json':
                self.data = pd.read_json(self.file_path)
            else:
                raise ValueError("Unsupported file format.")
            print("Data loaded successfully.")
        except FileNotFoundError:
            print("File not found.")
        except Exception as e:
            # Deliberately broad: parser errors vary per format, and the
            # class contract is to report the problem rather than raise.
            print("Error reading file:", e)

    def summary_data(self):
        """Return ``self.data.describe()``.

        Prints a hint and returns ``None`` when no data has been loaded.
        """
        if self.data is None:
            print("No data available. Please read data first.")
            return None
        return self.data.describe()

    @staticmethod
    def _is_text_column(series):
        """Return True for object/string columns, which are never imputed."""
        dtype = series.dtype
        # The object check mirrors the historical ``dtype == 'O'`` test; the
        # string check additionally covers the dedicated pandas string dtype.
        return (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype))

    def handle_missing_values(self, strategy='mean', columns=None, remove=False):
        """Drop or impute missing values in ``self.data``.

        Args:
            strategy: ``'mean'``, ``'median'`` or ``'mode'``; used only when
                *remove* is false.
            columns: Column label or list of labels to process. ``None`` (or
                an empty list) means every column.
            remove: When true, drop rows that have a missing value in
                *columns* instead of imputing.

        Text (object/string) columns are skipped when imputing, and so are
        columns with no missing values or with no observed values at all
        (there is nothing to compute a fill value from). Errors, including
        an invalid *strategy*, are printed rather than raised.
        """
        if self.data is None:
            print("No data available. Please read data first.")
            return

        try:
            if remove:
                self.data.dropna(subset=columns, inplace=True)
                return

            if strategy not in self._IMPUTERS:
                raise ValueError("Invalid imputation strategy.")
            compute_fill = self._IMPUTERS[strategy]

            # A bare string would otherwise be iterated character by character.
            if isinstance(columns, str):
                columns = [columns]
            # Explicit emptiness test: ``columns or ...`` raises on array-likes.
            if columns is None or len(columns) == 0:
                columns = self.data.columns

            for col in columns:
                series = self.data[col]
                if self._is_text_column(series):
                    continue
                missing = series.isna()
                # Nothing to fill, or nothing to derive a fill value from
                # (an all-NaN column has an empty mode and a NaN mean).
                if not missing.any() or missing.all():
                    continue
                # Assign the result back: a chained ``fillna(inplace=True)``
                # works on a temporary and is a no-op under Copy-on-Write.
                self.data[col] = series.fillna(compute_fill(series))
        except Exception as e:
            print("Error handling missing values:", e)

    def categorical_encoding(self):
        """One-hot encode categorical columns of ``self.data`` in place.

        Uses ``pd.get_dummies(..., drop_first=True)``, so the first level of
        each encoded column is dropped. Errors are printed rather than raised.
        """
        if self.data is None:
            print("No data available. Please read data first.")
            return
        try:
            self.data = pd.get_dummies(self.data, drop_first=True)
        except Exception as e:
            print("Error performing categorical encoding:", e)

# --- Demo pipeline: load -> summarise -> impute -> encode -> show ---

# Point the reader at the employee dataset and load it.
data_reader = DataReader('Employee.csv')
data_reader.read_data()

# Descriptive statistics of the freshly loaded data.
summary = data_reader.summary_data()
print("Summary Statistics:\n\n", summary)

# Fill gaps in non-text columns with the column mean, then one-hot encode
# the categorical columns.
data_reader.handle_missing_values('mean')
data_reader.categorical_encoding()

# Show the cleaned, encoded frame.
processed_data = data_reader.data
print("\n\n\nProcessed Data:\n\n", processed_data)


Data loaded successfully.
Summary Statistics:

              Id        BasePay    OvertimePay       OtherPay  Benefits  \
count  10.00000      10.000000      10.000000      10.000000       0.0   
mean    5.50000  167829.954000   60110.435000  136706.906000       NaN   
std     3.02765   67665.935153   77402.791856  117102.804852       NaN   
min     1.00000   77916.000000       0.000000   16452.600000       NaN   
25%     3.25000  122551.900000    2150.250000   42929.797500       NaN   
50%     5.50000  161688.600000   32928.855000  136118.760000       NaN   
75%     7.75000  203787.507500   88387.845000  187370.702500       NaN   
max    10.00000  285262.000000  245131.880000  400184.250000       NaN   

            TotalPay  TotalPayBenefits  
count      10.000000         10.000000  
mean   364647.295000     364647.295000  
std    100265.746605     100265.746605  
min    302377.730000     302377.730000  
25%    309919.857500     309919.857500  
50%    321329.465000     321329.465000 