In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Reading:
- Objective: Implement functions that can read data from different file formats such as CSV, Excel, and JSON.
- Tools: Use Pandas for efficient data importing.

In [45]:
def csv(self=None, filePath=None):
    """Read a CSV source into a DataFrame via ``pd.read_csv``.

    This is a module-level function, not a method; ``self`` is a leftover
    placeholder. It is kept (now optional) so existing ``csv(None, path)``
    calls keep working, while ``csv(path)`` and ``csv(filePath=path)`` now
    work as well.

    Note: the name shadows the standard-library ``csv`` module in this
    namespace.

    Args:
        self: Ignored placeholder. When ``filePath`` is omitted, this value
            is used as the path instead.
        filePath: Whatever ``pd.read_csv`` accepts as its first argument.

    Returns:
        The object returned by ``pd.read_csv``.

    Raises:
        TypeError: If no path is supplied at all.
    """
    if filePath is None:
        # Single-argument call: the lone positional value is the path.
        filePath = self
    if filePath is None:
        raise TypeError("csv() missing required argument: 'filePath'")
    return pd.read_csv(filePath)
    
def exel(self=None, filePath=None):
    """Read an Excel source into a DataFrame via ``pd.read_excel``.

    This is a module-level function, not a method; ``self`` is a leftover
    placeholder. It is kept (now optional) so existing ``exel(None, path)``
    calls keep working, while ``exel(path)`` and ``exel(filePath=path)`` now
    work as well.

    Note: the function name is a misspelling of "excel"; it is left as-is so
    existing callers do not break.

    Args:
        self: Ignored placeholder. When ``filePath`` is omitted, this value
            is used as the path instead.
        filePath: Whatever ``pd.read_excel`` accepts as its first argument.

    Returns:
        The object returned by ``pd.read_excel``.

    Raises:
        TypeError: If no path is supplied at all.
    """
    if filePath is None:
        # Single-argument call: the lone positional value is the path.
        filePath = self
    if filePath is None:
        raise TypeError("exel() missing required argument: 'filePath'")
    return pd.read_excel(filePath)
    
def json(self=None, filePath=None):
    """Read a JSON source into a pandas object via ``pd.read_json``.

    This is a module-level function, not a method; ``self`` is a leftover
    placeholder. It is kept (now optional) so existing ``json(None, path)``
    calls keep working, while ``json(path)`` and ``json(filePath=path)`` now
    work as well.

    Note: the name shadows the standard-library ``json`` module in this
    namespace.

    Args:
        self: Ignored placeholder. When ``filePath`` is omitted, this value
            is used as the path instead.
        filePath: Whatever ``pd.read_json`` accepts as its first argument.

    Returns:
        The object returned by ``pd.read_json``.

    Raises:
        TypeError: If no path is supplied at all.
    """
    if filePath is None:
        # Single-argument call: the lone positional value is the path.
        filePath = self
    if filePath is None:
        raise TypeError("json() missing required argument: 'filePath'")
    return pd.read_json(filePath)

## 2. Data Summary:
- Objective: Develop functions to print key statistical summaries of the data, including metrics like the average and most frequent values.
- Tools: Utilize NumPy and Pandas to generate these summaries.

In [41]:
def summary(df):
    """Print the average and the most frequent value(s) of ``df``.

    Args:
        df: A DataFrame, a Series, or a one-dimensional sequence that
            ``pd.Series`` can wrap.

    Returns:
        None. The summary is written to standard output.

    For a DataFrame the average is computed per numeric column and the modes
    per column; for a Series or sequence the average is a single value.
    """
    # Wrap plain sequences so the same pandas API applies below; DataFrames
    # and Series are used directly (a DataFrame cannot be wrapped in a Series).
    data = df if isinstance(df, (pd.DataFrame, pd.Series)) else pd.Series(df)

    if isinstance(data, pd.DataFrame):
        # Restrict to numeric columns so text columns do not break the mean.
        average = data.mean(numeric_only=True)
    else:
        average = data.mean()

    stats = {
        'Average': average,
        'Most frequent values': data.mode(),
    }
    print("Statistical Summary:")
    for key, value in stats.items():
        print(f"{key}: {value}")

## 3. Handling Missing Values:
- Objective: Create functions for addressing missing values, offering solutions to either remove or impute them based on set strategies.
- Tools: Employ methods that ensure data integrity.

In [44]:
class Missing:
    """Remove or impute missing values in a DataFrame.

    The frame passed in is never modified in place: every operation rebinds
    ``self.df`` to a new frame. Retrieve the result with
    ``get_processed_data``.
    """

    def __init__(self, df):
        """Store the DataFrame to be processed."""
        self.df = df

    def remove(self, threshold=0.5):
        """Drop columns whose fraction of missing values exceeds ``threshold``.

        Args:
            threshold: Maximum tolerated share of missing values per column
                (0.5 means columns more than half empty are dropped).
        """
        missing_ratios = self.df.isnull().mean()
        columns_to_drop = missing_ratios[missing_ratios > threshold].index
        self.df = self.df.drop(columns=columns_to_drop)

    def impute(self, strategy, fill_value=0):
        """Fill missing values according to ``strategy``.

        Args:
            strategy: One of ``'mean'``, ``'median'``, ``'mode'`` or
                ``'constant'``.
            fill_value: Replacement used by the ``'constant'`` strategy.
                Defaults to 0.

        Raises:
            ValueError: If ``strategy`` is not one of the supported names.
        """
        if strategy == 'mean':
            # numeric_only keeps text columns from breaking the reduction;
            # non-numeric columns are simply left unfilled.
            replacement = self.df.mean(numeric_only=True)
        elif strategy == 'median':
            replacement = self.df.median(numeric_only=True)
        elif strategy == 'mode':
            modes = self.df.mode()
            if modes.empty:
                # No column has any observed value, so there is nothing to
                # fill with; leave the frame untouched instead of raising.
                return
            # The first row holds one mode per column (ties resolved by order).
            replacement = modes.iloc[0]
        elif strategy == 'constant':
            replacement = fill_value
        else:
            raise ValueError("Invalid imputation strategy. Choose from 'mean', 'median', 'mode', 'constant', etc.")
        self.df = self.df.fillna(replacement)

    def get_processed_data(self):
        """Return the current (processed) DataFrame."""
        return self.df

## 4. Categorical Data Encoding:
- Objective: Design functions for encoding categorical data, allowing their conversion into numerical formats for analysis.
- Tools: Implement encoding techniques effectively.

In [46]:
def one_hot_encode(df, columns=None, drop_first=True):
    """One-hot encode categorical columns of ``df`` with ``pd.get_dummies``.

    Args:
        df: DataFrame to encode.
        columns: Column name or list of column names to encode. When
            ``None``, column selection is delegated to ``pd.get_dummies``,
            which picks up object, string and category columns rather than
            object columns only.
        drop_first: Whether to drop the first level of each encoded column.
            Defaults to ``True``.

    Returns:
        A new DataFrame with the selected columns replaced by indicator
        columns.
    """
    if isinstance(columns, str):
        # get_dummies requires a list-like; accept a single column name too.
        columns = [columns]
    return pd.get_dummies(df, columns=columns, drop_first=drop_first)

def label_encode(series):
    """Map each distinct value in ``series`` to an integer code.

    Uses ``pd.factorize``, which numbers values in order of first appearance.
    Only the codes are returned; the array of unique values is discarded.
    """
    codes = pd.factorize(series)[0]
    return codes