In [1]:
import pandas as pd

In [2]:
import numpy as np

In [19]:
from sklearn.preprocessing import LabelEncoder

In [5]:
def read_csv(file_path):
    """
    Load a CSV file into a DataFrame.

    Parameters:
        file_path (str): Path to the CSV file.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file, read with
        pandas' default options.
    """
    frame = pd.read_csv(file_path)
    return frame

In [6]:
def read_excel(file_path, sheet_name=0):
    """
    Load one sheet of an Excel workbook into a DataFrame.

    Parameters:
        file_path (str): Path to the Excel file.
        sheet_name (str or int, default 0): Name or index of the sheet to read.

    Returns:
        pandas.DataFrame: DataFrame containing the data from the Excel file.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    return frame

In [7]:
def read_json(file_path):
    """
    Load a JSON file into a DataFrame.

    Parameters:
        file_path (str): Path to the JSON file.

    Returns:
        pandas.DataFrame: The parsed contents of the JSON file, read with
        pandas' default options.
    """
    frame = pd.read_json(file_path)
    return frame

In [12]:
def generate_summary(data):
    """
    Generate key statistical summaries of the data.

    Parameters:
        data (array-like): The input data; must contain at least one value.

    Returns:
        dict: A dictionary with the keys 'mean', 'median', 'std_dev', 'min',
        'max' and 'most_frequent'. When several values are tied for most
        frequent, the first one returned by ``Series.mode()`` is used
        (e.g. 1 for ``[1, 2, 3, 4]``); it is NaN if no mode exists.

    Raises:
        ValueError: If data is empty.
    """
    data_series = pd.Series(data)
    if data_series.empty:
        # Fail early with a clear message instead of NumPy warnings followed
        # by an opaque "zero-size array" error from np.min.
        raise ValueError("Cannot summarize empty data.")

    # mode() drops missing values, so it is empty when every value is NaN.
    modes = data_series.mode()
    most_frequent = modes.iloc[0] if not modes.empty else np.nan

    summary = {
        'mean': np.mean(data),
        'median': np.median(data),
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        'std_dev': np.std(data),
        'min': np.min(data),
        'max': np.max(data),
        'most_frequent': most_frequent,
    }

    # Return the dict (as documented) rather than printing it, so callers can
    # use the values; a notebook cell ending in this call still displays it.
    return summary

In [13]:
# Quick check of generate_summary on a small list of integers.
generate_summary([1,2,3,4])

{'mean': 2.5, 'median': 2.5, 'std_dev': 1.118033988749895, 'min': 1, 'max': 4, 'most_frequent': 1}


In [26]:
def handle_missing_values(df, strategy='mean'):
    """
    Handle missing values in a DataFrame by dropping rows or filling per column.

    Parameters:
        df (pandas.DataFrame): The input frame. It is not modified.
        strategy (str, default 'mean'): How to treat missing values:
            'remove' - drop every row that contains at least one missing value;
            'mean' / 'median' - fill each column with the mean / median of its
                values after coercing them to numeric; columns without any
                numeric value have no fill value and are left unchanged;
            'mode' - fill each column with its most frequent non-missing
                value (this also works for non-numeric columns).

    Returns:
        pandas.DataFrame: A new DataFrame with the missing values handled.

    Raises:
        ValueError: If strategy is not one of the supported names.
    """
    if strategy == 'remove':
        return df.dropna()

    if strategy in ('mean', 'median'):
        # Coerce so that non-numeric cells become NaN and do not break the
        # aggregation; the result is one fill value per column.
        numeric = df.apply(pd.to_numeric, errors='coerce')
        fill_values = getattr(numeric, strategy)()
        return df.fillna(fill_values)

    if strategy == 'mode':
        # DataFrame.mode() returns a DataFrame (one row per tied mode).
        # Passing that frame straight to fillna aligns on row labels and so
        # only fills NaNs in rows the mode frame happens to contain. Take the
        # first row to get one fill value per column instead.
        modes = df.mode(dropna=True)
        if modes.empty:
            # No non-missing value anywhere: nothing to fill with.
            return df.copy()
        return df.fillna(modes.iloc[0])

    raise ValueError("Invalid strategy. Choose from 'mean', 'median', 'mode', or 'remove'.")

In [15]:
# Small example with gaps in both numeric columns and in the text column.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[10, np.nan, 30, np.nan, 50],
    C=['foo', 'bar', 'baz', np.nan, 'qux'],
)

In [16]:
# Build the example frame from the `data` dict (one column per key).
df = pd.DataFrame(data)

In [17]:
# Display the frame so the missing entries are visible before handling them.
df

Unnamed: 0,A,B,C
0,1.0,10.0,foo
1,2.0,,bar
2,,30.0,baz
3,4.0,,
4,5.0,50.0,qux


In [18]:
# 'remove' drops every row that has at least one missing value; the result is
# stored under a new name so the original `df` stays available.
df_handled_remove = handle_missing_values(df, strategy='remove')
print("\nDataFrame with missing values removed:")
print(df_handled_remove)


DataFrame with missing values removed:
     A     B    C
0  1.0  10.0  foo
4  5.0  50.0  qux


In [20]:
def label_encode(df, columns):
    """
    Label-encode the given columns of a DataFrame.

    Parameters:
        df (pandas.DataFrame): The input frame. It is not modified.
        columns (iterable of str): Names of the columns to encode.

    Returns:
        tuple: ``(df_encoded, label_encoders)`` where ``df_encoded`` is a copy
        of ``df`` with each listed column replaced by its encoded values, and
        ``label_encoders`` maps each column name to the fitted LabelEncoder
        used for it.
    """
    # One fresh encoder per requested column, keyed by column name.
    encoders = {name: LabelEncoder() for name in columns}

    result = df.copy()
    for name, encoder in encoders.items():
        # Fit on the original column and write the codes into the copy.
        result[name] = encoder.fit_transform(df[name])
    return result, encoders

In [21]:
# Two categorical columns used to demonstrate label encoding.
# Note: this rebinds `data`, replacing the missing-value example dict.
data = dict(
    color=['red', 'blue', 'green', 'green', 'red'],
    size=['small', 'medium', 'large', 'medium', 'large'],
)

In [22]:
# Rebuild `df` from the categorical `data` dict; this replaces the earlier
# missing-value example frame bound to the same name.
df = pd.DataFrame(data)

In [25]:
# Display the categorical frame before encoding.
df

Unnamed: 0,color,size
0,red,small
1,blue,medium
2,green,large
3,green,medium
4,red,large


In [23]:
# Encode both categorical columns; the fitted encoders are kept in
# `label_encoders`, keyed by column name.
encoded_df, label_encoders = label_encode(df, columns=['color', 'size'])
print("Label Encoded DataFrame:")
print(encoded_df)

Label Encoded DataFrame:
   color  size
0      2     2
1      0     1
2      1     0
3      1     1
4      2     0
