# Practice activity: Apply the preprocessing tool to a dummy dataset for ML application

In [2]:
import pandas as pd
import numpy as np

# Create a dummy dataset
np.random.seed(0)
dummy_data = {
    'Feature1': np.random.normal(100, 10, 100).tolist() + [np.nan, 200],  # Normally distributed with an outlier
    'Feature2': np.random.randint(0, 100, 102).tolist(),  # Random integers
    'Category': ['A', 'B', 'C', 'D'] * 25 + [np.nan, 'A'],  # Categorical with some missing values
    'Target': np.random.choice([0, 1], 102).tolist()  # Binary target variable
}

# Convert the dictionary to a DataFrame
df_dummy = pd.DataFrame(dummy_data)

# Display the first few rows of the dummy dataset
print(df_dummy.head())

     Feature1  Feature2 Category  Target
0  117.640523        32        A       1
1  104.001572        70        B       1
2  109.787380        85        C       0
3  122.408932        31        D       1
4  118.675580        13        A       0


In [6]:
print(df_dummy.tail())

       Feature1  Feature2 Category  Target
97   117.858705        35        B       1
98   101.269121        30        C       1
99   104.019894        29        D       1
100         NaN        33      NaN       1
101  200.000000        18        A       0


In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
#import missingno as msno

In [4]:
def load_data(df):
    return df

def handle_missing_values(df):
    return df.fillna(df.mean())  # For numeric data, fill missing values with the mean

def remove_outliers(df):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
    return df[(z_scores < 3).all(axis=1)]  # Remove rows with any outliers

def scale_data(df):
    scaler = StandardScaler()
    df[df.select_dtypes(include=[np.number]).columns] = scaler.fit_transform(df.select_dtypes(include=[np.number]))
    return df

def encode_categorical(df, categorical_columns):
    return pd.get_dummies(df, columns=categorical_columns)

def save_data(df, output_filepath):
    df.to_csv(output_filepath, index=False)

In [9]:
df_preprocessed['Feature1'].mean()

101.58225757954938

In [5]:
# Load the data
df_preprocessed = load_data(df_dummy)

# Handle missing values
df_preprocessed = handle_missing_values(df_preprocessed)

# Remove outliers
df_preprocessed = remove_outliers(df_preprocessed)

# Scale the data
df_preprocessed = scale_data(df_preprocessed)

# Encode categorical variables
df_preprocessed = encode_categorical(df_preprocessed, ['Category'])

# Display the preprocessed data
print(df_preprocessed.head())

TypeError: can only concatenate str (not "int") to str

In [None]:
# Save the cleaned and preprocessed DataFrame to a CSV file
save_data(df_preprocessed, 'preprocessed_dummy_data.csv')

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')

In [10]:
print(df_preprocessed.isnull().sum())

Feature1    1
Feature2    0
Category    1
Target      0
dtype: int64


In [11]:
print(df_preprocessed.describe())

         Feature1    Feature2      Target
count  101.000000  102.000000  102.000000
mean   101.582258   45.754902    0.529412
std     14.121324   27.154649    0.501599
min     74.470102    0.000000    0.000000
25%     93.656779   27.250000    0.000000
50%    101.216750   40.500000    1.000000
75%    107.610377   69.000000    1.000000
max    200.000000   97.000000    1.000000
