In [None]:
# This notebook demonstrates some essential methods for preprocessing data before passing it to a machine learning algorithm

In [4]:
# Problem 1: OneHotEncoder Basics
# Prompt: You have a dataset containing a single categorical column 'Fruit' with the following values:
# '['Apple', 'Banana', 'Cherry', 'Apple', 'Cherry', 'Banana']'
# Write a script to use 'OneHotEncoder' to transform this column into a binary matrix

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = ['Apple', 'Banana', 'Cherry', 'Apple', 'Cherry', 'Banana']
df = pd.DataFrame(data, columns=['Fruit'])

# Using OneHotEncoder (the estimator itself rather than pd.get_dummies): the
# fitted encoder keeps the learned categories, so the same encoding can later
# be applied to new data with encoder.transform(...).
# dtype=bool produces True/False indicator columns.
encoder = OneHotEncoder(dtype=bool)

# fit_transform returns a SciPy sparse matrix by default; .toarray() densifies
# it without relying on the version-dependent sparse / sparse_output argument.
encoded = encoder.fit_transform(df[['Fruit']]).toarray()

# get_feature_names_out() builds '<column>_<category>' labels in the same
# order as the encoded columns, so nothing has to be hard-coded.
df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)
df

Unnamed: 0,Fruit_Apple,Fruit_Banana,Fruit_Cherry
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False
4,False,False,True
5,False,True,False


In [10]:
# Problem 2: StandardScaler Basics
# Prompt: You have a dataset containing a single numeric column 'Height' with the following values:
# '[150, 160, 170, 180, 190, 200]'
# Write a script to use 'StandardScaler' to standardize this column

from sklearn.preprocessing import StandardScaler
import pandas as pd

# Single numeric feature to standardize
data = [150, 160, 170, 180, 190, 200]
df = pd.DataFrame({'Height': data})

# Learn the column statistics first, then rescale the same values with them;
# the transformed array is the cell's displayed result
standard_scaler = StandardScaler()
standard_scaler.fit(df)
standard_scaler.transform(df)

array([[-1.46385011],
       [-0.87831007],
       [-0.29277002],
       [ 0.29277002],
       [ 0.87831007],
       [ 1.46385011]])

In [26]:
# Problem 3: Combining OneHotEncoder and StandardScaler
# Prompt: You have a dataset with two columns: 'Age' and 'City'
# The 'Age' column contains numeric values '[23,45,31,35,27]'
# The 'City' column contains categorical values:
# '['New York', 'Los Angelas', 'Chicago', 'New York', 'Chicago']'
# Write a script to apply 'StandardScaler' and 'OneHotEncoder' to the appropriate columns

import pandas as pd
# Imported here so this cell runs on its own instead of depending on an
# earlier cell having imported StandardScaler.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

data = {'Age': [23,45,31,35,27],'City': ['New York', 'Los Angelas', 'Chicago', 'New York', 'Chicago']}
df = pd.DataFrame(data, columns=['Age','City'])

# StandardScaler: standardize the numeric column.
# fit_transform returns an (n_samples, 1) array; ravel() flattens it so it can
# be assigned back as a single column.
scaler = StandardScaler()
df['Age'] = scaler.fit_transform(df[['Age']]).ravel()

# OneHotEncoder: encode the categorical column with the estimator itself
# (not pd.get_dummies) so the fitted categories can be reused on new data.
# dtype=bool produces True/False indicator columns; .toarray() densifies the
# sparse matrix that the encoder returns by default.
encoder = OneHotEncoder(dtype=bool)
city_encoded = pd.DataFrame(
    encoder.fit_transform(df[['City']]).toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# Replace the raw 'City' column with its indicator columns, appended after 'Age'.
df = pd.concat([df.drop(columns=['City']), city_encoded], axis=1)
df

Unnamed: 0,Age,City_Chicago,City_Los Angelas,City_New York
0,-1.218998,False,False,True
1,1.695997,False,True,False
2,-0.159,True,False,False
3,0.370999,False,False,True
4,-0.688999,True,False,False


In [34]:
# Problem 4: Using a Pipeline
# Prompt: Create a pipeline that first preprocesses the respective data:
# The 'Salary' column contains '[50000, 60000, 75000, 80000]'
# The 'Department' column contains '['HR', 'Engineering', 'Sales', 'Engineering']'

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Create the DataFrame
data = {
    'Salary': [50000, 60000, 75000, 80000],
    'Department': ['HR', 'Engineering', 'Sales', 'Engineering']
}
df = pd.DataFrame(data)

# Define the ColumnTransformer: scale the numeric column, one-hot encode the
# categorical one. sparse_threshold=0 forces a dense array regardless of how
# many zeros the encoded output contains, so it can always be wrapped in a
# DataFrame below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Salary']),
        ('cat', OneHotEncoder(), ['Department'])
    ],
    sparse_threshold=0,
)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
preprocessed_data = pipeline.fit_transform(df)

# Derive the one-hot column labels from the fitted encoder instead of
# hard-coding them, so labels always follow the encoder's category order even
# if the departments in the data change.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
column_names = ['Salary_scaled', *fitted_encoder.get_feature_names_out()]

# Convert the result back to a DataFrame for better readability
preprocessed_df = pd.DataFrame(preprocessed_data, columns=column_names, index=df.index)
preprocessed_df


Unnamed: 0,Salary_scaled,Department_Engineering,Department_HR,Department_Sales
0,-1.36277,0.0,1.0,0.0
1,-0.524142,1.0,0.0,0.0
2,0.733799,0.0,0.0,1.0
3,1.153113,1.0,0.0,0.0
