MANJIT SINGH T117

PRACTICAL NO 3B

AIM - Feature Scaling and Dummification:

- Apply feature-scaling techniques such as standardization and normalization to
  numerical features.

- Perform feature dummification to convert categorical variables into numerical
  representations.

Handling Categorical Data and Imbalanced Classes

Encoding Nominal Categorical Features
Using LabelBinarizer

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Read the sales data and give missing customer segments an explicit label,
# so that "no segment" is encoded as a category of its own.
df = pd.read_csv("Sales_with_NaNs_v1.3.csv")
df['Customer_Segment'] = df['Customer_Segment'].fillna('Missing')

# Single-column 2-D array holding the nominal feature to encode.
feature = df[['Customer_Segment']].values

# Learn the set of classes and produce one indicator column per class.
one_hot = LabelBinarizer()
encoded_feature = one_hot.fit_transform(feature)
print(encoded_feature[:5])  # first 5 encoded rows

# Classes discovered during fitting (at most the first 5).
print(one_hot.classes_[:5])

# Round trip: encode again with the fitted binarizer, then map back to labels.
decoded_feature = one_hot.inverse_transform(one_hot.transform(feature))
print(decoded_feature[:5])  # first 5 recovered labels

[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]]
['High Value' 'Low Value' 'Medium Value' 'Missing']
['High Value' 'High Value' 'High Value' 'Medium Value' 'High Value']


Multilabel Example

In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# Missing customer segments get their own explicit label.
df['Customer_Segment'] = df['Customer_Segment'].fillna('Missing_Customer_Segment')


def _sales_label(value):
    """Return the integer part of a sales figure as text, or a placeholder for NaN."""
    if pd.isna(value):
        return 'Missing_Sales_Value'
    return str(int(value))


# Text label for every 'Sales_Before' entry.
sales_before_labels = df['Sales_Before'].apply(_sales_label)

# Artificial multilabel data: each row carries a (segment, sales label) pair.
multiclass_feature = [
    (segment, sales_label)
    for segment, sales_label in zip(df['Customer_Segment'], sales_before_labels)
]

# Fit the multilabel encoder and build the indicator matrix.
one_hot_multiclass = MultiLabelBinarizer()
multiclass_encoded = one_hot_multiclass.fit_transform(multiclass_feature)

print(multiclass_encoded[:5])  # first 5 encoded rows
print(one_hot_multiclass.classes_[:5])  # first 5 classes

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]
['100' '101' '102' '103' '104']


Encoding Ordinal Categorical Features

In [8]:
import pandas as pd

df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# Ordinal encoding: every level name maps to its rank.
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

# Bucket 'Sales_Before' into three labelled intervals with edges 0, 150, 250
# and 400; the labels are the keys of scale_mapper, in rank order.
price_level_names = pd.cut(
    df['Sales_Before'],
    bins=[0, 150, 250, 400],
    labels=list(scale_mapper),
)

# Swap the level names for their numeric ranks.
df['Price_Level'] = price_level_names.cat.rename_categories(scale_mapper)

df['Price_Level'].head()

Unnamed: 0,Price_Level
0,2
1,2
2,2
3,2
4,2


Encoding Dictionaries of Features

In [10]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd

# Read the sales data.
df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# Give missing entries in each categorical column an explicit label.
for column, placeholder in (
    ('Group', 'Missing_Group'),
    ('Customer_Segment', 'Missing_Customer_Segment'),
):
    df[column] = df[column].fillna(placeholder)

# One dictionary per row, restricted to the columns being vectorized.
selected_columns = ['Group', 'Customer_Segment', 'Sales_Before']
data_dict = df[selected_columns].to_dict(orient='records')

# Dense output so the matrix can be printed directly.
dictvectorizer = DictVectorizer(sparse=False)
features = dictvectorizer.fit_transform(data_dict)

print(features[:5])  # first 5 rows of the feature matrix

print(dictvectorizer.get_feature_names_out()[:5])  # first 5 feature names

[[  1.           0.           0.           0.           1.
    0.           0.         240.5483589 ]
 [  1.           0.           0.           0.           0.
    0.           1.         246.86211421]
 [  1.           0.           0.           0.           1.
    0.           0.         156.97808448]
 [  0.           0.           1.           0.           1.
    0.           0.         192.12670834]
 [  1.           0.           0.           0.           0.
    1.           0.         229.68562253]]
['Customer_Segment=High Value' 'Customer_Segment=Low Value'
 'Customer_Segment=Medium Value'
 'Customer_Segment=Missing_Customer_Segment' 'Group=Control']


Imputing Missing Class Values

In [12]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

# Read the sales data.
df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# 'Sales_Before' is the single numeric predictor; 'Customer_Segment' is the
# class label to be imputed.
X = df[['Sales_Before']].values
y = df['Customer_Segment'].values

# Three-row toy sample whose middle row has neither a feature value nor a label.
first_sale = df['Sales_Before'][0]
third_sale = df['Sales_Before'][2]
X_with_nan = np.array([[first_sale], [np.nan], [third_sale]])
y_with_nan = np.array([y[0], None, y[2]])

# The classifier is trained on rows 0 and 2 only, i.e. the two complete rows of
# the toy sample. n_neighbors is capped so it never exceeds the training size.
train_idx = [0, 2]
X_train, y_train = X[train_idx], y[train_idx]
n_neighbors = max(1, min(3, len(X_train)))
clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
clf.fit(X_train, y_train)

# The column mean is learned from the full 'Sales_Before' column and then used
# to fill the gap in the toy sample.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_with_nan_imputed = imputer.transform(X_with_nan)

# Predict the class of the middle row from its imputed feature value.
imputed_values = clf.predict(X_with_nan_imputed[[1]])

# Copy the label array and write the predicted class into the gap.
y_with_imputed = y_with_nan.copy()
y_with_imputed[1] = imputed_values[0]

# The imputed feature array is kept under a matching name.
X_with_imputed = X_with_nan_imputed

# Show both imputed arrays.
print("Imputed numeric rows:\n", X_with_imputed)
print("Imputed labels:\n", y_with_imputed)

Imputed numeric rows:
 [[240.5483589 ]
 [203.71699823]
 [156.97808448]]
Imputed labels:
 ['High Value' 'High Value' 'High Value']


Alternative: Fill Missing Values With Most Frequent

In [15]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# Sentinel string that stands in for a missing segment; the imputer below is
# configured to treat exactly this value as "missing".
missing_marker = 'Missing'
segment_filled = df['Customer_Segment'].fillna(missing_marker)
df['Customer_Segment'] = segment_filled

# Feature matrix made of the single categorical column.
X = df[['Customer_Segment']].values

# Replace every sentinel entry with the most frequent value of the column.
imputer = SimpleImputer(missing_values=missing_marker, strategy='most_frequent')
imputed_X = imputer.fit_transform(X)

imputed_X[:6]  # display first 6 rows

array([['High Value'],
       ['High Value'],
       ['High Value'],
       ['Medium Value'],
       ['High Value'],
       ['Low Value']], dtype=object)

Handling Imbalanced Classes

In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Load the sales data.
df = pd.read_csv("Sales_with_NaNs_v1.3.csv")

# Mean-impute NaNs in 'Sales_After'. The imputer returns a single-column 2-D
# array, so it is flattened before being stored back as a DataFrame column.
imputer = SimpleImputer(strategy='mean')
df['Sales_After'] = imputer.fit_transform(df[['Sales_After']]).ravel()

# Feature matrix built from the two numeric sales columns.
# NOTE(review): 'Sales_Before' is not imputed in this cell, so `features` may
# still contain NaN -- confirm before fitting an estimator on it.
features = df[['Sales_Before', 'Sales_After']].values

# Imbalanced binary target: 1 when 'Sales_Before' lies above its 90th
# percentile, 0 otherwise. NaN compares as False, so rows with a missing
# 'Sales_Before' land in class 0.
threshold = df['Sales_Before'].quantile(0.90)
target = np.where(df['Sales_Before'] > threshold, 1, 0)

print("Original target distribution:", np.unique(target, return_counts=True))

# Inverse-frequency class weights: n_samples / (n_classes * class_count).
# The rarer a class, the larger its weight, so the minority class is
# up-weighted. The previous hard-coded {0: 0.9, 1: 0.1} did the opposite and
# favoured the majority class even further.
class_labels, class_counts = np.unique(target, return_counts=True)
weights = {
    int(label): float(len(target) / (len(class_labels) * count))
    for label, count in zip(class_labels, class_counts)
}

RandomForestClassifier(class_weight=weights)

Original target distribution: (array([0, 1]), array([9152,  848]))


In [18]:
# Seed NumPy's global generator so the random draws below are repeatable.
np.random.seed(0)

# Row positions belonging to each class, and how many rows each class has.
i_class0 = np.nonzero(target == 0)[0]
i_class1 = np.nonzero(target == 1)[0]
n_class0, n_class1 = len(i_class0), len(i_class1)

# The strictly larger class is the majority; on a tie class 1 takes that role.
majority_idx, minority_idx = (
    (i_class0, i_class1) if n_class0 > n_class1 else (i_class1, i_class0)
)

# Empty placeholders so the result names exist even when sampling is skipped.
n_feature_columns = features.shape[1]
features_downsampled = np.empty((0, n_feature_columns), dtype=features.dtype)
targets_downsampled = np.array([], dtype=target.dtype)
features_upsampled = np.empty((0, n_feature_columns), dtype=features.dtype)
targets_upsampled = np.array([], dtype=target.dtype)

if len(majority_idx) > 0 and len(minority_idx) > 0:
    # Downsampling: draw, without replacement, as many majority rows as there
    # are minority rows.
    majority_downsampled = np.random.choice(
        majority_idx, size=len(minority_idx), replace=False
    )

    # Minority rows first, then the sampled majority rows.
    targets_downsampled = np.concatenate(
        (target[minority_idx], target[majority_downsampled])
    )
    features_downsampled = np.concatenate(
        (features[minority_idx, :], features[majority_downsampled, :]), axis=0
    )

    print("Downsampled target distribution:", np.unique(targets_downsampled, return_counts=True))
    print("Downsampled features (first 5 rows):\n", features_downsampled[:5])

    # Upsampling: draw, with replacement, as many minority rows as there are
    # majority rows.
    minority_upsampled = np.random.choice(
        minority_idx, size=len(majority_idx), replace=True
    )

    # Majority rows first, then the resampled minority rows.
    targets_upsampled = np.hstack((target[majority_idx], target[minority_upsampled]))
    features_upsampled = np.concatenate(
        (features[majority_idx, :], features[minority_upsampled, :]), axis=0
    )

    print("\n")
    print("Upsampled target distribution:", np.unique(targets_upsampled, return_counts=True))
    print("Upsampled features (first 5 rows):\n", features_upsampled[:5])
else:
    # Only one class is present, so there is nothing to balance.
    print("Sampling skipped: only one class present in 'target'.")
    print("Class counts -> class0: {}, class1: {}".format(n_class0, n_class1))


Downsampled target distribution: (array([0, 1]), array([848, 848]))
Downsampled features (first 5 rows):
 [[306.70145235 485.1354245 ]
 [280.11381479 445.85800202]
 [329.8606829  393.67290836]
 [306.21071649 463.27418231]
 [306.11056933 465.54515526]]


Upsampled target distribution: (array([0, 1]), array([9152, 9152]))
Upsampled features (first 5 rows):
 [[240.5483589  300.00756753]
 [246.86211421 381.33755453]
 [156.97808448 179.33046351]
 [192.12670834 229.27803114]
 [229.68562253 280.45795221]]
