# Part-wise Examples

## 1. How to Handle Missing Data


In [None]:
import pandas as pd
import numpy as np

# Toy customer table: ten customers, with gaps (NaN) in both Age and Income
customer_ids = range(101, 111)
ages = [25, np.nan, 42, 30, 36, np.nan, 29, 41, np.nan, 33]
incomes = [50000, 60000, np.nan, 72000, 58000, 61000, np.nan, 54000, 53000, 70000]

df = pd.DataFrame({'CustomerID': customer_ids, 'Age': ages, 'Income': incomes})
df.head()

Unnamed: 0,CustomerID,Age,Income
0,101,25.0,50000.0
1,102,,60000.0
2,103,42.0,
3,104,30.0,72000.0
4,105,36.0,58000.0


In [None]:
# Mean imputation: replace each missing Age with the average of the observed ages
mean_age = df['Age'].mean()
df.loc[:, 'Age'] = df['Age'].fillna(mean_age)
df.head()

Unnamed: 0,CustomerID,Age,Income
0,101,25.0,50000.0
1,102,33.714286,60000.0
2,103,42.0,
3,104,30.0,72000.0
4,105,36.0,58000.0


In [None]:
# Constant imputation: mark each missing Income with the numeric sentinel -1.
# A numeric sentinel lets the column stay numeric (it shows up as -1.0 below).
income_filled = df['Income'].fillna(-1)
df.loc[:, 'Income'] = income_filled

print(df)

   CustomerID        Age   Income
0         101  25.000000  50000.0
1         102  33.714286  60000.0
2         103  42.000000     -1.0
3         104  30.000000  72000.0
4         105  36.000000  58000.0
5         106  33.714286  61000.0
6         107  29.000000     -1.0
7         108  41.000000  54000.0
8         109  33.714286  53000.0
9         110  33.000000  70000.0


## 2. How to Handle Noisy Data

In [None]:
import pandas as pd
import numpy as np

# Twelve raw observations that the following cells smooth by binning
noisy_values = [15, 35, 22, 44, 55, 65, 25, 33, 47, 52, 80, 18]
data = pd.Series(noisy_values)
print("Original Data:\n", data)



Original Data:
 0     15
1     35
2     22
3     44
4     55
5     65
6     25
7     33
8     47
9     52
10    80
11    18
dtype: int64


In [None]:
# Sort the data for equal-frequency (equal-depth) binning
data_sorted = data.sort_values().reset_index(drop=True)

# Define the number of bins
num_bins = 4

# Create bins.
# Split the row *positions* rather than the Series itself: handing a pandas
# object straight to np.array_split goes through the deprecated
# Series.swapaxes and emits a FutureWarning. Slicing with iloc afterwards
# keeps every bin a Series with its original index labels.
positions = np.array_split(np.arange(len(data_sorted)), num_bins)
bins = [data_sorted.iloc[idx] for idx in positions]
print(bins)

[0    15
1    18
2    22
dtype: int64, 3    25
4    33
5    35
dtype: int64, 6    44
7    47
8    52
dtype: int64, 9     55
10    65
11    80
dtype: int64]


  return bound(*args, **kwds)


In [None]:
# Smoothing by bin means: every value is replaced by the mean of its own bin
bin_means = [np.mean(chunk) for chunk in bins]
smoothed_bins = [
    np.full(len(chunk), chunk_mean)
    for chunk, chunk_mean in zip(bins, bin_means)
]
smoothed_data = np.concatenate(smoothed_bins)

print("\nSmoothed Data (using binning by mean):\n", smoothed_data)


Smoothed Data (using binning by mean):
 [18.33333333 18.33333333 18.33333333 31.         31.         31.
 47.66666667 47.66666667 47.66666667 66.66666667 66.66666667 66.66666667]


## 3. Data Transformation with Larger Dataset


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

# Simulated salaries (in thousands), arranged as one sample per row
data = np.array([40, 60, 70, 90, 110, 130, 150, 170, 200, 220]).reshape(-1, 1)

# Min-Max normalization: rescale the column onto the [0, 1] range
min_max_scaler = MinMaxScaler()
min_max = min_max_scaler.fit_transform(data)
print("Min-Max:\n", min_max)

# Z-score normalization: centre on the mean and divide by the standard deviation
standard_scaler = StandardScaler()
z_score = standard_scaler.fit_transform(data)
print("\nZ-Score:\n", z_score)

Min-Max:
 [[0.        ]
 [0.11111111]
 [0.16666667]
 [0.27777778]
 [0.38888889]
 [0.5       ]
 [0.61111111]
 [0.72222222]
 [0.88888889]
 [1.        ]]

Z-Score:
 [[-1.45696386]
 [-1.1100677 ]
 [-0.93661962]
 [-0.58972347]
 [-0.24282731]
 [ 0.10406885]
 [ 0.450965  ]
 [ 0.79786116]
 [ 1.31820539]
 [ 1.66510155]]


## 4. PCA for Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Simulated 6-feature data from a product quality inspection.
# A seeded generator makes the matrix (and the PCA result computed from it)
# identical on every run of the notebook.
X = np.random.default_rng(42).integers(10, 100, size=(10, 6))
X.shape


(10, 6)

In [None]:
# Project the original features onto the leading principal components
n_components = 3
pca = PCA(n_components=n_components)
reduced = pca.fit_transform(X)

# Compare the dimensionality before and after the projection
print("Original Shape:", X.shape)
print("Reduced Shape:", reduced.shape)

Original Shape: (10, 6)
Reduced Shape: (10, 3)


## 5. Feature Selection on Iris Dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

# Load the Iris feature matrix (X) and the class labels (y)
X, y = load_iris(return_X_y=True)
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


In [None]:
# Score every feature against the labels with f_classif and keep the best 3
selector = SelectKBest(score_func=f_classif, k=3)
selector.fit(X, y)
X_new = selector.transform(X)

print("Selected Features (first 5 rows):\n", X_new[:5])

Selected Features (first 5 rows):
 [[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]]


## 6. Data Compression via Sampling


In [None]:
# Simulated log of web traffic: 100 consecutive hourly readings.
# The hourly step is passed as a Timedelta because the 'H' string alias is
# deprecated in recent pandas and raises a FutureWarning. The seeded generator
# keeps the visit counts, and therefore the sample drawn from them, the same
# on every run.
df = pd.DataFrame({
    'Time': pd.date_range(start='2024-01-01', periods=100, freq=pd.Timedelta(hours=1)),
    'Visits': np.random.default_rng(42).integers(50, 500, size=100),
})
df.shape


  df = pd.DataFrame({'Time': pd.date_range(start='2024-01-01', periods=100, freq='H'),


(100, 2)

In [None]:
# Keep a random 10% of the rows; the fixed random_state makes the draw repeatable
sample_fraction = 0.1
sampled_df = df.sample(frac=sample_fraction, random_state=42)

print(sampled_df.head())

                  Time  Visits
83 2024-01-04 11:00:00     451
53 2024-01-03 05:00:00     185
70 2024-01-03 22:00:00     303
45 2024-01-02 21:00:00     411
44 2024-01-02 20:00:00     488


# Full Experiment

In [None]:
# Aim: Perform a comprehensive data preprocessing workflow (data processing, cleaning, transformation, reduction, compression, and normalization) on a dataset to prepare it for machine learning model training.
# Tools required:
#   - Python
#   - Jupyter Notebook, Google Colab, or any Python IDE
#   - Libraries: pandas, NumPy, SciPy, scikit-learn, zlib, pickle

#Step 1: Data Loading and Initial Processing
#Load the dataset and perform initial checks.
import pandas as pd

# Load the dataset. header=None tells pandas the file has no header row, so
# the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, header=None, names=columns)

# Display the first few rows and check for basic info.
# df.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
print(df.head())
df.info()


   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [None]:
#Step 2: Data Cleaning
#Identify and handle any missing values, duplicates, and outliers.

# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


In [None]:
# Drop rows that exactly repeat an earlier row (the first occurrence is kept),
# then show how many rows remain
df.drop_duplicates(keep='first', inplace=True)
df.shape

(147, 5)

In [None]:
# Outlier handling with z-scores (optional, as Iris is clean, but included for
# completeness): keep only the rows where every numeric feature lies within
# 3 standard deviations of its column mean.
from scipy import stats

# All columns except the last one ('species') are numeric measurements
abs_zscores = np.abs(stats.zscore(df.iloc[:, :-1]))
within_limits = (abs_zscores < 3).all(axis=1)
df = df[within_limits]

print(df.describe())

       sepal_length  sepal_width  petal_length  petal_width
count    146.000000   146.000000    146.000000   146.000000
mean       5.857534     3.046575      3.795890     1.214384
std        0.831852     0.423966      1.754909     0.757490
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.400000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.200000      6.900000     2.500000


In [None]:
#Step 3: Data Transformation
#Convert categorical variables into numerical values and create new features if necessary.
# Replace each species label with the integer code of its category
species_as_category = df['species'].astype('category')
df['species'] = species_as_category.cat.codes
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [None]:
# Feature engineering: derive a new feature, the sepal area (length x width)
df['sepal_area'] = df['sepal_length'].mul(df['sepal_width'])

print(df.head())


   sepal_length  sepal_width  petal_length  petal_width  species  sepal_area
0           5.1          3.5           1.4          0.2        0       17.85
1           4.9          3.0           1.4          0.2        0       14.70
2           4.7          3.2           1.3          0.2        0       15.04
3           4.6          3.1           1.5          0.2        0       14.26
4           5.0          3.6           1.4          0.2        0       18.00


In [None]:
#Step 4: Data Reduction
#Reduce the dimensionality of the dataset using PCA.
from sklearn.decomposition import PCA

# Select the predictors by name. A positional slice such as iloc[:, :-1] is
# wrong at this point: 'sepal_area' is now the last column, so the slice would
# feed the target 'species' into PCA and leave 'sepal_area' out.
pca_features = df.drop(columns='species')

# Apply PCA to reduce to 3 principal components
pca = PCA(n_components=3)
reduced_df = pca.fit_transform(pca_features)

# Convert the reduced data back to a DataFrame
reduced_df = pd.DataFrame(reduced_df, columns=['PC1', 'PC2', 'PC3'])

# Attach the labels by position, not by index. df's index has gaps where
# duplicate and outlier rows were removed, while reduced_df is numbered
# 0..n-1, so an index-aligned assignment would produce NaN labels and pair
# the remaining labels with the wrong rows.
reduced_df['species'] = df['species'].to_numpy()

print(reduced_df.head())


        PC1       PC2       PC3  species
0 -2.908963  0.322280  0.054001      0.0
1 -2.939349 -0.154125 -0.170253      0.0
2 -3.102009 -0.147564  0.055116      0.0
3 -2.968417 -0.304812  0.006228      0.0
4 -2.950296  0.320470  0.159210      0.0


In [None]:
#Step 5: Data Compression
#Compress the processed dataset to optimize storage.

import zlib
import pickle

# Serialize the DataFrame with pickle, then compress the resulting bytes
pickled_data = pickle.dumps(df)
compressed_data = zlib.compress(pickled_data)

# Print the size of the compressed data
print("Compressed Data Size:", len(compressed_data))

# Decompress and verify the integrity: the round trip must reproduce the
# DataFrame exactly (same shape, values and dtypes).
# NOTE: pickle.loads is acceptable here only because the bytes were created
# a few lines above; never unpickle data from an untrusted source.
decompressed_data = pickle.loads(zlib.decompress(compressed_data))
assert decompressed_data.equals(df), "Round trip through zlib/pickle altered the data"
print(decompressed_data.head())


Compressed Data Size: 2247
   sepal_length  sepal_width  petal_length  petal_width  species  sepal_area
0           5.1          3.5           1.4          0.2        0       17.85
1           4.9          3.0           1.4          0.2        0       14.70
2           4.7          3.2           1.3          0.2        0       15.04
3           4.6          3.1           1.5          0.2        0       14.26
4           5.0          3.6           1.4          0.2        0       18.00


In [None]:
#Step 6: Data Normalization
#Normalize the numerical features to ensure consistent scaling.

from sklearn.preprocessing import MinMaxScaler

# Pick the feature columns by name. A positional slice such as iloc[:, :-1]
# is wrong at this point: 'sepal_area' is the last column, so the slice would
# scale the target 'species' as if it were a feature and drop 'sepal_area'.
feature_cols = df.columns.drop('species')

# Apply Min-Max scaling to the features only
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df[feature_cols])

# Convert back to DataFrame
scaled_df = pd.DataFrame(scaled_df, columns=feature_cols)

# Attach the unscaled labels by position, not by index. df's index has gaps
# where duplicate and outlier rows were removed, while scaled_df is numbered
# 0..n-1, so an index-aligned assignment would produce NaN labels and pair
# the remaining labels with the wrong rows.
scaled_df['species'] = df['species'].to_numpy()

print(scaled_df.head())


   sepal_length  sepal_width  petal_length  petal_width  species
0      0.222222     0.681818      0.067797     0.041667      0.0
1      0.166667     0.454545      0.067797     0.041667      0.0
2      0.111111     0.545455      0.050847     0.041667      0.0
3      0.083333     0.500000      0.084746     0.041667      0.0
4      0.194444     0.727273      0.067797     0.041667      0.0


In [None]:
#Step 7: Model Training and Evaluation (Optional)
#If desired, students can train a simple machine learning model (e.g., Logistic Regression) on the processed and unprocessed data to observe the effects of preprocessing on model performance.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Discard any rows whose target label (species) is missing
scaled_df = scaled_df.dropna(subset=['species'])

# Separate the target column from the predictors ('species' is the last column)
y = scaled_df['species']
X = scaled_df.drop(columns=['species'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a logistic regression classifier on the training portion
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.91
