 Description: Clean and preprocess a raw dataset to
 make it suitable for analysis

In [21]:
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
print(df.columns)


Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')


Handle missing data (e.g., imputation, removal)

In [17]:
#Show basic info and check for missing values
print("\n--- Missing Value Check ---")
print(df.isnull().sum())

# Option 1: Remove rows with missing values (if few missing entries)
df_dropna = df.dropna()
print(f"\nAfter dropping missing rows: {df_dropna.shape[0]} rows remaining.")

# Option 2: Impute missing values with mean (for numeric columns)
df_imputed = df.fillna(df.mean(numeric_only=True))
print("\nMissing values after mean imputation:")
print(df_imputed.isnull().sum())

# Optional: View updated dataset
print("\n--- Preview of Imputed Dataset ---")
print(df_imputed.head())


--- Missing Value Check ---
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

After dropping missing rows: 506 rows remaining.

Missing values after mean imputation:
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

--- Preview of Imputed Dataset ---
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.062

 Detect and remove outliers.

In [18]:
# --- Detect & Remove Outliers using IQR method ---
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df_no_outliers = df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

# Print before and after shape
print(f"Original shape: {df.shape}")
print(f"After removing outliers: {df_no_outliers.shape}")

# Optional: preview cleaned data
print("\n--- Preview of dataset without outliers ---")
print(df_no_outliers.head())

Original shape: (506, 14)
After removing outliers: (268, 14)

--- Preview of dataset without outliers ---
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


 Convert categorical variables into numerical format
 using one-hot encoding or label encoding

In [22]:
from sklearn.preprocessing import LabelEncoder

# Simulate 'rad' as categorical
df['rad'] = df['rad'].astype(str)

# --- Label Encoding ---
le = LabelEncoder()
df['rad_encoded'] = le.fit_transform(df['rad'])

# --- One-Hot Encoding ---
df_onehot = pd.get_dummies(df, columns=['rad'], prefix='rad')

# Output
print("\n--- Label Encoded 'rad' ---")
print(df[['rad_encoded']].head())

print("\n--- One-Hot Encoded 'rad' Columns ---")
print(df_onehot.filter(like='rad_').head())



--- Label Encoded 'rad' ---
   rad_encoded
0            0
1            1
2            1
3            3
4            3

--- One-Hot Encoded 'rad' Columns ---
   rad_encoded  rad_1  rad_2  rad_24  rad_3  rad_4  rad_5  rad_6  rad_7  rad_8
0            0   True  False   False  False  False  False  False  False  False
1            1  False   True   False  False  False  False  False  False  False
2            1  False   True   False  False  False  False  False  False  False
3            3  False  False   False   True  False  False  False  False  False
4            3  False  False   False   True  False  False  False  False  False


 Normalize or standardize numerical data

In [23]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# --- Separate features (X) and target (y) ---
X = df.drop('medv', axis=1)   # features
y = df['medv']                # target

# --- Normalization ---
minmax_scaler = MinMaxScaler()
X_normalized = pd.DataFrame(minmax_scaler.fit_transform(X), columns=X.columns)

# --- Standardization ---
standard_scaler = StandardScaler()
X_standardized = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)

# Preview results
print("\n--- Normalized Data ---")
print(X_normalized.head())

print("\n--- Standardized Data ---")
print(X_standardized.head())



--- Normalized Data ---
       crim    zn     indus  chas       nox        rm       age       dis  \
0  0.000000  0.18  0.067815   0.0  0.314815  0.577505  0.641607  0.269203   
1  0.000236  0.00  0.242302   0.0  0.172840  0.547998  0.782698  0.348962   
2  0.000236  0.00  0.242302   0.0  0.172840  0.694386  0.599382  0.348962   
3  0.000293  0.00  0.063050   0.0  0.150206  0.658555  0.441813  0.448545   
4  0.000705  0.00  0.063050   0.0  0.150206  0.687105  0.528321  0.448545   

        rad       tax   ptratio         b     lstat  rad_encoded  
0  0.000000  0.208015  0.287234  1.000000  0.089680        0.000  
1  0.043478  0.104962  0.553191  1.000000  0.204470        0.125  
2  0.043478  0.104962  0.553191  0.989737  0.063466        0.125  
3  0.086957  0.066794  0.648936  0.994276  0.033389        0.375  
4  0.086957  0.066794  0.648936  1.000000  0.099338        0.375  

--- Standardized Data ---
       crim        zn     indus      chas       nox        rm       age  \
0 -0.419

TASK: Data Cleaning and
 Preprocessing (Full Code)

In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Load the dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(url)

print("\n--- Step 1: Original Dataset Info ---")
print(df.info())

# ========== Step 1: Handle Missing Data ==========

# Check for missing values
print("\n--- Step 2: Checking Missing Values ---")
print(df.isnull().sum())

# Impute missing values with mean (if any)
df = df.fillna(df.mean(numeric_only=True))


# ========== Step 2: Remove Outliers (IQR method) ==========

Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out rows that have outliers in any column
df = df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

print(f"\n--- Step 3: Dataset shape after removing outliers: {df.shape} ---")


# ========== Step 3: Encode Categorical Variables ==========

# 'chas' is a binary categorical variable (0 or 1), already numerical.
# Simulate 'rad' as categorical for encoding purposes
df['rad'] = df['rad'].astype(str)  # Treat 'rad' as categorical

# Label Encoding for 'rad'
le = LabelEncoder()
df['rad_encoded'] = le.fit_transform(df['rad'])

# Optional: One-Hot Encoding (instead of label encoding)
df = pd.get_dummies(df, columns=['rad'], prefix='rad')

print("\n--- Step 4: Categorical Encoding Done ---")
print(df.head())


# ========== Step 4: Normalize or Standardize Data ==========

# Separate target
target = df['medv']
features = df.drop(['medv'], axis=1)

# Option 1: Normalize (Min-Max Scaling)
minmax_scaler = MinMaxScaler()
features_normalized = pd.DataFrame(minmax_scaler.fit_transform(features), columns=features.columns)

# Option 2: Standardize (Z-score Scaling)
standard_scaler = StandardScaler()
features_standardized = pd.DataFrame(standard_scaler.fit_transform(features), columns=features.columns)

# Combine with target (optional)
df_normalized = features_normalized.copy()
df_normalized['medv'] = target.reset_index(drop=True)

df_standardized = features_standardized.copy()
df_standardized['medv'] = target.reset_index(drop=True)

print("\n--- Step 5: Normalized & Standardized Data Preview ---")
print("Normalized:")
print(df_normalized.head())
print("\nStandardized:")
print(df_standardized.head())



--- Step 1: Original Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
None

--- Step 2: Checking Missing Values ---
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b      