<a href="https://colab.research.google.com/github/Juhainayasmin09/Project-set12/blob/main/set12_Juhaina.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the CSV file and examine its structure
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the data
# 'utf-8-sig' decodes a plain-ASCII file exactly as 'ascii' would, but also
# accepts UTF-8 text and a leading byte-order mark instead of raising
# UnicodeDecodeError on the first non-ASCII byte.
df = pd.read_csv('SET-12.csv', encoding='utf-8-sig')

# Quick structural overview: size, columns, sample rows, dtypes, null counts.
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

In [None]:
# Fill missing values with the mean of the column (numerical columns only)
# NOTE(review): the mean is computed before duplicates are dropped and before
# the negative Age value mentioned in the denoising step is handled, so both
# influence the imputed value.
print("Before filling missing values:")
print(df.isnull().sum())

numerical_cols = ['Math', 'Science', 'Age']
for col in numerical_cols:
    # Report columns that cannot be imputed instead of skipping them silently;
    # a column left with NaNs would otherwise only surface as a failure later.
    if col not in df.columns:
        print(f"Skipping {col}: column not found in dataset")
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Skipping {col}: non-numeric dtype {df[col].dtype}")
        continue

    mean_value = df[col].mean()
    df[col] = df[col].fillna(mean_value)
    print(f"Filled missing values in {col} with mean: {mean_value}")

print("After filling missing values:")
print(df.isnull().sum())
print("Dataset after filling missing values:")
print(df)

In [None]:
# Identify and drop duplicate records
print("Before removing duplicates:")
print("Dataset shape:", df.shape)
print("Duplicate rows:")
duplicates = df.duplicated()
print(duplicates.sum(), "duplicate rows found")

# Rows that differ only by stray whitespace around the name are not detected
# as duplicates, so trim the names and count again.
df['Name'] = df['Name'].str.strip()
print("\nAfter cleaning whitespace in names:")
duplicates_after_clean = df.duplicated()
print(duplicates_after_clean.sum(), "duplicate rows found")

# Keep the first occurrence of every row; the original row index is preserved.
df_clean = df.drop_duplicates()
print("\nAfter removing duplicates:")
print("Dataset shape:", df_clean.shape)
print(df_clean)

In [None]:
# Apply Min-Max normalization to the numerical columns
# NOTE(review): this runs before the Age denoising step further down, so a
# negative Age (if present) becomes that column's minimum and compresses the
# scaled values of the remaining ages.
numerical_cols = ['Math', 'Science', 'Age']

print("Before normalization:")
print(df_clean[numerical_cols].describe())

# MinMaxScaler rescales every feature independently to the 0-1 range, so one
# fit over all columns yields the same values as fitting column by column.
# Fitting once also leaves `scaler` holding the min/max of every column
# (rather than only the last one fitted), so it can be reused afterwards.
scaler = MinMaxScaler()
df_normalized = df_clean.copy()
df_normalized[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])

print("\nAfter normalization:")
print(df_normalized[numerical_cols].describe())
print("\nNormalized dataset:")
print(df_normalized)

In [None]:
# Discretize the scores by binning them into 3 categories.
# Math and Science are each labelled Low, Medium or High; the labels are added
# to an independent copy so the normalized frame itself stays untouched.

df_binned = df_normalized.copy(deep=True)

# Define binning for Math and Science scores (using original values for better interpretation)
def create_bins(series, labels=('Low', 'Medium', 'High')):
    """Cut ``series`` into equal-width bins, one bin per label.

    Parameters
    ----------
    series : pandas.Series or array-like
        Numeric values to discretize.
    labels : sequence of str, None or False, default ('Low', 'Medium', 'High')
        Names for the bins, ordered from the lowest to the highest range.
        The number of bins follows the number of labels. ``None`` and
        ``False`` are passed straight to ``pd.cut`` with three bins.

    Returns
    -------
    The result of ``pd.cut``: for a Series input, a categorical Series on the
    same index; missing inputs stay missing.
    """
    if labels is None or labels is False:
        # Nothing to count, so keep the original fixed three-bin behaviour.
        return pd.cut(series, bins=3, labels=labels)

    # Derive the bin count from the labels so the two can never disagree
    # (pd.cut raises when the number of labels and bins differ).
    label_list = list(labels)
    return pd.cut(series, bins=len(label_list), labels=label_list)

# Apply binning to the Math and Science columns. The raw scores from df_clean
# are used (not the normalized ones) so the reported ranges are in score units.
df_binned['Math_Binned'] = create_bins(df_clean['Math'])
df_binned['Science_Binned'] = create_bins(df_clean['Science'])

print("Binned categories for Math and Science:")
for subject in ('Math', 'Science'):
    # Show the numeric interval behind each of the three bins and how many
    # rows fall into it.
    print(f"\n{subject} score ranges:")
    print(pd.cut(df_clean[subject], bins=3).value_counts().sort_index())

print("\nDataset with binned categories:")
print(df_binned[['Name', 'Math', 'Science', 'Math_Binned', 'Science_Binned']])

In [None]:
# Remove noise values using binning method
# The Age column has a negative value (-23) which appears to be noise.
#
# The bins are built from plausible ages only. If the negative value took part
# in the equal-width binning it would stretch the bin range, and it would
# itself be replaced by the centre of the lowest bin - still not a real age.
n_age_bins = 3

print("Original Age values:")
print(df_clean['Age'].values)

# An age below zero is impossible; replace it with the median of the valid
# ages so the row lands in a realistic bin. Missing ages are left missing.
noise_mask = df_clean['Age'] < 0
typical_age = df_clean.loc[~noise_mask, 'Age'].median()
age_for_binning = df_clean['Age'].mask(noise_mask, typical_age)
print(f"\nNegative Age values treated as noise: {int(noise_mask.sum())}")

# Equal-width binning over the cleaned ages.
age_bins = pd.cut(age_for_binning, bins=n_age_bins)
print("\nAge bins:")
print(age_bins.value_counts().sort_index())

# Replace every age with the centre of its bin. The centres are computed from
# the exact equal-width edges rather than from the interval labels, whose
# lowest edge pd.cut widens slightly and whose values are rounded for display.
edges = np.linspace(age_for_binning.min(), age_for_binning.max(), n_age_bins + 1)
centers = (edges[:-1] + edges[1:]) / 2
codes = age_bins.cat.codes.to_numpy()  # -1 marks a missing age
bin_centers = np.where(codes >= 0, centers[codes], np.nan).tolist()

df_final = df_binned.copy()
df_final['Age_Denoised'] = bin_centers

print("\nAge values after denoising (bin centers):")
print(df_final['Age_Denoised'].values)

print("\nFinal processed dataset:")
print(df_final)

In [None]:
# Save the final processed dataset
df_final.to_csv('processed_dataset.csv', index=False)
print("Final processed dataset saved as 'processed_dataset.csv'")

# Summary of all preprocessing steps performed
# The duplicate count is derived from the frames before and after
# de-duplication instead of being hard-coded, so the summary stays correct
# when the input data changes.
duplicates_removed = len(df) - len(df_clean)
duplicate_noun = "duplicate" if duplicates_removed == 1 else "duplicates"

print("\n=== DATA PREPROCESSING SUMMARY ===")
print("1. Loaded dataset with shape:", df.shape)
print("2. Filled missing values with column means")
print(f"3. Removed duplicate records ({duplicates_removed} {duplicate_noun} found and removed)")
print("4. Applied Min-Max normalization to numerical columns")
print("5. Created binned categories for Math and Science scores (Low/Medium/High)")
print("6. Applied denoising to Age column using binning method")
print("\nFinal dataset shape:", df_final.shape)
print("Columns in final dataset:", df_final.columns.tolist())