<a href="https://colab.research.google.com/github/Juhainayasmin09/Project-set12/blob/main/set12_Juhaina.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Load the CSV file and examine its structure
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the data
df = pd.read_csv('SET-12.csv', encoding='ascii')

print("Dataset shape:", df.shape)
print("\
Column names:", df.columns.tolist())
print("\
First few rows:")
print(df.head())
print("\
Data types:")
print(df.dtypes)
print("\
Missing values:")
print(df.isnull().sum())

Dataset shape: (6, 4)
Column names: ['Name', 'Math', 'Science', 'Age']
First few rows:
    Name  Math  Science  Age
0    Raj  65.0      NaN   20
1   Neha   NaN     70.0   25
2   Ajay  80.0     85.0  -23
3   Sita  75.0     90.0   24
4  Ajay   80.0     85.0  -23
Data types:
Name        object
Math       float64
Science    float64
Age          int64
dtype: object
Missing values:
Name       1
Math       1
Science    1
Age        0
dtype: int64


In [3]:
# Fill missing values with mean of the column (for numerical columns only)
print("Before filling missing values:")
print(df.isnull().sum())

# Fill missing values in numerical columns with their mean
numerical_cols = ['Math', 'Science', 'Age']
for col in numerical_cols:
    if col in df.columns and df[col].dtype in ['float64', 'int64']:
        mean_value = df[col].mean()
        df[col].fillna(mean_value, inplace=True)
        print(f"Filled missing values in {col} with mean: {mean_value}")

print("\
After filling missing values:")
print(df.isnull().sum())
print("\
Dataset after filling missing values:")
print(df)

Before filling missing values:
Name       1
Math       1
Science    1
Age        0
dtype: int64
Filled missing values in Math with mean: 74.0
Filled missing values in Science with mean: 78.0
Filled missing values in Age with mean: 7.166666666666667
After filling missing values:
Name       1
Math       0
Science    0
Age        0
dtype: int64
Dataset after filling missing values:
    Name  Math  Science  Age
0    Raj  65.0     78.0   20
1   Neha  74.0     70.0   25
2   Ajay  80.0     85.0  -23
3   Sita  75.0     90.0   24
4  Ajay   80.0     85.0  -23
5    NaN  70.0     60.0   20


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)


In [4]:
# Identify and drop duplicate records
print("Before removing duplicates:")
print("Dataset shape:", df.shape)
print("Duplicate rows:")
duplicates = df.duplicated()
print(duplicates.sum(), "duplicate rows found")

# Check for duplicates more carefully (considering whitespace in names)
df['Name'] = df['Name'].str.strip()  # Remove leading/trailing whitespace
print("\
After cleaning whitespace in names:")
duplicates_after_clean = df.duplicated()
print(duplicates_after_clean.sum(), "duplicate rows found")

# Drop duplicates
df_clean = df.drop_duplicates()
print("\
After removing duplicates:")
print("Dataset shape:", df_clean.shape)
print(df_clean)

Before removing duplicates:
Dataset shape: (6, 4)
Duplicate rows:
0 duplicate rows found
After cleaning whitespace in names:
1 duplicate rows found
After removing duplicates:
Dataset shape: (5, 4)
   Name  Math  Science  Age
0   Raj  65.0     78.0   20
1  Neha  74.0     70.0   25
2  Ajay  80.0     85.0  -23
3  Sita  75.0     90.0   24
5   NaN  70.0     60.0   20


In [5]:
# Apply normalization to numerical columns
print("Before normalization:")
print(df_clean[['Math', 'Science', 'Age']].describe())

# Initialize MinMaxScaler for normalization (scales to 0-1 range)
scaler = MinMaxScaler()

# Apply normalization to numerical columns
numerical_cols = ['Math', 'Science', 'Age']
df_normalized = df_clean.copy()

# Normalize each numerical column
for col in numerical_cols:
    df_normalized[col] = scaler.fit_transform(df_clean[[col]])

print("\
After normalization:")
print(df_normalized[['Math', 'Science', 'Age']].describe())
print("\
Normalized dataset:")
print(df_normalized)

Before normalization:
            Math    Science        Age
count   5.000000   5.000000   5.000000
mean   72.800000  76.600000  13.200000
std     5.630275  11.949895  20.364184
min    65.000000  60.000000 -23.000000
25%    70.000000  70.000000  20.000000
50%    74.000000  78.000000  20.000000
75%    75.000000  85.000000  24.000000
max    80.000000  90.000000  25.000000
After normalization:
           Math   Science       Age
count  5.000000  5.000000  5.000000
mean   0.520000  0.553333  0.754167
std    0.375352  0.398330  0.424254
min    0.000000  0.000000  0.000000
25%    0.333333  0.333333  0.895833
50%    0.600000  0.600000  0.895833
75%    0.666667  0.833333  0.979167
max    1.000000  1.000000  1.000000
Normalized dataset:
   Name      Math   Science       Age
0   Raj  0.000000  0.600000  0.895833
1  Neha  0.600000  0.333333  1.000000
2  Ajay  1.000000  0.833333  0.000000
3  Sita  0.666667  1.000000  0.979167
5   NaN  0.333333  0.000000  0.895833


In [6]:
# Discretize scores using binning into 3 categories
# We'll bin the Math and Science scores into Low, Medium, High categories

df_binned = df_normalized.copy()

# Define binning for Math and Science scores (using original values for better interpretation)
def create_bins(series, labels=['Low', 'Medium', 'High']):
    return pd.cut(series, bins=3, labels=labels)

# Apply binning to Math and Science columns (using original values)
df_binned['Math_Binned'] = create_bins(df_clean['Math'])
df_binned['Science_Binned'] = create_bins(df_clean['Science'])

print("Binned categories for Math and Science:")
print("\
Math score ranges:")
print(pd.cut(df_clean['Math'], bins=3).value_counts().sort_index())
print("\
Science score ranges:")
print(pd.cut(df_clean['Science'], bins=3).value_counts().sort_index())

print("\
Dataset with binned categories:")
print(df_binned[['Name', 'Math', 'Science', 'Math_Binned', 'Science_Binned']])

Binned categories for Math and Science:
Math score ranges:
Math
(64.985, 70.0]    2
(70.0, 75.0]      2
(75.0, 80.0]      1
Name: count, dtype: int64
Science score ranges:
Science
(59.97, 70.0]    2
(70.0, 80.0]     1
(80.0, 90.0]     2
Name: count, dtype: int64
Dataset with binned categories:
   Name      Math   Science Math_Binned Science_Binned
0   Raj  0.000000  0.600000         Low         Medium
1  Neha  0.600000  0.333333      Medium            Low
2  Ajay  1.000000  0.833333        High           High
3  Sita  0.666667  1.000000      Medium           High
5   NaN  0.333333  0.000000         Low            Low


In [7]:
# Remove noise values using binning method
# The Age column has a negative value (-23) which appears to be noise
# We'll use binning to smooth out this noise

print("Original Age values:")
print(df_clean['Age'].values)

# Create bins for Age to remove noise
# We'll use equal-width binning and then replace values with bin centers
age_bins = pd.cut(df_clean['Age'], bins=3, precision=0)
print("\
Age bins:")
print(age_bins.value_counts().sort_index())

# Replace Age values with bin centers to remove noise
bin_centers = []
for interval in age_bins:
    if pd.notna(interval):
        center = (interval.left + interval.right) / 2
        bin_centers.append(center)
    else:
        bin_centers.append(np.nan)

df_final = df_binned.copy()
df_final['Age_Denoised'] = bin_centers

print("\
Age values after denoising (bin centers):")
print(df_final['Age_Denoised'].values)

print("\
Final processed dataset:")
print(df_final)

Original Age values:
[ 20  25 -23  24  20]
Age bins:
Age
(-23.0, -7.0]    1
(-7.0, 9.0]      0
(9.0, 25.0]      4
Name: count, dtype: int64
Age values after denoising (bin centers):
[ 17.  17. -15.  17.  17.]
Final processed dataset:
   Name      Math   Science       Age Math_Binned Science_Binned  Age_Denoised
0   Raj  0.000000  0.600000  0.895833         Low         Medium          17.0
1  Neha  0.600000  0.333333  1.000000      Medium            Low          17.0
2  Ajay  1.000000  0.833333  0.000000        High           High         -15.0
3  Sita  0.666667  1.000000  0.979167      Medium           High          17.0
5   NaN  0.333333  0.000000  0.895833         Low            Low          17.0


In [8]:
# Save the final processed dataset
df_final.to_csv('processed_dataset.csv', index=False)
print("Final processed dataset saved as 'processed_dataset.csv'")

# Summary of all preprocessing steps performed
print("\
=== DATA PREPROCESSING SUMMARY ===")
print("1. Loaded dataset with shape:", df.shape)
print("2. Filled missing values with column means")
print("3. Removed duplicate records (1 duplicate found and removed)")
print("4. Applied Min-Max normalization to numerical columns")
print("5. Created binned categories for Math and Science scores (Low/Medium/High)")
print("6. Applied denoising to Age column using binning method")
print("\
Final dataset shape:", df_final.shape)
print("Columns in final dataset:", df_final.columns.tolist())

Final processed dataset saved as 'processed_dataset.csv'
=== DATA PREPROCESSING SUMMARY ===
1. Loaded dataset with shape: (6, 4)
2. Filled missing values with column means
3. Removed duplicate records (1 duplicate found and removed)
4. Applied Min-Max normalization to numerical columns
5. Created binned categories for Math and Science scores (Low/Medium/High)
6. Applied denoising to Age column using binning method
Final dataset shape: (5, 7)
Columns in final dataset: ['Name', 'Math', 'Science', 'Age', 'Math_Binned', 'Science_Binned', 'Age_Denoised']
