<a href="https://colab.research.google.com/github/Mo-null/Data-mining-I/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Source CSV for the red-wine quality data; fields are separated by semicolons.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df_original = pd.read_csv(url, sep=';')

# Work on an independent copy so the frame as loaded stays available for
# the before/after comparison in the summary cell.
df = df_original.copy()
target = 'quality'
print("Dataset loaded. Original shape:", df_original.shape)

Dataset loaded. Original shape: (1599, 12)


In [None]:

print("\n=== HANDLING MISSING VALUES ===")
# Total number of null cells across every column of the frame.
missing_before = df.isna().sum().sum()
if missing_before == 0:
    print("No missing values found")
else:
    # Only numeric columns other than the target are imputed, each with its
    # own column median.
    numerical_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col != target
    ]
    column_medians = df[numerical_cols].median()
    df[numerical_cols] = df[numerical_cols].fillna(column_medians)
    missing_after = df.isna().sum().sum()
    print(f"Fixed {missing_before - missing_after} missing values")


=== HANDLING MISSING VALUES ===
No missing values found


In [None]:

print("\n=== HANDLING OUTLIERS ===")
# Every numeric column except the target is a candidate for capping.
numerical_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != target]

# Width of the fences, in IQRs, measured outward from the quartiles.
IQR_MULTIPLIER = 1.5

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # With a zero IQR both bounds collapse onto Q1, so clipping would
        # overwrite every value in the column with that single constant.
        print(f"Skipped '{col}': IQR is zero, no capping applied")
        continue
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR
    # Count first, then cap: values outside the fences are pulled back to the
    # nearest bound rather than dropped, so the row count is preserved.
    outliers_before = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    df[col] = df[col].clip(lower_bound, upper_bound)
    if outliers_before > 0:
        print(f"Capped {outliers_before} outliers in '{col}'")


=== HANDLING OUTLIERS ===
Capped 49 outliers in 'fixed acidity'
Capped 19 outliers in 'volatile acidity'
Capped 1 outliers in 'citric acid'
Capped 155 outliers in 'residual sugar'
Capped 112 outliers in 'chlorides'
Capped 30 outliers in 'free sulfur dioxide'
Capped 55 outliers in 'total sulfur dioxide'
Capped 45 outliers in 'density'
Capped 35 outliers in 'pH'
Capped 59 outliers in 'sulphates'
Capped 13 outliers in 'alcohol'


In [None]:

print("\n=== APPLYING VALIDATION RULES ===")
# Allowed (min, max) range for each checked column; both ends are inclusive.
validation_rules = {
    'alcohol': (0, 20),
    'pH': (2.5, 4.5),
    'fixed acidity': (0, 20)
}

for column, (min_val, max_val) in validation_rules.items():
    # A rule naming a column that is absent from the frame is ignored silently.
    if column not in df.columns:
        continue
    out_of_range = df[column].lt(min_val) | df[column].gt(max_val)
    invalid_count = out_of_range.sum()
    if invalid_count == 0:
        print(f"No invalid values found in '{column}'")
        continue
    # Report, then pull offending values back to the nearest allowed limit.
    print(f"Fixed {invalid_count} invalid values in '{column}'")
    df[column] = df[column].clip(min_val, max_val)


=== APPLYING VALIDATION RULES ===
No invalid values found in 'alcohol'
No invalid values found in 'pH'
No invalid values found in 'fixed acidity'


In [None]:

print("\n=== CLEANING SUMMARY ===")
# Before/after shapes plus the number of numeric columns handled by the
# outlier step, one item per output line.
print(
    f"Original dataset shape: {df_original.shape}",
    f"Cleaned dataset shape: {df.shape}",
    f"Columns cleaned: {len(numerical_cols)} numerical columns (excluding '{target}')",
    sep="\n",
)

# Persist the cleaned frame without the index column.
df.to_csv('cleaned_wine_data.csv', index=False)
print(
    "\n Data cleaning completed!",
    " Cleaned dataset saved as 'cleaned_wine_data.csv'",
    sep="\n",
)


=== CLEANING SUMMARY ===
Original dataset shape: (1599, 12)
Cleaned dataset shape: (1599, 12)
Columns cleaned: 11 numerical columns (excluding 'quality')

 Data cleaning completed!
 Cleaned dataset saved as 'cleaned_wine_data.csv'
