In [1]:
import pandas as pd 
import seaborn as sns

In [None]:
from sklearn.datasets import load_wine  
# Import function to load the Wine dataset
data = load_wine()  
# Load Wine dataset into 'data' (features, labels, and info)

In [None]:
df = pd.DataFrame(
    data.data,               # Create a DataFrame using the feature data from 'data'
    columns = data.feature_names  # Use the feature names as column headers
)
df.head()                    # Display the first 5 rows of the DataFrame

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [26]:
df.info()  
# Display a summary of the DataFrame:
# - number of rows and columns
# - column names
# - data types
# - non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [None]:
# Check and print the number of missing (NULL) values in each column
print(f'NULL Values:\n{df.isnull().sum()}')
# Print a separator line for clarity
print("______________________")
# Check and print the number of duplicate rows in the DataFrame
print(f'Duplicate Values: {df.duplicated().sum()}')

NULL Values:
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64
______________________
Duplicate Values: 0


In [24]:
df_copy = df.copy()
iqr_factor = (1, 1.5, 2, 2.5, 3)
result = []
for k in iqr_factor:
    df_temp = df_copy.copy()
    for col in df:
        q1 = df[col].quantile(.25)
        q3 = df[col].quantile(.75)

        iqr = q3 - q1
        lower_whisker = q1 - k * iqr
        upper_whisker = q3 + k * iqr

        df_temp = df_temp[
            (df_temp[col] <= upper_whisker) &
            (df_temp[col] >= lower_whisker)
        ]
    row_removed = len(df_copy) - len(df_temp)

    row_loss_percent = row_removed/len(df_copy) * 100

    result.append((k, len(df_copy), len(df_temp), row_loss_percent))

df_out = pd.DataFrame(
    result,
    columns = ['iqr_factor', 'total_row', 'removed_row', 'row_loss_percent']
)
df_out

Unnamed: 0,iqr_factor,total_row,removed_row,row_loss_percent
0,1.0,178,118,33.707865
1,1.5,178,161,9.550562
2,2.0,178,173,2.808989
3,2.5,178,177,0.561798
4,3.0,178,178,0.0
