# Data Wrangling Section
---

## Data import

In [1]:
# Load the already-cleaned dataset (cleaned.csv) into a DataFrame.
# The path is relative to the notebook's working directory.
import pandas as pd

CLEANED_CSV_PATH = '../data/processed/cleaned.csv'
df = pd.read_csv(CLEANED_CSV_PATH)

# Preview the first five rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Check data types

In [2]:
# Print the column names, non-null counts, and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 250 non-null    int64 
 1   Gender              250 non-null    object
 2   Polyuria            250 non-null    object
 3   Polydipsia          250 non-null    object
 4   sudden weight loss  250 non-null    object
 5   weakness            250 non-null    object
 6   Polyphagia          250 non-null    object
 7   Genital thrush      250 non-null    object
 8   visual blurring     250 non-null    object
 9   Itching             250 non-null    object
 10  Irritability        250 non-null    object
 11  delayed healing     250 non-null    object
 12  partial paresis     250 non-null    object
 13  muscle stiffness    250 non-null    object
 14  Alopecia            250 non-null    object
 15  Obesity             250 non-null    object
 16  class               250 no

## Check missing values

In [3]:
# Count the missing (NaN/None) entries in each column.
df.isna().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

## Summary statistics of Age

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max).
# describe() is called on the whole frame; by default it summarises only
# numeric columns, and the recorded output shows Age as the only one reported.
df.describe()

Unnamed: 0,Age
count,250.0
mean,48.7
std,12.276353
min,16.0
25%,39.0
50%,48.0
75%,58.0
max,85.0


## Check for duplicates

In [6]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

## Drop Duplicates

In [7]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Reassign rather than using inplace=True, so the frame object shown by
# earlier cells is not silently mutated.
print("Before →", df.shape)
df = df.drop_duplicates()
print("✔ Removed duplicates →", df.shape)

Before → (250, 17)
✔ Removed duplicates → (250, 17)


## Eliminate Outliers

In [8]:
import numpy as np  # not used in this cell; kept in case later cells rely on `np`

# Width of the outlier fences, expressed as a multiple of the interquartile range.
IQR_MULTIPLIER = 1.5

print("Old Shape:", df.shape)

# Quartiles and interquartile range of Age.
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Fences: anything outside [lower, upper] is treated as an outlier.
lower = Q1 - IQR_MULTIPLIER * IQR
upper = Q3 + IQR_MULTIPLIER * IQR

# Keep only rows whose Age lies within the fences (both bounds inclusive).
# .copy() yields an independent frame so later column assignments do not
# raise SettingWithCopyWarning.
# NOTE: this cell rebinds `df`, so re-running it recomputes the quartiles on
# the already-filtered data and may remove additional rows.
df = df[df['Age'].between(lower, upper)].copy()

print("New Shape:", df.shape)

Old Shape: (250, 17)
New Shape: (250, 17)
