# Pertemuan 12: Data Cleaning and Preparation using Pandas

In [None]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with at least one missing value in every column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Impute the numeric gaps: column mean for Age, column median for Salary.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that form mutates an intermediate object,
# emits a FutureWarning on pandas 2.x and leaves df unchanged on pandas 3.0.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

# Drop the rows whose Name is missing (done after the statistics above are
# computed, so those rows still contribute to the mean/median).
df = df.dropna(subset=['Name'])

print('After cleaning:\n', df)


After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [None]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset whose Category labels differ only by letter case
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}

# Build the frame and rewrite Category in a single chained step:
# str.capitalize() upper-cases the first character and lower-cases the rest,
# so 'electronics' and 'Electronics' collapse to the same label.
df = pd.DataFrame(data).assign(
    Category=lambda frame: frame['Category'].str.capitalize()
)
print('Standardized Data:\n', df)


Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


## Practice Tasks
- Load a dataset of your choice and identify missing values.
- Implement data transformations to normalize numerical columns.
- Standardize categorical columns and remove duplicates.

In [None]:
import pandas as pd

# Sample employee records containing missing Age/Salary/Job values,
# inconsistently cased Job labels, and repeated rows for IDs 2 and 8.
data_kerja = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 8],
    'Name': ['Gema', 'Adam', 'Resnu', 'Santoso', 'Fadhilla',
             'Zharifa', 'Prasetya', 'Agus', 'Fitri', 'Hendra', 'Adam', 'Agus'],
    'Age': [29, 34, 27, None, 31, 40, 22, 45, None, 28, 34, 45],
    'Salary': [15000, 20000, None, 18000, 25000, 30000,
               12000, None, 10000, 16000, 20000, None],
    'Job': ['TI', None, 'Ti', 'Bussinessman', 'Manager', None, 'Ti', 'Manager', 'TI', 'Ti', None, 'Manager']
}
df = pd.DataFrame(data_kerja)
print('Data Gaji Penduduk :', '\n', df, '\n')

# Count the missing values per column before any repair
print('Data yang hilang :')
print(df.isnull().sum(), '\n')

# Data repair.
# fillna results are assigned back to the column rather than using
# df[col].fillna(..., inplace=True): the chained in-place form acts on an
# intermediate object, warns on pandas 2.x and has no effect on pandas 3.0.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())
# Lower-case the job labels so 'TI' and 'Ti' become the same category;
# missing Job values are left as-is.
df['Job'] = df['Job'].str.lower()
df = df.dropna(subset=['Name'])
print('Data Setelah Perbaikan :')
print(df)

# Remove duplicate rows (runs after imputation, so the repeated rows for
# IDs 2 and 8 are fully identical and collapse to one each).
df = df.drop_duplicates()

print('After cleaning:\n', df)



Data Gaji Penduduk : 
     ID      Name   Age   Salary           Job
0    1      Gema  29.0  15000.0            TI
1    2      Adam  34.0  20000.0          None
2    3     Resnu  27.0      NaN            Ti
3    4   Santoso   NaN  18000.0  Bussinessman
4    5  Fadhilla  31.0  25000.0       Manager
5    6   Zharifa  40.0  30000.0          None
6    7  Prasetya  22.0  12000.0            Ti
7    8      Agus  45.0      NaN       Manager
8    9     Fitri   NaN  10000.0            TI
9   10    Hendra  28.0  16000.0            Ti
10   2      Adam  34.0  20000.0          None
11   8      Agus  45.0      NaN       Manager 

Data yang hilang :
ID        0
Name      0
Age       2
Salary    3
Job       3
dtype: int64 

Data Setelah Perbaikan :
    ID      Name   Age   Salary           Job
0    1      Gema  29.0  15000.0            ti
1    2      Adam  34.0  20000.0          None
2    3     Resnu  27.0  18000.0            ti
3    4   Santoso  33.5  18000.0  bussinessman
4    5  Fadhilla  31.0  2500

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


## Homework 12
- Pertemuan 12: Clean a real-world dataset (from Kaggle or another source), perform normalization, handle outliers, and prepare the data for analysis.

In [None]:
import pandas as pd

# Load the Titanic dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv('Titanic.csv')

# Display the first few rows of the dataset
print('Data Before Cleaning :')
print(df.head(), '\n')

# Count the missing values per column before cleaning
print("Missing values before cleaning :")
print(df.isnull().sum())


# Data repair: mean for Age, median for Fare, most frequent value for Embarked.
# Each result is assigned back to its column instead of calling
# df[col].fillna(..., inplace=True): the chained in-place form acts on an
# intermediate object, warns on pandas 2.x and has no effect on pandas 3.0.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
# mode() returns a Series; [0] picks its first entry.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Sex'] = df['Sex'].str.lower()
df = df.dropna(subset=['Name'])

# Drop columns that are not useful for analysis (Ticket, Cabin, PassengerId)
df = df.drop(columns=['Ticket', 'Cabin', 'PassengerId'])

# Display the cleaned dataframe
print("\nCleaned DataFrame:")
print(df.head())

# Check for any remaining missing values
print("\nMissing values after cleaning:")
print(df.isnull().sum(), '\n')







Data Before Cleaning :
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S   

Missing values before cleaning :
PassengerId      0
Survived 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin