In [None]:
# Exercise 1: Identifying and Handling Missing Data

In [13]:
import pandas as pd

# Toy table with gaps in every column: one missing name, one missing age
# and two missing salaries.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', None],
    Age=[24, 30, None, 22, 35],
    Salary=[48000, None, 57000, None, 60000],
)
df = pd.DataFrame(data)

print("Sebelum dibersihkan:\n", df, "\n")

# Replacement values per column: mean for Age, median for Salary.
# They are computed on the full table, i.e. before the nameless row is
# dropped, so that row still contributes to both statistics.
fill_values = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
}

# Impute the numeric gaps, discard rows without a Name, then renumber the
# index from zero.
df = (
    df.fillna(value=fill_values)
      .dropna(subset=['Name'])
      .reset_index(drop=True)
)
print("Setelah dibersihkan:\n", df)

Sebelum dibersihkan:
       Name   Age   Salary
0    Alice  24.0  48000.0
1      Bob  30.0      NaN
2  Charlie   NaN  57000.0
3    David  22.0      NaN
4     None  35.0  60000.0 

Setelah dibersihkan:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [None]:
# Exercise 2: Standardizing Categorical Data

In [12]:
# Toy table whose Category labels differ only by letter case.
data = dict(
    Product=['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    Category=['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets'],
)

# Build the frame and unify the labels in one step: str.capitalize()
# upper-cases the first character and lower-cases the rest, so
# 'electronics' and 'Electronics' collapse to the same value.
df = pd.DataFrame(data).assign(
    Category=lambda frame: frame['Category'].str.capitalize()
)
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [None]:
Load a dataset of your choice and identify missing values.

In [57]:
import pandas as pd

# Read the Titanic passenger table straight from the public CSV.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("Dataset Titanic (10 Teratas):\n")
print(df.head(10), "\n")

# Count the missing entries in every column.
print("Missing values tiap kolom:\n")
print(df.isna().sum(), "\n")

# One replacement value per incomplete column:
#   Age      -> column mean
#   Cabin    -> the literal placeholder 'unknown'
#   Embarked -> most frequent value (first entry returned by mode())
fill_values = {
    'Age': df['Age'].mean(),
    'Cabin': 'unknown',
    'Embarked': df['Embarked'].mode()[0],
}
df = df.fillna(value=fill_values)

print("Missing Values Tiap Kolom (Setelah Diisi):\n")
print(df.isna().sum(), "\n")

print("Dataset Titanic (10 Teratas Setelah Diisi):\n")
print(df.head(10))

Dataset Titanic (10 Teratas):

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCa

In [None]:
Implement data transformations to normalize numerical columns.

In [58]:
# Min-max normalisation: rescale every numeric column of `df` to [0, 1]
# using (value - column_min) / (column_max - column_min).
# 'number' selects all integer and float widths, not only int64/float64.
# NOTE(review): every numeric column is rescaled, including any identifier
# or label columns the frame may contain -- confirm that is intended.
num_cols = df.select_dtypes(include='number').columns
print("Kolom numerik:", list(num_cols), "\n")

col_min = df[num_cols].min()
col_range = df[num_cols].max() - col_min
# A constant column has a zero range and would turn into NaN (0 / 0).
# Dividing by 1 instead maps such a column to 0.0.
col_range = col_range.replace(0, 1)

df[num_cols] = (df[num_cols] - col_min) / col_range
# The "\n" after the label keeps the frame's header row on its own line so
# it lines up with the data rows.
print("Kolom Numerik Setelah Normalisasi (10 Teratas):\n", df[num_cols].head(10))

Kolom numerik: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] 

Kolom Numerik Setelah Normalisasi (10 Teratas):    PassengerId  Survived  Pclass       Age  SibSp     Parch      Fare
0     0.000000       0.0     1.0  0.271174  0.125  0.000000  0.014151
1     0.001124       1.0     0.0  0.472229  0.125  0.000000  0.139136
2     0.002247       1.0     1.0  0.321438  0.000  0.000000  0.015469
3     0.003371       1.0     0.0  0.434531  0.125  0.000000  0.103644
4     0.004494       0.0     1.0  0.434531  0.000  0.000000  0.015713
5     0.005618       0.0     1.0  0.367921  0.000  0.000000  0.016510
6     0.006742       0.0     0.0  0.673285  0.000  0.000000  0.101229
7     0.007865       0.0     1.0  0.019854  0.375  0.166667  0.041136
8     0.008989       1.0     1.0  0.334004  0.000  0.333333  0.021731
9     0.010112       1.0     0.5  0.170646  0.125  0.000000  0.058694


In [None]:
Standardize categorical columns and remove duplicates.

In [61]:
# Standardise the categorical (text) columns.
# `cat_cols` is derived here so the cell does not rely on a variable left
# over from another cell. Both 'object' and 'string' are requested so text
# columns are found whichever of the two dtypes pandas used for them.
cat_cols = df.select_dtypes(include=['object', 'string']).columns

# Trim surrounding whitespace and lower-case every text value so that
# spellings differing only in case or padding become identical.
df[cat_cols] = df[cat_cols].apply(lambda x: x.str.strip().str.lower())
print("Hasil Standarisasi Kategorikal (10 Teratas):\n", df[cat_cols].head(10), "\n")

# Remove fully duplicated rows and report how many were dropped.
before = len(df)
df = df.drop_duplicates().reset_index(drop=True)
after = len(df)

print(f"Duplikat yang dihapus: {before - after} baris dihapus.")
print("Dataset akhir:", df.shape)

Hasil Standarisasi Kategorikal (10 Teratas):
                                                 Name     Sex  \
0                            braund, mr. owen harris    male   
1  cumings, mrs. john bradley (florence briggs th...  female   
2                             heikkinen, miss. laina  female   
3       futrelle, mrs. jacques heath (lily may peel)  female   
4                           allen, mr. william henry    male   
5                                   moran, mr. james    male   
6                            mccarthy, mr. timothy j    male   
7                     palsson, master. gosta leonard    male   
8  johnson, mrs. oscar w (elisabeth vilhelmina berg)  female   
9                nasser, mrs. nicholas (adele achem)  female   

             Ticket    Cabin Embarked  
0         a/5 21171  unknown        s  
1          pc 17599      c85        c  
2  ston/o2. 3101282  unknown        s  
3            113803     c123        s  
4            373450  unknown        s  
5        