In [1]:
import pandas as pd

In [2]:
data = {
    "order_id": [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    "customer_name": ["Alice", "Bob", None, "David", "Eva", "Frank", None],
    "product_category": ["Electronics", "Clothing", "Electronics", "Books", None, "Clothing", "Books"],
    "quantity": [2, 1, None, 1, 3, None, 4],
    "unit_price": [299.99, 49.99, 199.99, None, 15.99, 79.99, 12.99],
    "order_date": ["2024-08-01", "2024-08-02", "2024-08-03", "2024-08-04", "2024-08-05", "2024-08-06", None]
}

df = pd.DataFrame(data)
print(df)

   order_id customer_name product_category  quantity  unit_price  order_date
0      1001         Alice      Electronics       2.0      299.99  2024-08-01
1      1002           Bob         Clothing       1.0       49.99  2024-08-02
2      1003          None      Electronics       NaN      199.99  2024-08-03
3      1004         David            Books       1.0         NaN  2024-08-04
4      1005           Eva             None       3.0       15.99  2024-08-05
5      1006         Frank         Clothing       NaN       79.99  2024-08-06
6      1007          None            Books       4.0       12.99        None


# ***1.***

In [3]:
df.isna().sum()

Unnamed: 0,0
order_id,0
customer_name,2
product_category,1
quantity,2
unit_price,1
order_date,1


# ***2***

***відсоток пропущених значень у кожному рядку та стовпчику***

***Чи є такі рядки або стовпчики, в яких доля пропусків більше ніж 50%? Якщо так, видали їх з датасету.***

In [4]:
percent_of_missing_rows = df.isna().sum(axis=1) / df.shape[1] * 100
print("Відсоток пропусків у рядках:\n", percent_of_missing_rows)

Відсоток пропусків у рядках:
 0     0.000000
1     0.000000
2    33.333333
3    16.666667
4    16.666667
5    16.666667
6    33.333333
dtype: float64


In [5]:
percent_of_missing_columns = df.isna().sum(axis=1) / df.shape[0] * 100
print("Відсоток пропусків у стовпчиках:\n", percent_of_missing_columns)

Відсоток пропусків у стовпчиках:
 0     0.000000
1     0.000000
2    28.571429
3    14.285714
4    14.285714
5    14.285714
6    28.571429
dtype: float64


In [6]:
cleaned_missing_rows = df.loc[percent_of_missing_rows <= 50]
cleaned_missing_columns = df.loc[percent_of_missing_columns <= 50]

In [7]:
df

Unnamed: 0,order_id,customer_name,product_category,quantity,unit_price,order_date
0,1001,Alice,Electronics,2.0,299.99,2024-08-01
1,1002,Bob,Clothing,1.0,49.99,2024-08-02
2,1003,,Electronics,,199.99,2024-08-03
3,1004,David,Books,1.0,,2024-08-04
4,1005,Eva,,3.0,15.99,2024-08-05
5,1006,Frank,Clothing,,79.99,2024-08-06
6,1007,,Books,4.0,12.99,


# ***Обробка пропущених значень***

In [8]:
df["customer_name"] = df["customer_name"].fillna("Unknown")

In [9]:
df["product_category"] = df["product_category"].fillna("Miscellaneous")

In [10]:
df["quantity"] = df["quantity"].fillna(df["quantity"].mean())

In [11]:
df["unit_price"] = df["unit_price"].fillna(df["unit_price"].mean())

In [12]:
df["order_date"] = df["order_date"].ffill()

In [13]:
df

Unnamed: 0,order_id,customer_name,product_category,quantity,unit_price,order_date
0,1001,Alice,Electronics,2.0,299.99,2024-08-01
1,1002,Bob,Clothing,1.0,49.99,2024-08-02
2,1003,Unknown,Electronics,2.2,199.99,2024-08-03
3,1004,David,Books,1.0,109.823333,2024-08-04
4,1005,Eva,Miscellaneous,3.0,15.99,2024-08-05
5,1006,Frank,Clothing,2.2,79.99,2024-08-06
6,1007,Unknown,Books,4.0,12.99,2024-08-06
