# Import Libraries

In [21]:
import pandas as pd          # Pandas is used to work with table-like data (DataFrames)
import os                    # OS lets us work with folders and paths


# Create a Storage Folder for Clean Copies

In [22]:
# Ensure the 'data' output directory is present. exist_ok=True means an
# already-existing directory is not treated as an error, so this cell can be
# re-run safely.
storage_dir = "data"
os.makedirs(storage_dir, exist_ok=True)


# Analysis of raw_data.csv

In [23]:
# Load the raw orders file here so the notebook runs on a fresh kernel:
# no earlier visible cell defines raw_df.
# NOTE(review): the path is assumed to be relative to the notebook's working
# directory — confirm where raw_data.csv actually lives.
raw_df = pd.read_csv("raw_data.csv")

# Preview the first rows to sanity-check column names and values.
print(" First 5 rows of raw_data.csv:\n")
print(raw_df.head())


 First 5 rows of raw_data.csv:

   order_id customer_name product  quantity  unit_price  order_date region
0         1         Diana  Tablet       NaN       500.0  2024-01-20  South
1         2           Eve  Laptop       NaN         NaN  2024-04-29  North
2         3       Charlie  Laptop       2.0       250.0  2024-01-08    NaN
3         4           Eve  Laptop       2.0       750.0  2024-01-07   West
4         5           Eve  Tablet       3.0         NaN  2024-03-07  South


In [24]:
print("\n Structure and column info for raw_data.csv:\n")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
raw_df.info()



 Structure and column info for raw_data.csv:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       100 non-null    int64  
 1   customer_name  99 non-null     object 
 2   product        100 non-null    object 
 3   quantity       74 non-null     float64
 4   unit_price     65 non-null     float64
 5   order_date     99 non-null     object 
 6   region         75 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB
None


In [25]:
# Count the null cells in each column of the raw orders frame, then show the
# tally under its heading.
raw_missing_per_column = raw_df.isnull().sum()
print("\n Missing values in raw_data.csv:\n", raw_missing_per_column, sep="\n")



 Missing values in raw_data.csv:

order_id          0
customer_name     1
product           0
quantity         26
unit_price       35
order_date        1
region           25
dtype: int64


In [26]:
# duplicated() flags rows that exactly repeat an earlier row; summing the
# boolean flags gives the number of such repeats.
raw_duplicate_count = raw_df.duplicated().sum()
print("\n Duplicate rows in raw_data.csv:", raw_duplicate_count)



 Duplicate rows in raw_data.csv: 1


# Analysis of incremental_data.csv

In [27]:
# Load the incremental orders file here so the notebook runs on a fresh
# kernel: no earlier visible cell defines incr_df.
# NOTE(review): the path is assumed to be relative to the notebook's working
# directory — confirm where incremental_data.csv actually lives.
incr_df = pd.read_csv("incremental_data.csv")

# Section banner followed by a preview of the first rows.
print("\n\n======  ANALYSIS: INCREMENTAL DATA ======\n")
print(" First 5 rows of incremental_data.csv:\n")
print(incr_df.head())





 First 5 rows of incremental_data.csv:

   order_id customer_name product  quantity  unit_price  order_date   region
0       101         Alice  Laptop       NaN       900.0  2024-05-09  Central
1       102           NaN  Laptop       1.0       300.0  2024-05-07  Central
2       103           NaN  Laptop       1.0       600.0  2024-05-04  Central
3       104           NaN  Tablet       NaN       300.0  2024-05-26  Central
4       105         Heidi  Tablet       2.0       600.0  2024-05-21    North


In [28]:
print("\n Structure and column info for incremental_data.csv:\n")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
incr_df.info()



 Structure and column info for incremental_data.csv:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       10 non-null     int64  
 1   customer_name  4 non-null      object 
 2   product        10 non-null     object 
 3   quantity       6 non-null      float64
 4   unit_price     10 non-null     float64
 5   order_date     10 non-null     object 
 6   region         8 non-null      object 
dtypes: float64(2), int64(1), object(4)
memory usage: 692.0+ bytes
None


In [29]:
# Count the null cells in each column of the incremental frame, then show the
# tally under its heading.
incr_missing_per_column = incr_df.isnull().sum()
print("\n Missing values in incremental_data.csv:\n", incr_missing_per_column, sep="\n")



 Missing values in incremental_data.csv:

order_id         0
customer_name    6
product          0
quantity         4
unit_price       0
order_date       0
region           2
dtype: int64


In [30]:
# duplicated() flags rows that exactly repeat an earlier row; summing the
# boolean flags gives the number of such repeats.
incr_duplicate_count = incr_df.duplicated().sum()
print("\n🔹 Duplicate rows in incremental_data.csv:", incr_duplicate_count)



🔹 Duplicate rows in incremental_data.csv: 0


# Save Clean Copies of the Raw Data

In [31]:
# Write both frames to the 'data' folder exactly as they are (no cleaning is
# applied here), printing a confirmation after each file is written.
# index=False keeps the DataFrame index out of the CSV.
backup_targets = [
    (raw_df, "data/raw_data_cleaned.csv", "\n raw_data_cleaned.csv saved to 'data/'"),
    (incr_df, "data/incremental_data_cleaned.csv", "incremental_data_cleaned.csv saved to 'data/'"),
]
for frame, csv_path, confirmation in backup_targets:
    frame.to_csv(csv_path, index=False)
    print(confirmation)



 raw_data_cleaned.csv saved to 'data/'
incremental_data_cleaned.csv saved to 'data/'


#  Initial Observations

In [32]:
# Hand-written summary of the checks above. The figures mirror the
# isnull().sum() and duplicated().sum() outputs for each dataset:
# 'region' is missing in 25 rows of raw_data.csv and in 2 rows of
# incremental_data.csv.
initial_observations = """
Initial Observations:

raw_data.csv:
- There is 1 missing value in 'customer_name'.
- 'quantity' has 26 missing entries, and 'unit_price' is missing in 35 rows.
- 'order_date' has 1 missing value.
- 'region' is missing in 25 rows.
- There is 1 exact duplicate row that will need to be removed.

incremental_data.csv:
- 'customer_name' has 6 missing values.
- 'quantity' is missing in 4 rows.
- 'region' is missing in 2 rows.
- No duplicate rows were found in this dataset.

- Both datasets have similar structure, which will make merging easier.
- Missing values in key fields like 'unit_price', 'quantity', and 'region' should be addressed in the data cleaning step.
"""
print(initial_observations)



Initial Observations:

raw_data.csv:
- There is 1 missing value in 'customer_name'.
- 'quantity' has 26 missing entries, and 'unit_price' is missing in 35 rows.
- 'order_date' has 1 missing value.
- 'region' is missing in 25 rows.
- There is 1 exact duplicate row that will need to be removed.

incremental_data.csv:
- 'customer_name' has 6 missing values.
- 'quantity' is missing in 4 rows.
- 'region' is missing in 2 rows.
- No duplicate rows were found in this dataset.

- Both datasets have similar structure, which will make merging easier.
- Missing values in key fields like 'unit_price', 'quantity', and 'region' should be addressed in the data cleaning step.

