In [102]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

You have a dataset containing sales information, but it has missing values and incorrect data. Your task is to:

Load the data from a CSV file.

Handle missing values appropriately.

Fix incorrect data entries.

Convert some columns to the correct format.

Add a new column based on calculations.

Perform basic analysis on the dataset.

In [103]:
df=pd.read_csv("sales.csv")


df

Unnamed: 0,OrderID,Product,Category,Price,Quantity,Total,Date,Customer,Payment_Method
0,101,Laptop,Electronics,1200.0,2,,2024-01-05,Ali,Credit Card
1,102,Phone,Electronics,800.0,1,,2024-01-06,Omar,PayPal
2,103,TV,Electronics,,1,,2024-01-07,Mohammed,Cash
3,104,Shoes,Fashion,100.0,2,,2024-01-08,Sara,Credit Card
4,105,,Fashion,50.0,3,,2024-01-09,Aisha,PayPal
5,106,Watch,Accessories,200.0,1,200.0,2024-01-10,Hassan,Cash


🛠 2. Data Cleaning
Fill missing values in Total by calculating (Price * Quantity).

Replace missing values in Product with "Unknown" or a suitable value.

In [104]:
df["Total"]=df["Price"]*df["Quantity"]

df["Total"]=df["Total"].replace(np.nan,0)

# Replace missing Product names
df["Product"] = df["Product"].fillna("Unknown")


df["Product"]=df["Product"].fillna("newProduct")

In [107]:

df["Price"]=df["Price"].astype("float")



In [108]:
# Fill missing Price values with category-wise mean

df["Price"] = df.groupby("Category")["Price"].transform(lambda x: x.fillna(x.mean()))



In [109]:
# Convert Date column to datetime

df["Date"]=pd.to_datetime(df["Date"])

In [110]:
df["Date"]

0   2024-01-05
1   2024-01-06
2   2024-01-07
3   2024-01-08
4   2024-01-09
5   2024-01-10
Name: Date, dtype: datetime64[ns]

In [111]:
category_sales=df.groupby("Category")["Total"].sum()



In [112]:
best_selling_product = df.groupby("Product")["Quantity"].sum().idxmax()
best_selling_product


'Unknown'

In [113]:
df

Unnamed: 0,OrderID,Product,Category,Price,Quantity,Total,Date,Customer,Payment_Method
0,101,Laptop,Electronics,1200.0,2,2400.0,2024-01-05,Ali,Credit Card
1,102,Phone,Electronics,800.0,1,800.0,2024-01-06,Omar,PayPal
2,103,TV,Electronics,1000.0,1,0.0,2024-01-07,Mohammed,Cash
3,104,Shoes,Fashion,100.0,2,200.0,2024-01-08,Sara,Credit Card
4,105,Unknown,Fashion,50.0,3,150.0,2024-01-09,Aisha,PayPal
5,106,Watch,Accessories,200.0,1,200.0,2024-01-10,Hassan,Cash


In [119]:
payment_method_distribution = df["Payment_Method"].value_counts(normalize=True) * 100

payment_method_distribution

Payment_Method
Credit Card    33.333333
PayPal         33.333333
Cash           33.333333
Name: proportion, dtype: float64