In [2]:
import pandas as pd

# Load the sales data.
# A forward slash is used as the separator because it is accepted on Windows
# as well as POSIX systems; a literal backslash only works on Windows.
sales_df = pd.read_csv('task/sales_data.csv')

# Per-category summary: total units sold, mean of the Price column, and the
# largest Quantity seen on a single row.
category_stats = sales_df.groupby('Category').agg(
    total_quantity_sold=('Quantity', 'sum'),
    average_price=('Price', 'mean'),
    max_quantity_per_transaction=('Quantity', 'max')
)
print("\nCategory Statistics:\n", category_stats)

# Top-selling product in each category by total quantity sold.
# After sorting by Quantity descending within each Category, the first row of
# every category is its best seller, so drop_duplicates keeps exactly that row.
top_products = (
    sales_df.groupby(['Category', 'Product'])['Quantity']
    .sum()
    .reset_index()
    .sort_values(['Category', 'Quantity'], ascending=[True, False])
    .drop_duplicates('Category')
)
print("\nTop-selling products by category:\n", top_products)

# Find the date with highest total sales (Quantity * Price).
# The row-level revenue is stored on sales_df as the 'TotalSales' column.
sales_df['TotalSales'] = sales_df['Quantity'] * sales_df['Price']
highest_sales_date = sales_df.groupby('Date')['TotalSales'].sum().idxmax()
print("\nDate with highest total sales:", highest_sales_date)



Category Statistics:
              total_quantity_sold  average_price  max_quantity_per_transaction
Category                                                                     
Electronics                   45          870.0                             7

Top-selling products by category:
       Category Product  Quantity
1  Electronics   Phone        21

Date with highest total sales: 2023-01-10


In [1]:
import pandas as pd
import os

# Ensure the folder exists.
# The directory name is kept in one place so the folder that gets created and
# the folder the CSV is written to cannot drift apart.
DATA_DIR = 'task'
os.makedirs(DATA_DIR, exist_ok=True)

# File path, built with os.path.join so the CSV lands inside DATA_DIR on every
# platform (a literal backslash is only a path separator on Windows).
filepath = os.path.join(DATA_DIR, 'sales_data.csv')

# If the file doesn't exist, create a sample dataset
if not os.path.exists(filepath):
    sample_data = {
        'Date': pd.date_range(start='2023-01-01', periods=12),
        'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Tablet'],
        'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics'],
        'Quantity': [3, 5, 2, 4, 6, 3, 2, 7, 1, 5, 3, 4],
        'Price': [1200, 800, 600, 1150, 850, 620, 1250, 780, 590, 1180, 810, 610]
    }
    sample_df = pd.DataFrame(sample_data)
    sample_df.to_csv(filepath, index=False)
    print("✅ Sample sales_data.csv file created.")
else:
    print("📁 File already exists. Proceeding to analysis.")

# Load the dataset
sales_df = pd.read_csv(filepath)

# Per-category summary: total units sold, mean of the Price column, and the
# largest Quantity seen on a single row.
category_stats = sales_df.groupby('Category').agg(
    total_quantity_sold=('Quantity', 'sum'),
    average_price=('Price', 'mean'),
    max_quantity_per_transaction=('Quantity', 'max')
)
print("\n📊 Category Statistics:\n", category_stats)

# Top-selling product in each category by total quantity sold.
# Sorting by Quantity descending within each Category puts the best seller
# first, so drop_duplicates('Category') keeps exactly that row per category.
top_products = sales_df.groupby(['Category', 'Product'])['Quantity'].sum().reset_index()
top_selling = top_products.sort_values(['Category', 'Quantity'], ascending=[True, False]).drop_duplicates('Category')
print("\n🏆 Top-selling product in each category:\n", top_selling)

# Add a 'Total_Sale' column (Quantity * Price)
sales_df['Total_Sale'] = sales_df['Quantity'] * sales_df['Price']

# Find the date with the highest total sales: sum revenue per Date, then take
# the index label (the date) of the largest sum.
total_by_date = sales_df.groupby('Date')['Total_Sale'].sum()
max_sales_date = total_by_date.idxmax()
print(f"\n📅 Date with highest total sales: {max_sales_date}")


✅ Sample sales_data.csv file created.

📊 Category Statistics:
              total_quantity_sold  average_price  max_quantity_per_transaction
Category                                                                     
Electronics                   45          870.0                             7

🏆 Top-selling product in each category:
       Category Product  Quantity
1  Electronics   Phone        21

📅 Date with highest total sales: 2023-01-10


In [4]:
# Load customer orders data.
# A forward slash is used as the separator because it is accepted on Windows
# as well as POSIX systems; a literal backslash only works on Windows.
orders_df = pd.read_csv('task/customer_orders.csv')

# Group by CustomerID and count orders, then keep only the order rows that
# belong to customers with at least 20 orders.
customer_order_counts = orders_df.groupby('CustomerID')['OrderID'].count()
active_customers = customer_order_counts[customer_order_counts >= 20].index
filtered_orders = orders_df[orders_df['CustomerID'].isin(active_customers)]
print("\nCustomers with 20 or more orders:\n", filtered_orders['CustomerID'].unique())

# Customers whose mean Price across their order rows exceeds $120
avg_price_by_customer = orders_df.groupby('CustomerID')['Price'].mean()
high_spenders = avg_price_by_customer[avg_price_by_customer > 120].index
print("\nCustomers with avg price > $120:\n", high_spenders)

# Total quantity and price per product, filtered by quantity >= 5.
# total_price is the sum of the Price column per product (not Quantity * Price).
product_stats = orders_df.groupby('Product').agg(
    total_quantity=('Quantity', 'sum'),
    total_price=('Price', 'sum')
).reset_index()

products_over_5 = product_stats[product_stats['total_quantity'] >= 5]
print("\nProducts with total quantity >= 5:\n", products_over_5)



Customers with 20 or more orders:
 []

Customers with avg price > $120:
 Index([201, 202, 203, 205, 207, 208, 209], dtype='int64', name='CustomerID')

Products with total quantity >= 5:
     Product  total_quantity  total_price
0    Camera               5         2000
1  Keyboard              10          500
2    Laptop              10         5000
3   Monitor               5         1000
4     Mouse              20          250
5     Phone               5         4000
6   Printer              10         1500
7    Router              15          750
8   Speaker               5          600
9    Tablet              15         3000


In [3]:
import pandas as pd
import os

# Ensure the folder exists.
# The directory name is kept in one place so the folder that gets created and
# the folder the CSV is written to cannot drift apart.
DATA_DIR = 'task'
os.makedirs(DATA_DIR, exist_ok=True)

# File path, built with os.path.join so the CSV lands inside DATA_DIR on every
# platform (a literal backslash is only a path separator on Windows).
file_path = os.path.join(DATA_DIR, 'customer_orders.csv')

# Step 1: Create sample file if not present
if not os.path.exists(file_path):
    sample_data = {
        'OrderID': range(1001, 1051),  # 50 orders
        'CustomerID': [201, 202, 203, 204, 205, 206, 207, 208, 209, 210] * 5,
        'Product': ['Laptop', 'Phone', 'Tablet', 'Mouse', 'Monitor', 'Keyboard', 'Printer', 'Camera', 'Router', 'Speaker'] * 5,
        'Quantity': [2, 1, 3, 4, 1, 2, 2, 1, 3, 1] * 5,
        'Price': [1000, 800, 600, 50, 200, 100, 300, 400, 150, 120] * 5
    }
    sample_df = pd.DataFrame(sample_data)
    sample_df.to_csv(file_path, index=False)
    print("✅ Sample customer_orders.csv created.")
else:
    print("📁 File already exists. Proceeding to analysis.")

# Step 2: Load the data
orders_df = pd.read_csv(file_path)

# 1. Customers with 20 or more orders (order count per CustomerID, filtered)
customer_order_counts = orders_df.groupby('CustomerID')['OrderID'].count()
active_customers = customer_order_counts[customer_order_counts >= 20]
print("\n👥 Customers with 20 or more orders:\n", active_customers)

# 2. Customers whose mean Price across their order rows exceeds $120
avg_price_per_customer = orders_df.groupby('CustomerID')['Price'].mean()
high_spenders = avg_price_per_customer[avg_price_per_customer > 120]
print("\n💸 Customers with avg unit price > $120:\n", high_spenders)

# 3. Total quantity and total price per product, filter quantity < 5.
# total_revenue is the sum of the Price column per product (not Quantity * Price).
product_summary = orders_df.groupby('Product').agg(
    total_quantity=('Quantity', 'sum'),
    total_revenue=('Price', 'sum')
)
low_quantity_products = product_summary[product_summary['total_quantity'] < 5]
print("\n📦 Products with total quantity < 5:\n", low_quantity_products)


✅ Sample customer_orders.csv created.

👥 Customers with 20 or more orders:
 Series([], Name: OrderID, dtype: int64)

💸 Customers with avg unit price > $120:
 CustomerID
201    1000.0
202     800.0
203     600.0
205     200.0
207     300.0
208     400.0
209     150.0
Name: Price, dtype: float64

📦 Products with total quantity < 5:
 Empty DataFrame
Columns: [total_quantity, total_revenue]
Index: []


In [4]:
import sqlite3
import pandas as pd

# Connect to SQLite DB and load population table.
# Forward slashes are used in the paths because they are accepted on Windows
# as well as POSIX systems; a literal backslash only works on Windows.
conn = sqlite3.connect("task/population.db")
try:
    population_df = pd.read_sql_query("SELECT * FROM population", conn)
finally:
    # Release the database handle even when the query raises
    # (for example if the 'population' table is missing).
    conn.close()

# Load salary bands from Excel
salary_bands = pd.read_excel("task/population salary analysis.xlsx")

# Assume salary_bands has columns: 'Band', 'Min', 'Max'
def assign_salary_band(salary, bands=None):
    """Return the label of the first salary band whose range contains ``salary``.

    Parameters
    ----------
    salary : number
        The salary to classify. Both bounds of a band are inclusive.
    bands : pandas.DataFrame, optional
        Band table with 'Band', 'Min' and 'Max' columns. When omitted, the
        module-level ``salary_bands`` frame is used, so existing calls such
        as ``Series.apply(assign_salary_band)`` keep working unchanged.

    Returns
    -------
    The 'Band' value of the first matching row, or the string 'Unknown' when
    no row matches (this includes NaN salaries, since comparisons with NaN
    are False).
    """
    if bands is None:
        bands = salary_bands
    in_range = (bands['Min'] <= salary) & (bands['Max'] >= salary)
    matching_labels = bands.loc[in_range, 'Band']
    if matching_labels.empty:
        return 'Unknown'
    # If bands overlap, the first matching row in table order wins.
    return matching_labels.iloc[0]

# Apply salary band: label every row with the band its Salary falls into
population_df['SalaryBand'] = population_df['Salary'].apply(assign_salary_band)

# Stats per salary band.
# population_count counts non-null salaries; the percentage is based on the
# number of rows in the band relative to all rows in population_df.
total_population = len(population_df)
band_stats = population_df.groupby('SalaryBand').agg(
    population_count=('Salary', 'count'),
    percent_of_population=('Salary', lambda x: 100 * len(x) / total_population),
    average_salary=('Salary', 'mean'),
    median_salary=('Salary', 'median')
).reset_index()
print("\nSalary Band Stats:\n", band_stats)

# Stats per state within each salary band.
# The percentage must be relative to the population of the row's own state,
# not the whole table, so the per-state row counts are computed first.
state_sizes = population_df.groupby('State').size()
state_band_stats = population_df.groupby(['State', 'SalaryBand']).agg(
    population_count=('Salary', 'count'),
    # Holds the raw row count of the (State, SalaryBand) group at this point;
    # it is converted to a percentage of the state's rows just below.
    percent_of_state_population=('Salary', 'size'),
    average_salary=('Salary', 'mean'),
    median_salary=('Salary', 'median')
).reset_index()
state_band_stats['percent_of_state_population'] = (
    100 * state_band_stats['percent_of_state_population']
    / state_band_stats['State'].map(state_sizes)
)
print("\nState-wise Salary Band Stats:\n", state_band_stats)


OperationalError: unable to open database file