In [None]:
# **Homework Assignment 1: Analyzing Sales Data**


# You are given a dataset containing sales data for an e-commerce website. 
# The dataset (`task\sales_data.csv`) has the following columns:

# - `Date`: Date of the sale.
# - `Product`: Name of the product sold.
# - `Category`: Category to which the product belongs.
# - `Quantity`: Number of units sold.
# - `Price`: Price per unit.

import pandas as pd
sales_df = pd.read_csv('task/sales_data.csv')

# **Tasks:**

# 1. Group the data by the `Category` column and calculate the following aggregate statistics for each category:
#    - Total quantity sold.
#    - Average price per unit.
#    - Maximum quantity sold in a single transaction.



print(sales_df.groupby(['Category']).agg({'Quantity': ['sum', 'max'], 'Price': ['mean']}))

# 2. Identify the top-selling product in each category based on the total quantity sold.

# calculating sum per category and product
product_sales = sales_df.groupby(['Category', 'Product'])['Quantity'].agg('sum')
top_products = product_sales.groupby('Category').idxmax().apply(lambda x: x[1])
print(top_products)

# 3. Find the date on which the highest total sales (quantity * price) occurred.

# adding Total_Sales column

sales_df['Total_Sales'] = sales_df['Quantity'] * sales_df['Price']

print(sales_df.groupby(['Date'])['Total_Sales'].agg(['sum']).nlargest(1, 'sum'))



            Quantity           Price
                 sum max        mean
Category                            
Clothing         157  15   31.176471
Electronics      183  15  276.764706
Home             144  14   55.000000
Category
Clothing                 Jeans
Electronics           Smart TV
Home           Pressure Cooker
Name: Quantity, dtype: object
              sum
Date             
2023-01-07  15150


In [146]:
# **Homework Assignment 2: Examining Customer Orders**

# You have a dataset (`task\customer_orders.csv`) containing information about customer orders. 
# The dataset has the following columns:

# - `OrderID`: Unique identifier for each order.
# - `CustomerID`: Unique identifier for each customer.
# - `Product`: Name of the product ordered.
# - `Quantity`: Number of units ordered.
# - `Price`: Price per unit.


orders_df = pd.read_csv('task/customer_orders.csv')

# **Tasks:**

# 1. Group the data by `CustomerID` and filter out customers who have made less than 20 orders.

cust_grouped = orders_df.groupby(['CustomerID'])
cust_counts = cust_grouped['OrderID'].count()
print(cust_counts[cust_counts >= 20])


# 2. Identify customers who have ordered products with an average price per unit greater than $120.



cust_pr_grouped =  orders_df.groupby(['CustomerID', 'Product'])
mean_pr = cust_pr_grouped['Price'].mean()
greater_pr = mean_pr[mean_pr > 120]
customer_ids = greater_pr.index.get_level_values(0).unique()
print(customer_ids)

# 3. Find the total quantity and total price for each product ordered, 
# and filter out products that have a total quantity less than 5 units.
orders_df['Total_Price'] = orders_df['Quantity'] * orders_df['Price']
pr_totals = orders_df.groupby(['Product']).agg({'Quantity' : 'sum', 'Total_Price': 'sum'})
print(pr_totals[pr_totals['Quantity'] >= 5])


CustomerID
101    21
102    21
103    20
104    20
Name: OrderID, dtype: int64
Index([101, 102, 103, 104, 105], dtype='int64', name='CustomerID')
                  Quantity  Total_Price
Product                                
Cargo Pants              6          180
Dress Shirt              5          125
Formal Shirt             6          210
Smartphone               5         2000
Sport Shoes              5          200
Sunglasses               5           75
Wireless Earbuds         6          720


In [192]:
import sqlite3
import pandas as pd
import re


# **Homework Assignment 3: Population Salary Analysis**

# 1. "task\population.db" sqlite database has `population` table.

# Load data from SQLite database
conn = sqlite3.connect('task/population.db')
population_df = pd.read_sql_query("SELECT * FROM population", conn)
conn.close()


# 2. "task\population salary analysis.xlsx" file defines Salary Band categories. <br />
#     Read the data from population table and calculate following measures:
#     - Percentage of population for each salary category;
#     - Average salary in each salary category;
#     - Median salary in each salary category;
#     - Number of population in each salary category;

# Load salary band categories
salary_bands = pd.read_excel('task/population_salary_analysis.xlsx')

# Extract numeric salary ranges dynamically
def extract_salary_range(label):
    numbers = list(map(int, re.findall(r'\d+', label.replace(',', ''))))
    if "over" in label:  # Handles "X and over" cases
        return numbers[0], float('inf')
    elif "till" in label:  # Handles "till X" cases
        return (0, numbers[0])
    return (numbers[0], numbers[1]) if len(numbers) > 1 else (numbers[0], float('inf'))

# Generate bins and labels
ranges = [extract_salary_range(row) for row in salary_bands['Salary Band']]
bins = sorted(set([0] + [r[0] for r in ranges] + [r[1] for r in ranges if r[1] != float('inf')]))

# Remove bins ending in 1
bins = [b for b in bins if b % 10 != 1 or b == float('inf')]
labels = salary_bands['Salary Band'][:len(bins)-1]  # Ensure correct label count

# Ensure bins are strictly increasing
if not all(bins[i] < bins[i + 1] for i in range(len(bins) - 1)):
    raise ValueError("Bins must be strictly increasing. Check salary band definitions.")

# Categorize salaries dynamically
population_df['Salary Band'] = pd.cut(population_df['salary'], bins=bins, labels=labels, right=True)

# Compute overall statistics
grouped = population_df.groupby('Salary Band')['salary'].agg([
    ('Percentage', lambda x: len(x) / len(population_df) * 100),
    ('Average Salary', 'mean'),
    ('Median Salary', 'median'),
    ('Number of Population', 'count')
]).reset_index()



# 3. Calculate the same measures in each State

# Note: Use SQL only to select data from database. All the other calculations should be done in python.


# Compute state-wise statistics
state_grouped = population_df.groupby(['state', 'Salary Band'])['salary'].agg([
    ('Percentage', lambda x: len(x) / len(population_df) * 100),
    ('Average Salary', 'mean'),
    ('Median Salary', 'median'),
    ('Number of Population', 'count')
]).reset_index()


print(grouped)
print(state_grouped)


               Salary Band  Percentage  Average Salary  Median Salary  \
0            till $200,000    9.878980    9.928399e+04        98800.0   
1      $200,001 - $400,000   10.042056    2.995581e+05       299882.0   
2      $400,001 - $600,000   10.591366    4.991640e+05       497925.5   
3      $600,001 - $800,000    9.921895    6.996809e+05       701317.0   
4    $800,001 - $1,000,000   10.084971    9.011523e+05       899845.0   
5  $1,000,001 - $1,200,000   10.531285    1.098524e+06      1097765.0   
6  $1,200,001 - $1,400,000    9.707321    1.300685e+06      1300430.0   
7  $1,400,001 - $1,600,000    9.715904    1.499606e+06      1500623.0   
8  $1,600,001 - $1,800,000    9.612909    1.698519e+06      1697481.5   

   Number of Population  
0                  1151  
1                  1170  
2                  1234  
3                  1156  
4                  1175  
5                  1227  
6                  1131  
7                  1132  
8                  1120  
       st

  grouped = population_df.groupby('Salary Band')['salary'].agg([
  state_grouped = population_df.groupby(['state', 'Salary Band'])['salary'].agg([
