In [1]:
import pandas as pd

In [2]:
# Creating the DataFrame

data = {
    'id': list(range(1, 21)),  # Employee IDs from 1 to 20
    'name': [
        'Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Henry', 
        'Ivy', 'Jack', 'Kevin', 'Laura', 'Mike', 'Nancy', 'Oliver', 'Paul', 
        'Quinn', 'Rachel', 'Steve', 'Tina'
    ],
    'department': [
        'HR', 'IT', 'Finance', 'IT', 'HR', 'IT', 'Finance', 'IT', 
        'Marketing', 'Sales', 'Finance', 'Sales', 'IT', 'HR', 'Marketing', 'IT', 
        'Finance', 'Sales', 'Marketing', 'HR'
    ],
    'salary': [
        60000, 85000, 75000, 92000, 54000, 48000, 97000, 110000, 
        68000, 73000, 89000, 72000, 95000, 51000, 67000, 88000, 
        93000, 76000, 70000, 59000
    ],
    'joining_date': pd.to_datetime([
        '2021-06-15', '2020-09-23', '2018-02-14', '2019-11-01', '2022-05-10', 
        '2021-07-20', '2017-08-25', '2016-12-05', '2020-04-17', '2019-03-08', 
        '2015-06-25', '2021-09-30', '2018-11-20', '2019-07-15', '2017-02-28', 
        '2016-10-05', '2020-12-11', '2018-05-22', '2019-08-13', '2022-01-19'
    ]),
    'city': [
        'New York', 'San Francisco', 'Chicago', 'Austin', 'New York', 'San Francisco', 'Chicago', 'Austin', 
        'Seattle', 'Boston', 'Chicago', 'Boston', 'Austin', 'New York', 'Seattle', 'San Francisco', 
        'Chicago', 'Boston', 'Seattle', 'New York'
    ]
}

In [3]:
df = pd.DataFrame(data)

#### Total Salary Paid Per Department

In [5]:
#SELECT department, SUM(salary) AS total_salary FROM employees GROUP BY department;
df.groupby('department')['salary'].sum().reset_index().rename(columns = {'salary' : 'total_salary'})

Unnamed: 0,department,total_salary
0,Finance,354000
1,HR,224000
2,IT,518000
3,Marketing,205000
4,Sales,221000


#### Average Salary Per Department (Sorted by Highest)

In [7]:
#SELECT department, AVG(salary) AS avg_salary FROM employees GROUP BY department ORDER BY avg_salary DESC;
df.groupby('department')['salary'].mean().round(2).reset_index().sort_values(by='salary',ascending=False).rename(columns = {'salary' : 'avg_salary'})

Unnamed: 0,department,avg_salary
0,Finance,88500.0
2,IT,86333.33
4,Sales,73666.67
3,Marketing,68333.33
1,HR,56000.0


#### Count of Employees Per City

In [9]:
#SELECT city, COUNT(*) AS employee_count FROM employees GROUP BY city;
df.groupby('city')['id'].count().reset_index().rename(columns = {'id' : 'employee_count'})

Unnamed: 0,city,employee_count
0,Austin,3
1,Boston,3
2,Chicago,4
3,New York,4
4,San Francisco,3
5,Seattle,3


#### Find Departments Where Total Salary is Greater Than 300K

In [11]:
#SELECT department, SUM(salary) AS total_salary FROM employees GROUP BY department HAVING SUM(salary) > 300000;
df.groupby('department')['salary'].sum().reset_index().query('salary > 300000').rename(columns = {'salary' : 'total_salary'})

Unnamed: 0,department,total_salary
0,Finance,354000
2,IT,518000


#### Find Cities With More Than 3 Employees (Using HAVING)

In [13]:
#SELECT city, COUNT(*) AS employee_count FROM employees GROUP BY city HAVING COUNT(*) > 3;
df.groupby('city').agg(employee_count = ('id', 'count')).reset_index().query("employee_count > 3")

Unnamed: 0,city,employee_count
2,Chicago,4
3,New York,4


#### Highest Paid Employee Per Department

In [15]:
#SELECT department, name, salary FROM employees e1 WHERE salary = (SELECT MAX(salary) FROM employees e2 WHERE e1.department = e2.department);
df.loc[df.groupby('department')['salary'].idxmax(),['department','name','salary']].reset_index(drop = True)

Unnamed: 0,department,name,salary
0,Finance,Grace,97000
1,HR,Alice,60000
2,IT,Henry,110000
3,Marketing,Steve,70000
4,Sales,Rachel,76000


#### Number of Employees Who Joined After 2019, Grouped by Department

In [17]:
#SELECT department, COUNT(*) AS employee_count FROM employees GROUP BY department WHERE joining_date > '2019-01-01';
df[df['joining_date'] > '2019-01-01'].groupby('department')['id'].count().reset_index().rename(columns = {'id':'employee_count'})

Unnamed: 0,department,employee_count
0,Finance,1
1,HR,4
2,IT,3
3,Marketing,2
4,Sales,2


#### Average Salary Per Department and City

In [19]:
#SELECT department, city, AVG(salary) AS avg_salary FROM employees GROUP BY department, city;
df.groupby(['department','city'])['salary'].mean().round(2).reset_index().rename(columns={'salary':'avg_salary'})

Unnamed: 0,department,city,avg_salary
0,Finance,Chicago,88500.0
1,HR,New York,56000.0
2,IT,Austin,99000.0
3,IT,San Francisco,73666.67
4,Marketing,Seattle,68333.33
5,Sales,Boston,73666.67


#### Find the department with the highest total salary

In [21]:
#SELECT department, SUM(salary) AS total_salary FROM employees GROUP BY department ORDER BY total_salary DESC LIMIT 1;
df.groupby('department')['salary'].sum().reset_index().nlargest(1, 'salary')

Unnamed: 0,department,salary
2,IT,518000


#### Find percentage of total salary each department contributes

In [23]:
#SELECT department, SUM(salary) AS total_salary, (SUM(salary) * 100.0 / (SELECT SUM(salary) FROM employees)) AS percentage
#FROM employees 
#GROUP BY department 
#ORDER BY percentage DESC;

In [24]:
df_salary = df.groupby('department')['salary'].sum().reset_index()
df_salary['percentage'] = ((df_salary['salary'] / df_salary['salary'].sum()) * 100).round(2)
df_salary.sort_values(by='percentage', ascending=True)

Unnamed: 0,department,salary,percentage
3,Marketing,205000,13.47
4,Sales,221000,14.52
1,HR,224000,14.72
0,Finance,354000,23.26
2,IT,518000,34.03


#### Find the second highest salary in each department

In [26]:
#SELECT name, department, salary 
#FROM employees e1 
#WHERE salary > (SELECT AVG(salary) FROM employees e2 WHERE e1.department = e2.department);

In [27]:
(df[df['salary'] < df.groupby('department')['salary'].transform('max')].groupby('department', as_index=False)['salary']
.max().rename(columns={'salary': 'second_highest_salary'}))

Unnamed: 0,department,second_highest_salary
0,Finance,93000
1,HR,59000
2,IT,95000
3,Marketing,68000
4,Sales,73000
