In [2]:
import numpy as np
import pandas as pd
import duckdb

def sqldf(query) :
    return duckdb.sql(query).df()


In [3]:
#1. Create the dataframe
data = {
 'EmployeeID': [101, 102, 103, 104, 105, 106, 107],
 'LastName': ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller'],
 'Department': ['Sales', 'IT', 'Marketing', 'Sales', 'HR', None, 'IT'],
 'Salary': [65000.00, 95000.00, 55000.00, 72000.00, 80000.00, 48000.00, 110000.00],
 'HireDate': ['2023-01-15', '2024-06-01', '2023-08-20', '2023-10-10', '2024-03-05', '2022-11-22', '2025-01-01']
}

Employees = pd.DataFrame(data)

Employees

Unnamed: 0,EmployeeID,LastName,Department,Salary,HireDate
0,101,Smith,Sales,65000.0,2023-01-15
1,102,Johnson,IT,95000.0,2024-06-01
2,103,Williams,Marketing,55000.0,2023-08-20
3,104,Brown,Sales,72000.0,2023-10-10
4,105,Jones,HR,80000.0,2024-03-05
5,106,Garcia,,48000.0,2022-11-22
6,107,Miller,IT,110000.0,2025-01-01


In [4]:
#Define a simple SQL query

sql_query = """SELECT * FROM Employees;"""

In [5]:
result_df = sqldf(sql_query)

print(result_df)

   EmployeeID  LastName Department    Salary    HireDate
0         101     Smith      Sales   65000.0  2023-01-15
1         102   Johnson         IT   95000.0  2024-06-01
2         103  Williams  Marketing   55000.0  2023-08-20
3         104     Brown      Sales   72000.0  2023-10-10
4         105     Jones         HR   80000.0  2024-03-05
5         106    Garcia       None   48000.0  2022-11-22
6         107    Miller         IT  110000.0  2025-01-01


In [6]:
sql_query = """SELECT EmployeeID,Salary
FROM Employees;"""

In [7]:
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID    Salary
0         101   65000.0
1         102   95000.0
2         103   55000.0
3         104   72000.0
4         105   80000.0
5         106   48000.0
6         107  110000.0


In [8]:
sql_query = """SELECT * FROM Employees WHERE Department = 'Sales';"""

In [9]:
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID LastName Department   Salary    HireDate
0         101    Smith      Sales  65000.0  2023-01-15
1         104    Brown      Sales  72000.0  2023-10-10


In [10]:
#Find all the employees in the IT dept with a salary greater than 100,000

sql_query = """SELECT * FROM Employees WHERE Department = 'IT' AND Salary > 100000.00;"""

In [11]:
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID LastName Department    Salary    HireDate
0         107   Miller         IT  110000.0  2025-01-01


In [12]:
#filter all the employees in HR department OR joined after '2024-01-01'
sql_query = """SELECT * FROM Employees WHERE Department = 'HR' OR HireDate = '2024-01-01';"""

In [13]:
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID LastName Department   Salary    HireDate
0         105    Jones         HR  80000.0  2024-03-05


In [14]:
# Filter employees where dept is not engineering
sql_query = """SELECT * FROM Employees WHERE NOT Department = 'IT';"""

In [15]:
result_df = sqldf(sql_query)
print(sql_query)

SELECT * FROM Employees WHERE NOT Department = 'IT';


# Filtering with special operators

In [16]:
#find employees whose salary is between 50000 and 75000

sql_query = """SELECT * FROM Employees WHERE Salary BETWEEN 50000 AND 75000;"""
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID  LastName Department   Salary    HireDate
0         101     Smith      Sales  65000.0  2023-01-15
1         103  Williams  Marketing  55000.0  2023-08-20
2         104     Brown      Sales  72000.0  2023-10-10


In [17]:

sql_query = """SELECT * FROM Employees WHERE Department IN ('IT','Sales','HR');"""
result_df = sqldf(sql_query)
print(result_df)

   EmployeeID LastName Department    Salary    HireDate
0         101    Smith      Sales   65000.0  2023-01-15
1         102  Johnson         IT   95000.0  2024-06-01
2         104    Brown      Sales   72000.0  2023-10-10
3         105    Jones         HR   80000.0  2024-03-05
4         107   Miller         IT  110000.0  2025-01-01


# Pattern matching (LIKE)

In [18]:
sql_query = """SELECT LastName, Department, Salary FROM Employees
WHERE LastName LIKE 'J%' OR Department = 'Marketing';"""

result_df = sqldf(sql_query)
print(result_df)

   LastName Department   Salary
0   Johnson         IT  95000.0
1  Williams  Marketing  55000.0
2     Jones         HR  80000.0


# Cleaning Data

In [19]:
import pandas as pd

data = {
'CustomerID': [1, 2, 3, 4, 5],
'Name': [' alice  ', 'BOB SMITH', 'Charlie.D', 'dave ', 'eve '],
'ProductCode': ['A-101-L', 'b-202-XL', 'C-303-S',  'A-101-M ', 'D-404-M'],
'Email': [' aLice@mail.com', 'BOB@work.net ', 'charLIE@web.io ', 
'dave@corp.co ', 'eve@home.net']
}

customer_data = pd.DataFrame(data)

customer_data

Unnamed: 0,CustomerID,Name,ProductCode,Email
0,1,alice,A-101-L,aLice@mail.com
1,2,BOB SMITH,b-202-XL,BOB@work.net
2,3,Charlie.D,C-303-S,charLIE@web.io
3,4,dave,A-101-M,dave@corp.co
4,5,eve,D-404-M,eve@home.net


In [20]:
#TRIM
sql_munging_1 = """SELECT CustomerID, UPPER(TRIM(Name)) AS Standardized_Name,
LOWER(Email) AS Cleaned_Email FROM customer_data;"""

df_standardized = sqldf(sql_munging_1)
print(df_standardized)

   CustomerID Standardized_Name    Cleaned_Email
0           1             ALICE   alice@mail.com
1           2         BOB SMITH    bob@work.net 
2           3         CHARLIE.D  charlie@web.io 
3           4              DAVE    dave@corp.co 
4           5               EVE     eve@home.net


In [21]:
#LTRIM AND RTRIM
sql_munging_2 = """SELECT CustomerID,RTRIM(Name) AS Right_trimmed_Name,
LTRIM(Email) AS Left_Trimmed_Email FROM customer_data;"""

df_standardized = sqldf(sql_munging_2)
print(df_standardized)

   CustomerID Right_trimmed_Name Left_Trimmed_Email
0           1              alice     aLice@mail.com
1           2          BOB SMITH      BOB@work.net 
2           3          Charlie.D    charLIE@web.io 
3           4               dave      dave@corp.co 
4           5                eve       eve@home.net


In [22]:
#REPLACE - Replaces all occurences of a substring with someother string
#REPLACE 'L' with 'XL'

sql_munging_3 = """SELECT CustomerID,ProductCode,REPLACE(REPLACE(ProductCode,'L','XL'),'-','') AS Cleaned_ProductCode
FROM customer_data;"""

df_standardized = sqldf(sql_munging_3)
print(df_standardized)

   CustomerID ProductCode Cleaned_ProductCode
0           1     A-101-L              A101XL
1           2    b-202-XL             b202XXL
2           3     C-303-S               C303S
3           4    A-101-M               A101M 
4           5     D-404-M               D404M


In [23]:
sql_munging_4 = """SELECT CustomerID, ProductCode, LEFT(ProductCode,3) AS Item_Type_Prefix,
RIGHT(Email,4) AS Domain_Suffix FROM customer_data;"""

df_standardized = sqldf(sql_munging_4)
print(df_standardized)

   CustomerID ProductCode Item_Type_Prefix Domain_Suffix
0           1     A-101-L              A-1          .com
1           2    b-202-XL              b-2          net 
2           3     C-303-S              C-3          .io 
3           4    A-101-M               A-1          .co 
4           5     D-404-M              D-4          .net


In [24]:
sql_munging_4_corrected = """SELECT CustomerID, ProductCode, SUBSTR(ProductCode,1,3) AS Item_Type_Prefix,
SUBSTR(Email,-4) AS Domain_Suffix FROM customer_data;"""

df_standardized = sqldf(sql_munging_4_corrected)
print(df_standardized)

   CustomerID ProductCode Item_Type_Prefix Domain_Suffix
0           1     A-101-L              A-1          .com
1           2    b-202-XL              b-2          net 
2           3     C-303-S              C-3          .io 
3           4    A-101-M               A-1          .co 
4           5     D-404-M              D-4          .net


In [25]:
data = {'city': ['NY', 'LA', 'NY', 'LA', 'Chicago'],
'sales': [100, 150, 120, 90, 80]}

df = pd.DataFrame(data)

df

Unnamed: 0,city,sales
0,NY,100
1,LA,150
2,NY,120
3,LA,90
4,Chicago,80


In [26]:
#City wise sum up the sale and arrange in descending order
sql_query = """SELECT City,SUM(sales) AS Total_sales FROM df 
GROUP BY city
ORDER BY total_sales DESC"""

result_df = sqldf(sql_query)

print(result_df)

      city  Total_sales
0       LA        240.0
1       NY        220.0
2  Chicago         80.0


In [27]:
students_df = pd.DataFrame({
'student_id': [1, 2, 3, 4, 5],
'name': ['Zoe', 'Liam', 'Noah', 'Emma', 'Ava'],
'score': [92, 45, 78, 88, 55]
})

In [28]:
sql_case_logic = """SELECT name,score,
CASE
   WHEN score >= 90 THEN 'A'
   WHEN score >= 80 THEN 'B'
   WHEN score >= 70 THEN 'C'
   WHEN score >= 60 THEN 'D'
   ELSE 'F'
END AS letter_grade,
CASE 
   WHEN score >= 60 THEN 'Pass'
   ELSE 'Fail'
END AS status
FROM students_df; """

result_df = sqldf(sql_case_logic)
print(result_df)

   name  score letter_grade status
0   Zoe     92            A   Pass
1  Liam     45            F   Fail
2  Noah     78            C   Pass
3  Emma     88            B   Pass
4   Ava     55            F   Fail


In [30]:
data = {
 'product': ['Laptop', 'Mouse', 'Monitor', 'Keyboard'],
 'sale_price': [1200, None, 300, None],
 'suggested_price':[1150, 25, None, 70],
 'default_price': [1000, 20, 250, 50]
}

df = pd.DataFrame(data)
df

Unnamed: 0,product,sale_price,suggested_price,default_price
0,Laptop,1200.0,1150.0,1000
1,Mouse,,25.0,20
2,Monitor,300.0,,250
3,Keyboard,,70.0,50


# COALESCE

In [32]:
sql_query = """SELECT
            product,sale_price,suggested_price,
            COALESCE(sale_price,suggested_price) AS final_price
            FROM df;"""

result_df = sqldf(sql_query)
print(result_df)

    product  sale_price  suggested_price  final_price
0    Laptop      1200.0           1150.0       1200.0
1     Mouse         NaN             25.0         25.0
2   Monitor       300.0              NaN        300.0
3  Keyboard         NaN             70.0         70.0


In [None]:
sql_query = """SELECT
            product,sale_price,suggested_price,
            COALESCE(sale_price,1000) AS sale_price
            FROM df;"""

result_df = sqldf(sql_query)
print(result_df)


# NULLIF

In [35]:
df = pd.DataFrame({
 'product': ['Laptop', 'Mouse', 'Monitor', 'Keyboard'],
 'revenue': [5000, 200, 1500, 0],
 'units_sold': [10, 5, 0, 0],# 0 will cause division error
 'category': ['Electronics', '', 'Electronics', 'Unknown']
})
df

Unnamed: 0,product,revenue,units_sold,category
0,Laptop,5000,10,Electronics
1,Mouse,200,5,
2,Monitor,1500,0,Electronics
3,Keyboard,0,0,Unknown


In [36]:
sql_query = """SELECT product,
                revenue / NULLIF(units_sold,0) AS price_per_unit,
                NULLIF(category,'') AS cleaned_category FROM df;"""

result_df = sqldf(sql_query)
print(result_df)

    product  price_per_unit cleaned_category
0    Laptop           500.0      Electronics
1     Mouse            40.0             None
2   Monitor             NaN      Electronics
3  Keyboard             NaN          Unknown
