# Data Transformation Operations in Python
This notebook demonstrates common data manipulation operations using a fictional dataset `pathology_test_data`. Each section includes an explanation and a code example.

## Filter
Filtering a dataset means selecting rows that satisfy a logical condition.

In [None]:
# Example: Filter rows where blood_test_name equals 'Haemoglobin' and year is 2010
import pandas as pd

# Sample data: three fictional pathology results, one row per test
data = {
    'date': pd.to_datetime(['2010-01-01', '2011-06-23', '2010-07-15']),
    'blood_test_name': ['Haemoglobin', 'Cholesterol', 'Haemoglobin'],
    'blood_test_result': [13.5, 5.2, 14.0],
}
pathology_test_data = pd.DataFrame(data)

# Filter: build one boolean mask per condition, then keep the rows where both hold
is_2010 = pathology_test_data['date'].dt.year.eq(2010)
is_haemoglobin = pathology_test_data['blood_test_name'].eq('Haemoglobin')
filtered_data = pathology_test_data.loc[is_2010 & is_haemoglobin]
filtered_data

## Select
Select a subset of columns from the dataset.

In [None]:
# Keep every row, but only the test name and result columns
columns_to_keep = ['blood_test_name', 'blood_test_result']
selected_data = pathology_test_data.loc[:, columns_to_keep]
selected_data

## Functions
A function takes inputs and produces a defined output. For example, the function below calculates body mass index (BMI) as weight divided by height squared:

In [None]:
# Function to calculate BMI
def calculate_bmi(weight: float, height: float) -> float:
    """Return the body mass index: weight divided by the square of height.

    Args:
        weight: Body weight. Units are not checked; BMI is conventionally
            computed from kilograms.
        height: Body height. Units are not checked; BMI is conventionally
            computed from metres.

    Returns:
        The quotient ``weight / height ** 2``.

    Raises:
        ZeroDivisionError: If ``height`` is a plain number equal to zero.
    """
    height_squared = height ** 2
    return weight / height_squared

# Example: the cell displays the value returned for weight 70 and height 1.75
calculate_bmi(70, 1.75)

## Group By
Group data by a column and aggregate values within each group.

In [None]:
# Split the results by test name, then take the largest result in each group
results_by_test = pathology_test_data.groupby('blood_test_name')['blood_test_result']
grouped = results_by_test.max()
grouped

## Sort
Sort the dataset by a specific column.

In [None]:
# Order the rows chronologically (sort_values sorts ascending by default)
sorted_data = pathology_test_data.sort_values('date')
sorted_data

## Add Columns
Add a new column derived from existing columns.

In [None]:
# Derive the month name of each test date, then store it as a new column.
# The assignment modifies pathology_test_data itself, so later cells see the column.
month_names = pathology_test_data['date'].dt.month_name()
pathology_test_data['month_name'] = month_names
pathology_test_data

## Join
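Join two datasets on a common key. The `how` argument controls which keys are kept: `left` keeps every row of the left table, `right` keeps every row of the right table, `inner` keeps only keys present in both tables, and `outer` keeps keys present in either table.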

In [None]:
# Build a small lookup table that maps each patientid to a postcode
postcode_columns = {
    'patientid': [1, 2, 3],
    'postcode': ['2000', '3000', '4000'],
}
postcodes = pd.DataFrame(postcode_columns)

# Give every pathology row a patientid so the two frames share a join key
pathology_test_data['patientid'] = [1, 2, 3]

# Left join: keep every pathology row and attach the matching postcode.
# Every patientid here exists in both frames, so no postcode ends up missing.
left_join = pd.merge(pathology_test_data, postcodes, on='patientid', how='left')
left_join

In [None]:
# Right join: keep every postcode row and attach the matching pathology columns
right_join = pd.merge(pathology_test_data, postcodes, on='patientid', how='right')
right_join

In [None]:
# Inner join: keep only the patientid values that appear in both frames
inner_join = pd.merge(pathology_test_data, postcodes, on='patientid', how='inner')
inner_join

In [None]:
# Outer join: keep every patientid that appears in either frame
outer_join = pd.merge(pathology_test_data, postcodes, on='patientid', how='outer')
outer_join