In [1]:
import pandas as pd
import numpy as np

# Demo dataset: five people; Charlie's age and salary are intentionally missing
# (np.nan) so the imputation step below has something to fill.
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, np.nan, 35, 28],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'],
    'salary': [60000, 75000, np.nan, 80000, 65000],
}
df = pd.DataFrame(data)

# --- 1. Handling missing values -------------------------------------------
# Both statistics skip NaN by default, so they are computed from the four
# known values in each column.
mean_age = df['age'].mean()
median_salary = df['salary'].median()
df['age'] = df['age'].fillna(mean_age)
df['salary'] = df['salary'].fillna(median_salary)

# --- 2. Data transformation -----------------------------------------------
# Bucket ages into labelled bands. pd.cut uses right-closed intervals by
# default, i.e. (20, 30], (30, 40], (40, 50] -- an age of exactly 30 lands in
# '20-30', and anything outside (20, 50] would become NaN.
age_bin_edges = [20, 30, 40, 50]
age_bin_labels = ['20-30', '30-40', '40-50']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# --- 3. Data aggregation --------------------------------------------------
# Mean salary per city (Series named 'salary', indexed by sorted 'city').
avg_salary_by_city = df['salary'].groupby(df['city']).mean()

# --- 4. Data filtering ----------------------------------------------------
# Keep rows with 25 <= age <= 35; Series.between is inclusive on both ends.
filtered_data = df[df['age'].between(25, 35)]

# --- Display the results --------------------------------------------------
# NOTE: the frame shown under "Original Data:" is `df` *after* the imputation
# and age_group steps above, because those steps modify `df` in place.
report_sections = [
    ("Original Data:", df),
    ("\nAverage Salary by City:", avg_salary_by_city),
    ("\nFiltered Data:", filtered_data),
]
for heading, table in report_sections:
    print(heading)
    print(table)

Original Data:
      name   age         city   salary age_group
0    Alice  25.0     New York  60000.0     20-30
1      Bob  30.0  Los Angeles  75000.0     20-30
2  Charlie  29.5      Chicago  70000.0     20-30
3    David  35.0      Houston  80000.0     30-40
4      Eve  28.0        Miami  65000.0     20-30

Average Salary by City:
city
Chicago        70000.0
Houston        80000.0
Los Angeles    75000.0
Miami          65000.0
New York       60000.0
Name: salary, dtype: float64

Filtered Data:
      name   age         city   salary age_group
0    Alice  25.0     New York  60000.0     20-30
1      Bob  30.0  Los Angeles  75000.0     20-30
2  Charlie  29.5      Chicago  70000.0     20-30
3    David  35.0      Houston  80000.0     30-40
4      Eve  28.0        Miami  65000.0     20-30
