**Task 1**: Checking Null Values for Completeness

**Description**: Verify if there are any null values in a dataset, which indicate incomplete data.

In [1]:
# Write your code from here
import pandas as pd

# Sample dataset: every column contains exactly one missing entry
data = pd.DataFrame(dict(
    name=['Alice', 'Bob', None, 'David'],
    age=[25, None, 30, 40],
    email=['alice@example.com', 'bob@example.com', 'charlie@example.com', None],
))

# Build the boolean null mask once and derive both summaries from it
null_mask = data.isnull()

# Per-column count of missing entries
null_counts = null_mask.sum()

# Single flag: is at least one cell in the whole frame missing?
has_nulls = null_mask.to_numpy().any()

print("Null values per column:", null_counts, sep="\n")
print()
print("Any null values in dataset?:", has_nulls)

Null values per column:
name     1
age      1
email    1
dtype: int64

Any null values in dataset?: True


**Task 2**: Checking Data Type Validity

**Description**: Ensure that columns contain data of expected types, e.g., ages are integers.

In [3]:
# Write your code from here
import pandas as pd

# Sample dataset
data = pd.DataFrame(dict(
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[25, 30, 'thirty-five', 40],  # 'thirty-five' is invalid for age
    email=['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com'],
))

# Python type every value of each column is expected to be an instance of
expected_types = dict(name=str, age=int, email=str)

# Function to check data types validity per column
def check_data_types(df, expected_types):
    """Check every value of the listed columns against its expected type.

    Args:
        df: DataFrame holding the columns to inspect.
        expected_types: Mapping of column name to the type (or tuple of
            types, as accepted by ``isinstance``) its values must have.

    Returns:
        dict mapping each column name to a boolean Series aligned with
        ``df``; ``True`` marks a value of the expected type.

    Raises:
        KeyError: If a column named in ``expected_types`` is not in ``df``.
    """
    results = {}
    for col, expected_type in expected_types.items():
        accepted = expected_type if isinstance(expected_type, tuple) else (expected_type,)

        # bool is a subclass of int, so a plain isinstance() check would let
        # True/False pass as integers. Reject booleans when int is the only
        # accepted type they could match.
        reject_bool = int in accepted and not any(
            t is not int and issubclass(bool, t) for t in accepted
        )

        def is_valid(value, accepted=accepted, reject_bool=reject_bool):
            if reject_bool and isinstance(value, bool):
                return False
            return isinstance(value, accepted)

        results[col] = df[col].apply(is_valid)
    return results

# Per-column boolean Series: True where the value has the expected type
type_validity = check_data_types(data, expected_types)

# Report only the columns that contain at least one value of the wrong type
for col, validity in type_validity.items():
    invalid_rows = data.loc[~validity]
    if invalid_rows.empty:
        continue
    print(f"Invalid data types found in column '{col}':")
    print(invalid_rows[[col]])


Invalid data types found in column 'age':
           age
2  thirty-five


**Task 3**: Verify Uniqueness of Identifiers

**Description**: Check if a dataset has unique identifiers (e.g., emails).

In [4]:
# Write your code from here
import pandas as pd

# Sample dataset, written row by row.
# user_id 2 and b@example.com each appear twice (rows 1 and 4).
data = pd.DataFrame(
    [
        (1, 'a@example.com'),
        (2, 'b@example.com'),
        (3, 'c@example.com'),
        (4, 'd@example.com'),
        (2, 'b@example.com'),
    ],
    columns=['user_id', 'email'],
)

# Function to check uniqueness of a column
def check_uniqueness(df, column):
    """Print a uniqueness report for one column of ``df``.

    Every row whose ``column`` value occurs more than once is listed
    (all occurrences, not just the repeats). Nothing is returned.
    """
    repeated_mask = df.duplicated(column, keep=False)

    if not repeated_mask.any():
        print(f"All values in '{column}' are unique.")
        return

    print(f"Duplicate values found in '{column}':")
    print(df.loc[repeated_mask, [column]])

# Check uniqueness of 'user_id' and 'email'.
# Each call prints its own report; in the sample data both columns contain
# one repeated value, so both reports list duplicate rows.
check_uniqueness(data, 'user_id')
check_uniqueness(data, 'email')

Duplicate values found in 'user_id':
   user_id
1        2
4        2
Duplicate values found in 'email':
           email
1  b@example.com
4  b@example.com


Task 4: Validate Email Format Using Regex

Description: Validate if email addresses in a dataset have the correct format.

In [5]:
# Write your code from here
import pandas as pd
import re

# Sample dataset: a single 'email' column mixing well-formed and malformed addresses
data = pd.Series(
    [
        'valid.email@example.com',
        'invalid-email.com',
        'another.valid@mail.co',
        'bad@address@domain.com',
    ],
    name='email',
).to_frame()

# Email validation regex pattern
email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')

# Function to validate emails
def validate_emails(df, column):
    """Flag and print the rows of ``df`` whose ``column`` is not a valid email.

    A value is valid only when it is a string that ``email_pattern`` matches
    in full. Missing or non-string values (None, NaN, numbers) are reported
    as invalid instead of raising.

    Args:
        df: DataFrame to validate. It is modified in place: a boolean
            ``is_valid_email`` column is added (or overwritten).
        column: Name of the column holding the email addresses.

    Returns:
        None. The outcome is printed.
    """
    def is_valid(value):
        # fullmatch() is used because '$' alone also matches just before a
        # trailing newline, which would let 'user@example.com\n' through.
        return isinstance(value, str) and email_pattern.fullmatch(value) is not None

    # astype(bool) keeps the mask boolean even for an empty column, so the
    # '~' inversion below is always a logical NOT.
    df['is_valid_email'] = df[column].apply(is_valid).astype(bool)
    invalid_emails = df[~df['is_valid_email']]
    if invalid_emails.empty:
        print("All emails are valid.")
    else:
        print("Invalid emails found:")
        print(invalid_emails[[column]])

# Run validation on the sample 'email' column.
# The call prints the rows that fail the pattern and also adds an
# 'is_valid_email' column to `data`.
validate_emails(data, 'email')

Invalid emails found:
                    email
1       invalid-email.com
3  bad@address@domain.com


Task 5: Check for Logical Age Validity

Description: Ensure ages are within a reasonable human range (e.g., 0-120).

In [6]:
# Write your code from here
import pandas as pd

# Sample dataset: ages below, inside, on the edges of, and above the 0-120 range
data = pd.DataFrame([25, -5, 130, 45, 0, 120, 121], columns=['age'])

# Function to check logical age validity
def check_age_validity(df, column, min_age=0, max_age=120):
    """Flag and print the rows of ``df`` whose age is not plausible.

    An age is valid when it is an ``int`` or ``float`` (but not a ``bool``)
    and lies within ``[min_age, max_age]``, both bounds included. Anything
    else, including strings, None, and NaN, is reported as invalid.

    Args:
        df: DataFrame to validate. It is modified in place: a boolean
            ``is_valid_age`` column is added (or overwritten).
        column: Name of the column holding the ages.
        min_age: Smallest accepted age (inclusive).
        max_age: Largest accepted age (inclusive).

    Returns:
        None. The outcome is printed.
    """
    def is_plausible(value):
        # bool is a subclass of int; without this guard True/False would be
        # accepted as the ages 1 and 0.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        # NaN fails both comparisons, so missing numeric values are invalid.
        return min_age <= value <= max_age

    # astype(bool) keeps the mask boolean even for an empty column, so the
    # '~' inversion below is always a logical NOT.
    df['is_valid_age'] = df[column].apply(is_plausible).astype(bool)
    invalid_ages = df[~df['is_valid_age']]
    if invalid_ages.empty:
        print("All ages are within the valid range.")
    else:
        print("Invalid ages found:")
        print(invalid_ages[[column]])

# Run check on the sample 'age' column with the default 0-120 bounds.
# The call prints the out-of-range rows and also adds an 'is_valid_age'
# column to `data`.
check_age_validity(data, 'age')

Invalid ages found:
   age
1   -5
2  130
6  121


Task 6: Identify and Handle Missing Data

Description: Identify missing values in a dataset and impute them using a simple strategy (e.g., mean).

In [7]:
# Write your code from here
import pandas as pd
import numpy as np

# Sample dataset with missing values (two of the six ages are NaN)
data = pd.Series([25, np.nan, 35, 40, np.nan, 55], name='age').to_frame()
ages = data['age']

# Count the missing entries
missing_count = ages.isna().sum()
print(f"Missing values in 'age': {missing_count}")

# Mean of the observed ages (NaN entries are skipped), used as the fill value
mean_age = ages.mean()

# Keep the raw column and put the imputed values next to it for comparison
data = data.assign(age_filled=ages.fillna(mean_age))

print("Data after imputation:", data, sep="\n")

Missing values in 'age': 2
Data after imputation:
    age  age_filled
0  25.0       25.00
1   NaN       38.75
2  35.0       35.00
3  40.0       40.00
4   NaN       38.75
5  55.0       55.00


Task 7: Detect Duplicates

Description: Detect duplicate rows in the dataset.

In [8]:
# Write your code from here
import pandas as pd

# Sample dataset with duplicates: (2, 'Bob') occurs twice, (4, 'David') three times
data = pd.DataFrame(dict(
    id=[1, 2, 2, 3, 4, 4, 4],
    name=['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'David', 'David'],
))

# Mark every row that repeats an earlier row; first occurrences stay unmarked
repeat_mask = data.duplicated()
duplicates = data.loc[repeat_mask]

print("Duplicate rows:", duplicates, sep="\n")

Duplicate rows:
   id   name
2   2    Bob
5   4  David
6   4  David


Task 8: Validate Correctness of Numerical Values

Description: Ensure numerical columns are within a specified range.

In [9]:
import pandas as pd

# Sample dataset
data = pd.DataFrame(dict(
    product_id=[101, 102, 103, 104],
    price=[25.5, 100.0, -5.0, 2000.0],  # price should be between 0 and 1000
))

# Bounds of the accepted price range; the bounds themselves count as valid
min_price, max_price = 0, 1000

# A row is invalid when its price is below the lower bound or above the upper bound
prices = data['price']
out_of_range = prices.lt(min_price) | prices.gt(max_price)
invalid_prices = data.loc[out_of_range]

print("Rows with invalid prices:", invalid_prices, sep="\n")


Rows with invalid prices:
   product_id   price
2         103    -5.0
3         104  2000.0


Task 9: Custom Completeness Rule Violation Report

Description: Create a report showing which rows violate specific completeness rules, such as mandatory fields being empty.

In [10]:
# Write your code from here
import pandas as pd

# Sample dataset with some missing mandatory fields: 'name' and 'email'
data = pd.DataFrame(dict(
    customer_id=[1, 2, 3, 4],
    name=['Alice', '', 'Charlie', None],
    email=['alice@example.com', 'bob@example.com', '', None],
    phone=['123-456-7890', '234-567-8901', '345-678-9012', '456-789-0123'],
))

# Columns that must be filled in for a row to count as complete
mandatory_fields = ['name', 'email']

# A mandatory cell is blank when it is null or an empty string;
# a row violates the rule when any of its mandatory cells is blank
required = data[mandatory_fields]
is_blank = required.isna() | required.eq('')
violations = data.loc[is_blank.any(axis=1)]

print("Rows violating completeness rules (missing mandatory fields):", violations, sep="\n")

Rows violating completeness rules (missing mandatory fields):
   customer_id     name            email         phone
1            2           bob@example.com  234-567-8901
2            3  Charlie                   345-678-9012
3            4     None             None  456-789-0123


Task 10: Advanced Regex for Data Validity Check

Description: Check for validity with advanced regex patterns, such as validating complex fields with multi-level rules.

In [11]:
# Write your code from here
import pandas as pd
import re

# Sample data with complex field: a custom ID with format rules:
# Example rule: ID must start with 2 uppercase letters, followed by 4 digits, then a dash, then 3 lowercase letters
data = pd.DataFrame({
    'custom_id': ['AB1234-xyz', 'XY5678-abc', 'A1234-xyz', 'AB12345-xyz', 'AB1234-XYZ', 'AB1234-xy']
})

# Advanced regex pattern for custom ID validation
pattern = re.compile(r'^[A-Z]{2}\d{4}-[a-z]{3}$')


def is_valid_custom_id(value):
    """Return True only when ``value`` is a string that ``pattern`` matches in full.

    ``fullmatch`` is used instead of ``match`` because ``$`` also matches just
    before a trailing newline, so ``'AB1234-xyz\\n'`` would otherwise be
    accepted. Non-string values (None, NaN, numbers) are invalid rather than
    raising a TypeError.
    """
    return isinstance(value, str) and pattern.fullmatch(value) is not None


# Validate each ID against the regex pattern
data['valid_custom_id'] = data['custom_id'].apply(is_valid_custom_id)

print(data)

     custom_id  valid_custom_id
0   AB1234-xyz             True
1   XY5678-abc             True
2    A1234-xyz            False
3  AB12345-xyz            False
4   AB1234-XYZ            False
5    AB1234-xy            False
