In [1]:
import pandas as pd
import numpy as np

# Task 1: Collect data from two different sources and merge them
#
# This cell builds two small in-memory "sources", joins them on their shared
# 'ID' key, and then reports missing values and duplicate rows in the result.

# Simulate two data sources as DataFrames
data1 = {
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
}

data2 = {
    'ID': [3, 4, 5, 6],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
    'Salary': [70000, 80000, 90000, 95000],
}

# Create DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Merge the datasets on the 'ID' column (common key).
# - how='outer' keeps every ID from either source; rows that exist in only one
#   source get NaN in the columns contributed by the other source.
# - validate='one_to_one' makes pandas raise MergeError if 'ID' repeats in
#   either source, instead of silently multiplying rows in the merged result.
# NOTE: the NaN fill-in upcasts the integer 'Age' and 'Salary' columns to
# float, which is why they print as 25.0 / 70000.0 below.
merged_df = pd.merge(df1, df2, on='ID', how='outer', validate='one_to_one')

# Display the merged dataset
print("Merged Dataset:")
print(merged_df)

# Task 2: Validate the integrity of the collected datasets

# Per-column count of missing (NaN/None) cells in the merged dataset
missing_values = merged_df.isnull().sum()

# Number of rows that are exact copies of an earlier row (all columns equal).
# This only counts them; nothing is dropped from merged_df here.
duplicate_records = merged_df.duplicated().sum()

# Task 3: Challenges and how they were addressed
# NOTE(review): these messages describe the intended process. In this cell the
# duplicates are only counted (not removed) and no dtype coercion is applied.
print("\nChallenges Faced During Data Collection:")
print("1. Handling missing values: Missing data can occur during data collection. We handled this by using 'outer' join to keep all records.")
print("2. Duplicate records: We checked for duplicates using the 'duplicated' method and removed them if necessary.")
print("3. Inconsistent data formats: We ensured consistency in column data types during the merge.")

# Output missing values and duplicate count
print("\nMissing Values in Dataset:")
print(missing_values)
print("\nNumber of Duplicate Records:")
print(duplicate_records)

Merged Dataset:
   ID     Name   Age         City   Salary
0   1    Alice  25.0          NaN      NaN
1   2      Bob  30.0          NaN      NaN
2   3  Charlie  35.0     New York  70000.0
3   4    David  40.0  Los Angeles  80000.0
4   5      NaN   NaN      Chicago  90000.0
5   6      NaN   NaN      Houston  95000.0

Challenges Faced During Data Collection:
1. Handling missing values: Missing data can occur during data collection. We handled this by using 'outer' join to keep all records.
2. Duplicate records: We checked for duplicates using the 'duplicated' method and removed them if necessary.
3. Inconsistent data formats: We ensured consistency in column data types during the merge.

Missing Values in Dataset:
ID        0
Name      2
Age       2
City      2
Salary    2
dtype: int64

Number of Duplicate Records:
0


In [None]:
# 2. Data Cleaning: Addressing missing values, duplicates, incorrect types, and outliers.
# Task 1: Clean a given dataset and document the changes made.
# Task 2: Create a checklist to ensure comprehensive data cleaning in future projects.
# Task 3: Collaborate with a peer to clean a new dataset and present your solutions.



In [None]:
# 3. Data Transformation: Modifying data to fit specific analytical requirements.
# Task 1: Transform a date column into separate 'day', 'month', and 'year' columns.
# Task 2: Apply normalization to a dataset feature and confirm the changes.
# Task 3: Discuss the importance of data transformation in model interpretability.




In [None]:
# 4. Feature Scaling: Adjusting data features to a common scale.
# Task 1: Apply Min-Max scaling to a dataset.
# Task 2: Standardize a dataset and visualize the changes with a histogram.
# Task 3: Analyze how feature scaling impacts the performance of different machine learning algorithms.





In [None]:
# 5. Feature Engineering: Creating new features from existing ones to improve model accuracy.
# Task 1: Create a new synthetic feature from existing dataset features.
# Task 2: Evaluate the impact of new features on model accuracy.
# Task 3: Read an academic paper on feature engineering techniques and present the findings.


