In [3]:
# Importance of Data Cleaning

# 1. Missing Values: Missing data points in a dataset can lead to biased results.
#     Task 1: Load a dataset and identify which columns have missing values.
#     Task 2: Replace missing values in a dataset with the column mean or mode.
#     Task 3: Compare model performance with and without handling missing values.
    


# Ques_2.ipynb
# Module 5: Introduction to Data Preprocessing & Cleaning
# Task: Types of Data Issues

import pandas as pd
import numpy as np

# Sample dataset with various data issues
data = {
    'ID': [1, 2, 2, 4, 5, 6, 7, 8],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eve', None, 'Grace'],
    'Age': [25, 30, 30, 22, None, 29, 28, 27],
    'City': ['New York', 'new york', 'new york', 'Los Angeles', 'Chicago', None, 'Boston', 'Boston'],
    'Signup Date': ['2021-01-01', '01/02/2021', '01/02/2021', '2021-03-04', '03-15-2021', '2021/04/01', '', '2021-05-01'],
    'Score': [85, 90, 90, 88, 9999, 87, 89, 88]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# ---------------------------
# 1. Missing Data
# ---------------------------
print("\n--- Missing Data ---")
print(df.isnull().sum())

# Fill missing Age with mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Fill missing City with 'Unknown'
df['City'].fillna('Unknown', inplace=True)

# Fill missing Name with 'NoName'
df['Name'].fillna('NoName', inplace=True)

# ---------------------------
# 2. Duplicate Data
# ---------------------------
print("\n--- Duplicates ---")
print(df.duplicated())

# Drop duplicates
df = df.drop_duplicates()

# ---------------------------
# 3. Inconsistent Data
# ---------------------------
print("\n--- Before Standardizing City ---")
print(df['City'].value_counts())

# Standardize city names
df['City'] = df['City'].str.lower().str.strip()

print("\n--- After Standardizing City ---")
print(df['City'].value_counts())

# ---------------------------
# 4. Outliers
# ---------------------------
print("\n--- Outlier Detection in Score ---")
print(df['Score'].describe())

# Replace outlier score (9999) with median
median_score = df[df['Score'] < 100]['Score'].median()
df['Score'] = df['Score'].apply(lambda x: median_score if x > 1000 else x)

# ---------------------------
# 5. Incorrect Data Types
# ---------------------------
print("\n--- Converting Signup Date ---")
df['Signup Date'] = pd.to_datetime(df['Signup Date'], errors='coerce')
print(df['Signup Date'])

# ---------------------------
# 6. Summary of Cleaned Data
# ---------------------------
print("\n--- Cleaned DataFrame ---")
print(df)

# Save cleaned data
df.to_csv("cleaned_data.csv", index=False)



Original DataFrame:
   ID     Name   Age         City Signup Date  Score
0   1    Alice  25.0     New York  2021-01-01     85
1   2      Bob  30.0     new york  01/02/2021     90
2   2      Bob  30.0     new york  01/02/2021     90
3   4  Charlie  22.0  Los Angeles  2021-03-04     88
4   5    David   NaN      Chicago  03-15-2021   9999
5   6      Eve  29.0         None  2021/04/01     87
6   7     None  28.0       Boston                 89
7   8    Grace  27.0       Boston  2021-05-01     88

--- Missing Data ---
ID             0
Name           1
Age            1
City           1
Signup Date    0
Score          0
dtype: int64

--- Duplicates ---
0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

--- Before Standardizing City ---
City
Boston         2
New York       1
new york       1
Los Angeles    1
Chicago        1
Unknown        1
Name: count, dtype: int64

--- After Standardizing City ---
City
new york       2
boston         2
los a

In [4]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
    
    

In [5]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
    
    
    # Ques_2.ipynb
# Module 5: Introduction to Data Preprocessing & Cleaning
# Task: Types of Data Issues

import pandas as pd
import numpy as np

# Sample dataset with various data issues
data = {
    'ID': [1, 2, 2, 4, 5, 6, 7, 8],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eve', None, 'Grace'],
    'Age': [25, 30, 30, 22, None, 29, 28, 27],
    'City': ['New York', 'new york', 'new york', 'Los Angeles', 'Chicago', None, 'Boston', 'Boston'],
    'Signup Date': ['2021-01-01', '01/02/2021', '01/02/2021', '2021-03-04', '03-15-2021', '2021/04/01', '', '2021-05-01'],
    'Score': [85, 90, 90, 88, 9999, 87, 89, 88]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# ---------------------------
# 1. Missing Data
# ---------------------------
print("\n--- Missing Data ---")
print(df.isnull().sum())

# Fill missing Age with mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Fill missing City with 'Unknown'
df['City'].fillna('Unknown', inplace=True)

# Fill missing Name with 'NoName'
df['Name'].fillna('NoName', inplace=True)

# ---------------------------
# 2. Duplicate Data
# ---------------------------
print("\n--- Duplicates ---")
print(df.duplicated())

# Drop duplicates
df = df.drop_duplicates()

# ---------------------------
# 3. Inconsistent Data
# ---------------------------
print("\n--- Before Standardizing City ---")
print(df['City'].value_counts())

# Standardize city names
df['City'] = df['City'].str.lower().str.strip()

print("\n--- After Standardizing City ---")
print(df['City'].value_counts())

# ---------------------------
# 4. Outliers
# ---------------------------
print("\n--- Outlier Detection in Score ---")
print(df['Score'].describe())

# Replace outlier score (9999) with median
median_score = df[df['Score'] < 100]['Score'].median()
df['Score'] = df['Score'].apply(lambda x: median_score if x > 1000 else x)

# ---------------------------
# 5. Incorrect Data Types
# ---------------------------
print("\n--- Converting Signup Date ---")
df['Signup Date'] = pd.to_datetime(df['Signup Date'], errors='coerce')
print(df['Signup Date'])

# ---------------------------
# 6. Summary of Cleaned Data
# ---------------------------
print("\n--- Cleaned DataFrame ---")
print(df)

# Save cleaned data
df.to_csv("cleaned_data.csv", index=False)


Original DataFrame:
   ID     Name   Age         City Signup Date  Score
0   1    Alice  25.0     New York  2021-01-01     85
1   2      Bob  30.0     new york  01/02/2021     90
2   2      Bob  30.0     new york  01/02/2021     90
3   4  Charlie  22.0  Los Angeles  2021-03-04     88
4   5    David   NaN      Chicago  03-15-2021   9999
5   6      Eve  29.0         None  2021/04/01     87
6   7     None  28.0       Boston                 89
7   8    Grace  27.0       Boston  2021-05-01     88

--- Missing Data ---
ID             0
Name           1
Age            1
City           1
Signup Date    0
Score          0
dtype: int64

--- Duplicates ---
0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

--- Before Standardizing City ---
City
Boston         2
New York       1
new york       1
Los Angeles    1
Chicago        1
Unknown        1
Name: count, dtype: int64

--- After Standardizing City ---
City
new york       2
boston         2
los a

In [6]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
    
    