# Removing Duplicates

### Install the Required Libraries

In [None]:
!pip install pandas

### Step 1: Import Required Libraries

In [None]:
import pandas as pd

### Step 2: Load the Dataset into a DataFrame

#### Load the dataset using pd.read_csv()

In [None]:
# URL of the survey CSV to load
file_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/n01PQ9pSmiRX6520flujwQ/survey-data.csv"

# Read the CSV straight from the URL into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top rows as a quick sanity check on the load
print(df.head(n=5))


### Step 3: Identifying Duplicate Rows

##### Task 1: Identify Duplicate Rows

##### 1. Count the number of duplicate rows in the dataset.
##### 2. Display the first few duplicate rows to understand their structure.


In [None]:
# Build one boolean mask marking rows that repeat an earlier row
# (with pandas' default keep='first', the first occurrence is not marked)
is_duplicate = df.duplicated()

# Count the marked rows
duplicate_count = is_duplicate.sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Show a sample of the marked rows
duplicates = df[is_duplicate]
print("First few duplicate rows:")
print(duplicates.head())

## Step 4: Removing Duplicate Rows

#### Task 2: Remove Duplicates
#### Remove duplicate rows from the dataset using the drop_duplicates() function.
#### Verify the removal by counting the number of duplicate rows after removal.

In [None]:
# Drop repeated rows, keeping the first occurrence of each (pandas default).
# The explicit .copy() makes df_cleaned an independent DataFrame, so the
# column assignments made on it later in this notebook do not emit
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df_cleaned = df.drop_duplicates().copy()
print("Duplicate rows removed.")

# Re-count duplicates on the cleaned frame; this should print 0
remaining_duplicates = df_cleaned.duplicated().sum()
print(f"Number of duplicate rows after removal: {remaining_duplicates}")


### Step 5: Handling Missing Values

#### Task 3: Identify and Handle Missing Values

#### Identify missing values for all columns in the dataset.
#### Choose a column with significant missing values (e.g., EdLevel) and impute with the most frequent value.

In [None]:
# Count missing values per column and show only the columns that have any
missing_counts = df_cleaned.isnull().sum()
print("Missing values per column:\n")
print(missing_counts[missing_counts > 0])
column_to_impute = 'EdLevel'  # Change if the column name is different in your dataset

# mode() skips NaN by default and returns an empty Series when the column has
# no non-missing values, so guard before taking the first entry.
modes = df_cleaned[column_to_impute].mode()
if modes.empty:
    most_frequent = None
    print(f"\nNo non-missing values in '{column_to_impute}'; skipping imputation.")
else:
    # If several values tie, the first one returned by mode() is used
    most_frequent = modes.iloc[0]
    print(f"\nMost frequent value in '{column_to_impute}': {most_frequent}")

    # Fill missing values in that column with the most frequent value
    df_cleaned[column_to_impute] = df_cleaned[column_to_impute].fillna(most_frequent)

# Verify if missing values are handled
missing_after_impute = df_cleaned[column_to_impute].isnull().sum()
print(f"\nMissing values in '{column_to_impute}' after imputation: {missing_after_impute}")

## Step 6: Normalizing Compensation Data
#### Task 4: Normalize Compensation Data Using ConvertedCompYearly

#### Use the ConvertedCompYearly column for compensation analysis as the normalized annual compensation is already provided.
#### Check for missing values in ConvertedCompYearly and handle them if necessary.


In [None]:
# Count the gaps in the normalized annual compensation column
missing_comp = df_cleaned['ConvertedCompYearly'].isnull().sum()
print(f"Missing values in 'ConvertedCompYearly': {missing_comp}")

if missing_comp == 0:
    print("No missing values in 'ConvertedCompYearly'.")
else:
    # Impute with the median rather than the mean (more robust choice)
    median_comp = df_cleaned['ConvertedCompYearly'].median()
    df_cleaned['ConvertedCompYearly'] = df_cleaned['ConvertedCompYearly'].fillna(median_comp)
    print(f"Missing values filled with median: {median_comp}")

    # Re-count to confirm the column no longer reports missing values
    remaining_missing = df_cleaned['ConvertedCompYearly'].isnull().sum()
    print("Missing values after imputation:", remaining_missing)

## Step 7: Summary and Next Step
#### You handled missing values by imputing the most frequent value in a chosen column.

#### You used ConvertedCompYearly for compensation normalization and handled missing values.

#### For further analysis, consider exploring other columns or visualizing the cleaned dataset.

In [None]:
# 🔍 Identified and Removed Duplicate Rows
#
# Used duplicated() and drop_duplicates() to ensure only unique records remain.
#
# ⚠️ Handled Missing Values
#
# Displayed missing values in each column.
#
# Imputed missing values in the EdLevel column with the most frequent value (mode).
#
# 💰 Used and Cleaned Compensation Data
#
# Focused on ConvertedCompYearly (normalized annual compensation).
#
# Filled missing compensation values with the median for robustness.