# Removing Duplicates

In [3]:
import pandas as pd

In [72]:
# Remote CSV containing the survey responses; the URL is split across two
# adjacent literals purely for readability (they concatenate to one string).
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "n01PQ9pSmiRX6520flujwQ/survey-data.csv"
)
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows as the cell's displayed result.
df.head(n=5)

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


In [54]:
# Rows that repeat an earlier row across every column (the first occurrence
# of each row is not flagged).
duplicates = df.loc[df.duplicated(keep='first')]
# Number of such repeated rows, shown as the cell's result.
duplicates.shape[0]

0

In [55]:
# Rebuild the frame without repeated rows only when at least one was found.
if duplicates.shape[0] > 0:
    df = df.drop_duplicates(keep='first')

## Identify and Handle Missing Values

In [56]:
# Count the missing entries in each column.
missing_values = df.isna().sum(axis=0)
print(missing_values)

ResponseId                 0
MainBranch                 0
Age                        0
Employment                 0
RemoteWork             10631
                       ...  
JobSatPoints_11        35992
SurveyLength            9255
SurveyEase              9199
ConvertedCompYearly    42002
JobSat                 36311
Length: 114, dtype: int64


In [57]:
# Narrow the per-column counts to columns that have at least one missing
# entry (the name missing_values is rebound to the filtered result).
missing_values = missing_values.loc[missing_values.gt(0)]
print(missing_values)

RemoteWork             10631
CodingActivities       10971
EdLevel                 4653
LearnCode               4949
LearnCodeOnline        16200
                       ...  
JobSatPoints_11        35992
SurveyLength            9255
SurveyEase              9199
ConvertedCompYearly    42002
JobSat                 36311
Length: 109, dtype: int64


In [58]:
# Order the missing-value counts from largest to smallest and keep the
# first ten entries.
missing_sorted_desc = missing_values.sort_values(ascending=False)
top_10_missing = missing_sorted_desc.iloc[:10]

print("Top 10 columns with missing values:")
print(top_10_missing)

Top 10 columns with missing values:
AINextMuch less integrated       64289
AINextLess integrated            63082
AINextNo change                  52939
AINextMuch more integrated       51999
EmbeddedAdmired                  48704
EmbeddedWantToWorkWith           47837
EmbeddedHaveWorkedWith           43223
ConvertedCompYearly              42002
AIToolNot interested in Using    41023
AINextMore integrated            41009
dtype: int64


In [59]:
# Calculate the percentage of missing values
missing_percentage = (df.isnull().sum() / len(df)) * 100
top_10_missing_percentage = missing_percentage.sort_values(ascending=False).head(10)
print(top_10_missing_percentage)

AINextMuch less integrated       98.245641
AINextLess integrated            96.401119
AINextNo change                  80.900714
AINextMuch more integrated       79.464217
EmbeddedAdmired                  74.428840
EmbeddedWantToWorkWith           73.103901
EmbeddedHaveWorkedWith           66.052845
ConvertedCompYearly              64.186928
AIToolNot interested in Using    62.690832
AINextMore integrated            62.669438
dtype: float64


## Choose a column with significant missing values (e.g., EdLevel) and impute with the most frequent value.

In [60]:
# Number of rows in which EdLevel is missing.
missing_ed_values = df['EdLevel'].isna().sum()
print(missing_ed_values)

4653


###### Calculate the mode of EdLevel

In [73]:
# mode() returns a Series of the most frequent value(s); take the entry
# labelled 0 as the single representative value.
eds_freq_values = df['EdLevel'].mode().loc[0]
print(eds_freq_values)

Bachelor’s degree (B.A., B.S., B.Eng., etc.)


In [74]:
# Record the current number of missing EdLevel entries.
missing_ed_values1 = df['EdLevel'].isna().sum()
# Frequency of each EdLevel category (missing entries are not counted),
# displayed as the cell's result.
df['EdLevel'].value_counts(dropna=True)

EdLevel
Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          24942
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       15557
Some college/university study without earning a degree                                 7651
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     5793
Professional degree (JD, MD, Ph.D, Ed.D, etc.)                                         2970
Associate degree (A.A., A.S., etc.)                                                    1793
Primary/elementary school                                                              1146
Something else                                                                          932
Name: count, dtype: int64

In [75]:
# Impute missing EdLevel entries with the most frequent category
# (eds_freq_values, computed from df['EdLevel'].mode()). Without this step the
# column keeps all of its missing values and the check below never changes.
df['EdLevel'] = df['EdLevel'].fillna(eds_freq_values)

# Confirm that no missing EdLevel values remain after imputation.
missing_ed_values2 = df['EdLevel'].isnull().sum()
print(missing_ed_values2)

0


## Step 6: Normalizing Compensation Data

* Task 4: Normalize Compensation Data Using ConvertedCompYearly

* Use the ConvertedCompYearly column for compensation analysis, as the normalized annual compensation is already provided.
* Check for missing values in ConvertedCompYearly and handle them if necessary.

In [76]:
# Number of rows in which ConvertedCompYearly is missing.
missing_comp = df['ConvertedCompYearly'].isna().sum()
print(missing_comp)

42002


In [77]:
# Most frequent compensation value: mode() returns a Series, and the entry
# labelled 0 is used as the fill value.
feq_comp = df['ConvertedCompYearly'].mode().loc[0]
print(feq_comp)

64444.0


In [78]:
# Replace every missing ConvertedCompYearly entry with the most frequent value.
# NOTE(review): this assigns one single value to all missing salaries (42002
# rows in the recorded run); confirm that mode imputation suits the
# downstream compensation analysis.
df = df.assign(ConvertedCompYearly=df['ConvertedCompYearly'].fillna(value=feq_comp))

In [80]:
# Inspect how often each compensation value occurs after the fill; the
# imputed value is expected to dominate the counts.
df['ConvertedCompYearly'].value_counts(dropna=True)

ConvertedCompYearly
64444.0     42323
53703.0       308
75184.0       230
85925.0       226
107406.0      208
            ...  
46145.0         1
154440.0        1
27391.0         1
11562.0         1
116844.0        1
Name: count, Length: 6113, dtype: int64

In [83]:
# Check whether any missing ConvertedCompYearly values remain.
# Despite its name, duplicate_check holds a count of missing entries,
# not a count of duplicates.
duplicate_check = df['ConvertedCompYearly'].isna().sum()
print(duplicate_check)

0
