 # Exploratory Data Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/n01PQ9pSmiRX6520flujwQ/survey-data.csv'
df = pd.read_csv(data_url)

In [3]:
# Set pandas option to display all columns
pd.set_option('display.max_rows', None)

In [4]:
# Display first few row of dataframe
df.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


# Step 3: Handling Missing Data
### Identify and manage missing values in critical columns such as Employment, JobSat, and RemoteWork. Implement a strategy to fill or drop these values, depending on the significance of the missing data.

In [6]:
# Handle missing value
print(df.shape)

(65437, 114)


In [18]:
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts>0].sort_values(ascending=False))

AINextMuch less integrated        64289
AINextLess integrated             63082
AINextNo change                   52939
AINextMuch more integrated        51999
EmbeddedAdmired                   48704
EmbeddedWantToWorkWith            47837
EmbeddedHaveWorkedWith            43223
ConvertedCompYearly               42002
AIToolNot interested in Using     41023
AINextMore integrated             41009
Knowledge_9                       37802
Frequency_3                       37727
Knowledge_8                       37679
ProfessionalTech                  37673
Knowledge_7                       37659
Knowledge_6                       37573
Knowledge_5                       37557
Knowledge_2                       37416
Knowledge_4                       37407
Knowledge_3                       37342
Frustration                       37186
Frequency_2                       37073
Frequency_1                       37068
ProfessionalCloud                 36946
Knowledge_1                       36773


In [19]:
# Calculate the threshold for 50% missing data
threshold = df.shape[0] * 0.20  # No need for int casting, it's not necessary
print(threshold)

# Drop columns with more than 50% missing values
df = df.dropna(thresh=threshold, axis=1)

32718.5


In [20]:
df.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,AIBen,AIAcc,AIComplex,AIToolCurrently Using,AIThreat,AIEthics,AIChallenges,TBranch,SurveyLength,SurveyEase
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,Increase productivity,,,,,,,No,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,,Yes,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,,No,Appropriate in length,Easy
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,Increase productivity;Greater efficiency;Impro...,Somewhat trust,Bad at handling complex tasks,Learning about a codebase;Project planning;Wri...,No,Circulating misinformation or disinformation;M...,Don’t trust the output or answers,,Too long,Easy
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,,,Too short,Easy


In [21]:
print(df.shape)

(65437, 70)


In [22]:
# Again check the missing value
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts>0].sort_values(ascending=False))

MiscTechWantToWorkWith            32473
CompTotal                         31697
PlatformWantToWorkWith            30905
WebframeAdmired                   30494
AIToolCurrently Using             30365
AISearchDevAdmired                29894
AISearchDevWantToWorkWith         28736
AIBen                             28543
AIComplex                         28416
OfficeStackAsyncAdmired           28233
AIAcc                             28135
AIChallenges                      27906
WebframeWantToWorkWith            26902
DatabaseAdmired                   26880
OfficeStackAsyncWantToWorkWith    26471
MiscTechHaveWorkedWith            25994
TechDoc                           24540
AIEthics                          23889
PlatformHaveWorkedWith            23071
DatabaseWantToWorkWith            22879
BuildvsBuy                        22079
TechEndorse                       21769
ToolsTechAdmired                  21440
AISearchDevHaveWorkedWith         20984
TBranch                           20960


In [34]:
# Fill Categorical data with mode
for col in df.select_dtypes(include='object'):
    print(col.value_counts())

AttributeError: 'str' object has no attribute 'value_counts'

In [1]:
df.tail(10)

NameError: name 'df' is not defined