In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the survey responses from 'survey_results_public.csv' (resolved relative to the
# notebook's working directory) into the DataFrame 'df'.
# pandas itself is imported as 'pd' in the first cell of the notebook.

df = pd.read_csv(filepath_or_buffer='survey_results_public.csv')


In [3]:
# Preview a handful of respondents on the columns of interest.
# '.loc' slices by label and includes both endpoints, so ':4' selects the rows labelled 0 through 4.
# The selection is kept in 'desired_data' and displayed as the cell's output.

preview_columns = ['MainBranch', 'Age', 'Employment', 'Country']
desired_data = df.loc[:4, preview_columns]
desired_data


Unnamed: 0,MainBranch,Age,Employment,Country
0,None of these,18-24 years old,,
1,I am a developer by profession,25-34 years old,"Employed, full-time",United States of America
2,I am a developer by profession,45-54 years old,"Employed, full-time",United States of America
3,I am a developer by profession,25-34 years old,"Employed, full-time",United States of America
4,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Philippines


In [4]:
# Use the 'ResponseId' column as the row index of the DataFrame 'df'.
# The result is re-bound to 'df' instead of being applied with inplace=True, and the
# guard keeps the cell safe to re-run: once 'ResponseId' has been moved into the index
# it is no longer a column, so calling set_index a second time would raise a KeyError.

if 'ResponseId' in df.columns:
    df = df.set_index('ResponseId')


In [5]:
# Boolean masks, one per condition, each aligned with the rows of 'df'.
# Every mask is an exact string comparison against the whole cell value:
#   'age_filter'         -> 'Age' is exactly '45-54 years old'
#   'main_branch_filter' -> 'MainBranch' is exactly 'I am a developer by profession'
#   'employment_filter'  -> 'Employment' is exactly 'Employed, full-time'

age_filter = df['Age'].eq('45-54 years old')
main_branch_filter = df['MainBranch'].eq('I am a developer by profession')
employment_filter = df['Employment'].eq('Employed, full-time')

# Combine the masks with element-wise AND and keep only the rows where all three hold.

all_conditions = age_filter & employment_filter & main_branch_filter
filtered_data = df.loc[all_conditions]


In [6]:
# Order the respondents by 'Age' (ascending) and, within equal ages, by 'Country' (descending).
# 'sort_order' maps each sort column to its ascending flag; dict insertion order gives the key priority.
# At this point 'Age' still holds the bracket labels as strings, so the ordering is
# lexicographic: labels starting with a digit come before 'Under 18 years old'.

sort_order = {'Age': True, 'Country': False}
sorted_data = df.sort_values(by=list(sort_order), ascending=list(sort_order.values()))

sorted_data


Unnamed: 0_level_0,Q120,MainBranch,Age,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,...,Frequency_1,Frequency_2,Frequency_3,TimeSearching,TimeAnswering,ProfessionalTech,Industry,SurveyLength,SurveyEase,ConvertedCompYearly
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2710,I agree,I am learning to code,18-24 years old,"Not employed, but looking for work;Employed, p...","Hybrid (some remote, some in-person)",Bootstrapping a business;Professional developm...,"Associate degree (A.A., A.S., etc.)",Books / Physical media;Other online resources ...,Blogs with tips and tricks;Video-based Online ...,,...,,,,,,,,Appropriate in length,Easy,
3150,I agree,I am learning to code,18-24 years old,"Student, part-time;Not employed, and not looki...",,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Formal documentation provided by the owner of ...,,...,,,,,,,,Appropriate in length,Neither easy nor difficult,
6273,I agree,I am a developer by profession,18-24 years old,"Not employed, but looking for work",,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Friend or family member...,,Codecademy;Udemy,...,,,,,,,,Appropriate in length,Easy,
13925,I agree,I am learning to code,18-24 years old,"Student, full-time",,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Formal documentation provided by the owner of ...,,...,,,,,,,,Appropriate in length,Easy,
14289,I agree,I am a developer by profession,18-24 years old,"Independent contractor, freelancer, or self-em...",Remote,Hobby;Contribute to open-source projects;Schoo...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;Other online ...,Formal documentation provided by the owner of ...,,...,,,,,,,,Appropriate in length,Neither easy nor difficult,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87981,I agree,None of these,Under 18 years old,,,,,,,,...,,,,,,,,,,
88133,I agree,None of these,Under 18 years old,,,,,,,,...,,,,,,,,,,
88239,I agree,None of these,Under 18 years old,,,,,,,,...,,,,,,,,,,
88610,I agree,None of these,Under 18 years old,,,,,,,,...,,,,,,,,,,


In [7]:
# Define a function 'extract_age' that takes an age-bracket string and extracts the first run of digits in it using a regular expression
# (e.g. '25-34 years old' -> 25, i.e. the lower bound of the bracket). If a number is found, return it as an integer. Otherwise, return None.
from typing import Optional
import re

def extract_age(age_string) -> Optional[int]:
    """Return the first whole number found in an age-bracket label.

    Args:
        age_string: Usually a label such as '25-34 years old'. Missing values
            (None, NaN) and values that are already numeric are accepted too,
            so the function can be applied to a column containing gaps or to a
            column that has already been converted.

    Returns:
        The first run of digits in the label as an int (for a range such as
        '25-34 years old' this is the lower bound, 25); the value itself
        truncated to int when it is already a number; None when the input is
        missing, is not a string or number, or contains no digits.
    """
    # Already-numeric input (bool excluded): pass it through so re-applying the
    # function to a converted column does not turn every age into None.
    if isinstance(age_string, (int, float)) and not isinstance(age_string, bool):
        if age_string != age_string:  # NaN is the only value that differs from itself
            return None
        return int(age_string)

    # re.search raises TypeError on non-string input (e.g. None), so bail out early.
    if not isinstance(age_string, str):
        return None

    age_match = re.search(r'\d+', age_string)
    if age_match is None:
        return None
    return int(age_match.group())

# Replace every label in the 'Age' column of 'df' with the number that 'extract_age' returns for it.
df['Age'] = df['Age'].apply(extract_age)

# Discard the respondents for whom no number could be extracted.
df = df.dropna(subset=['Age'])

# Summary statistics of the now-numeric 'Age' column.
age_values = df['Age']
mean_age = age_values.mean()
std_dev_age = age_values.std()
median_age = age_values.median()
min_age = age_values.min()
max_age = age_values.max()

# Print the statistics one per line, in the order: mean, standard deviation, median, minimum, maximum.
for age_statistic in (mean_age, std_dev_age, median_age, min_age, max_age):
    print(age_statistic)


29.12674818279146
10.55353168078226
25.0
18.0
65.0
