In [None]:
# Importing the needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the survey responses from the CSV file into a DataFrame
csv_path = 'kaggle_survey_2021_responses.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# (rows, columns) of the loaded data
df.shape

# Data Cleaning (Preprocessing)

In [None]:
# Split off the first row, which holds the question text for every column,
# so that `df` contains survey responses only.
# The label check keeps this cell safe to re-run: once row 0 has been removed,
# a second execution would otherwise store the first respondent's answers in
# `questions` and then raise a KeyError when dropping the missing label.
if 0 in df.index:
    questions = df.loc[0]
    df = df.drop(index = 0)

In [None]:
# Preview the first rows again, now that the question row has been removed
df.head()

In [None]:
# Column dtypes and non-null counts before the numeric conversion
df.info()

In [None]:
# Convert the datatype of the numerical columns from Object to Numeric (Integer or Float ...)
# `str.isnumeric()` only accepts strings made up of digit characters, so decimals ("3.5")
# and negative numbers ("-1") were never converted, and the `.str` accessor raises on
# columns that do not hold strings. Instead, attempt the conversion and keep it only when
# no non-missing value was lost to coercion.
# Columns with no values at all are still converted (to float NaN), as before.
for column in df.columns:
    converted = pd.to_numeric(df[column], errors = "coerce")
    if converted.notna().sum() == df[column].notna().sum():
        df[column] = converted

In [None]:
# Column dtypes and non-null counts after the numeric conversion
df.info()

In [None]:
# Number of respondents per country (Q3), across all countries in the data
df['Q3'].value_counts()

In [None]:
# Distinct country names exactly as they are spelled in Q3
df['Q3'].unique()

In [None]:
# Countries whose responses are kept for the analysis
Countries = [
    'Australia',
    'Japan',
    'Singapore',
    'China',
    'United States of America',
    'Italy',
    'South Africa',
    'Spain',
    'United Kingdom of Great Britain and Northern Ireland',
    'France',
    'Switzerland',
    'Canada',
    'Hong Kong (S.A.R.)',
    'Germany',
    'South Korea',
]

In [None]:
# Number of chosen countries
len(Countries)

In [None]:
# Keep only the respondents whose country (Q3) is one of the chosen countries
in_chosen_countries = df['Q3'].isin(Countries)
df = df[in_chosen_countries]

In [None]:
# (rows, columns) remaining after the country filter
df.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Number of missing values in each column
df.isnull().sum()

# Data Analysis

In [None]:
# Count the respondents in each age range (Q1), ordered by the age-range label
age_counts = df['Q1'].value_counts()
Age = age_counts.sort_index()
Age

In [None]:
# Bar chart of the number of respondents in each age range
fig, ax = plt.subplots(figsize = (10,6))
ax.bar(Age.index, Age.values)
ax.set_title('Age Distribution', fontsize = 25)
ax.set_xlabel('Age Range', fontsize = 20)
ax.set_ylabel('Frequency', fontsize = 20)
ax.tick_params(axis = 'x', labelrotation = 90, labelsize = 16)
ax.tick_params(axis = 'y', labelsize = 16)
plt.show()

* It's clear that learners in the 25-29 age range participated more than those in any other age range.
* Starting from the age of 30, the number of participants gradually decreases, perhaps because they have less time.
* From the age of 18 until 29, the number of participants increases, which is a good sign for the awareness of the next generation.

In [None]:
def func(pct, allvalues):
    """Format a pie-chart wedge label as a percentage plus the absolute count.

    Used in this notebook inside the ``autopct`` callbacks of the pie charts.

    Parameters
    ----------
    pct : float
        Share of the wedge, in percent.
    allvalues : array-like of numbers
        All values plotted in the pie; only their sum is used.

    Returns
    -------
    str
        The percentage with one decimal and a percent sign, a line break, then
        the absolute count in parentheses.
    """
    # Round rather than truncate: pct is a floating-point percentage, so converting it
    # back to a count can land just below the true integer (e.g. 1394.99997), and a
    # plain int() would then report one less than the real count.
    absolute = int(round(pct / 100. * np.sum(allvalues)))
    return "{:.1f}%\n({:d})".format(pct, absolute)

In [None]:
# Count the respondents per gender answer (Q2), from most to least frequent
gender_counts = df['Q2'].value_counts()
Gender = gender_counts.sort_values(ascending=False)
Gender

In [None]:
# Plotting the Distribution of the Genders using a Bar Chart and a Pie Chart side by side
fig, axes = plt.subplots(1, 2, figsize=(16,8))
# The previous title described a different dataset (departure/arrival satisfaction).
fig.suptitle('Gender Distribution', size = 25)

gender_counts = df['Q2'].value_counts()

axes[0].bar(gender_counts.index, gender_counts.values)
axes[0].set_xlabel('Gender', size = 22)
axes[0].set_ylabel('Frequency', size = 22)
axes[0].tick_params(labelrotation = 60, labelsize = 16)
for side in ('bottom', 'left', 'top', 'right'):
    axes[0].spines[side].set_color('black')

# Group every answer other than 'Man' / 'Woman' into one 'Other' wedge.
# The counts are derived from the data instead of being typed in by hand, so the
# pie stays correct when the dataset or the country selection changes.
Gender = pd.Series({
    'Man': int(gender_counts.get('Man', 0)),
    'Woman': int(gender_counts.get('Woman', 0)),
    'Other': int(gender_counts.drop(index = ['Man', 'Woman'], errors = 'ignore').sum()),
})
axes[1].pie(Gender.values, labels = Gender.index, explode=[0.05, 0.05, 0.1], shadow = True,
       autopct = lambda pct: func(pct, Gender.values),
       textprops = {'family':'sans-serif', 'fontsize':'medium', 'fontstyle':'italic', 'fontweight':'heavy'});
axes[1].set_title('Gender', fontsize = 22);


In [None]:
# Count the respondents per chosen country (Q3), most frequent first
country_column = df['Q3']
Country = country_column.value_counts()
Country

In [None]:
# Bar chart of the number of respondents per country, one shade of blue per bar
country_palette = [
    '#003653', '#07415f', '#104c6b', '#175776', '#1f6282',
    '#266e8f', '#2d7a9b', '#3586a7', '#3c93b3', '#449fbf',
    '#4caccb', '#54b9d7', '#5cc6e3', '#65d4ee', '#6ee1fa',
]
fig, ax = plt.subplots(figsize = (12,12))
ax.bar(Country.index, Country.values, color = country_palette)
ax.set_title('Country Distribution', fontsize = 36)
ax.set_xlabel('Country', fontsize = 20)
ax.set_ylabel('Frequency', fontsize = 20)
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

In [None]:
# Plotting the Distribution of the countries using Pie Chart
# The absolute counts in the wedge labels are derived from the plotted Country values;
# they were previously computed from the unrelated Gender series.
plt.figure(figsize = (20,20))
plt.pie(Country, labels = Country.index, autopct = lambda pct: func(pct, Country.values),
        colors = ['#003653', '#07415f', '#104c6b', '#175776', '#1f6282', '#266e8f', '#2d7a9b', '#3586a7', '#3c93b3', '#449fbf',
                  '#4caccb', '#54b9d7', '#5cc6e3', '#65d4ee', '#6ee1fa'],
        # Wedges are ordered by size, so the growing offsets pull the smallest ones outwards
        explode=[0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
       textprops = {'family':'sans-serif', 'fontsize':'xx-large', 'fontstyle':'italic', 'fontweight':'heavy'})
plt.show()

* The USA has the largest share of the participants, with almost a third of the total
* Participants from the USA outnumber those from the 10 least-represented countries combined

In [None]:
# Count the respondents per education level (Q4), from most to least frequent
education_counts = df['Q4'].value_counts()
Education = education_counts.sort_values(ascending=False)
Education

In [None]:
# Bar chart of the number of respondents per education level
fig, ax = plt.subplots(figsize = (8,8))
ax.bar(Education.index, Education.values)
ax.set_title('Education Level Distribution', fontsize = 24)
ax.set_xlabel('Education Level', fontsize = 16)
ax.set_ylabel('Frequency', fontsize = 16)
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

In [None]:
# Plotting the Distribution of the Education Levels using Pie Chart after Ignoring small values
# NOTE: the percentages shown are shares of the plotted top 4 levels only, not of all respondents.
# .iloc makes the "first four rows" selection explicitly positional on this label-indexed Series.
top_education = Education.iloc[:4]
plt.figure(figsize=(8,8))
plt.pie(top_education, labels = top_education.index, autopct = "%1.1f%%")
# The legend lists exactly the plotted wedges; the full index has more labels than wedges.
plt.legend(top_education.index)
plt.show()

* Most of the participants hold a Master's degree; they are about half of the total participants
* More than a quarter of the participants hold a Bachelor's degree
* Learning is for everyone: 8% of the participants have not earned a Bachelor's degree

In [None]:
# Count the respondents per job title (Q5), from most to least frequent
job_title_counts = df['Q5'].value_counts()
Job_Title = job_title_counts.sort_values(ascending = False)
Job_Title

In [None]:
# Bar chart of the number of respondents per job title
fig, ax = plt.subplots(figsize=(8,8))
ax.bar(Job_Title.index, Job_Title.values)
ax.set_title('Job Title Distribution')
ax.set_xlabel('Job Title')
ax.set_ylabel('Frequency')
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

* Students participated in the survey more than any other job title
* Data Scientists were in second place

In [None]:
# Count the respondents per range of coding experience (Q6), from most to least frequent
experience_counts = df['Q6'].value_counts()
Experience = experience_counts.sort_values(ascending = False)
Experience

In [None]:
# Bar chart of the number of respondents per range of coding experience
fig, ax = plt.subplots(figsize = (8,8))
ax.bar(Experience.index, Experience.values)
ax.set_title('Experience Years Distribution', fontsize = 24)
ax.set_xlabel('Experience Years', fontsize = 16)
ax.set_ylabel('Frequency', fontsize = 16)
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

* Learners with one to three years of coding experience participated more than others
* People who have never written code participated less than others

In [None]:
# Collect the columns whose name starts with 'Q7' (Question No. 7)
is_q7_column = df.columns.str.startswith('Q7')
PL = df.columns[is_q7_column]
PL

In [None]:
# Display only the columns that belong to Question No. 7
df.loc[:, PL]

In [None]:
# Analyzing the Data to reach the Distribution of the used Programming Languages
# For every Question 7 column, record its most frequent value and how often it occurs.
# (Presumably each column holds a single answer option, so that value is the option's
# label - verify against the survey schema.)
PL_dict = {}
for column in PL:
    value_counts = df[column].value_counts()
    if value_counts.empty:  # Column has no answers at all: nothing to record
        continue
    # .iloc[0] is explicitly positional. Plain [0] on a label-indexed Series relies on a
    # deprecated positional fallback and is a label lookup when the index holds integers.
    PL_dict[value_counts.index[0]] = value_counts.iloc[0]
PL_dict

In [None]:
# Turn the language -> count mapping into a Series ordered from most to least used
PL_series = pd.Series(PL_dict)
PL_series = PL_series.sort_values(ascending = False)
PL_series

In [None]:
# Bar chart of how often each programming language was selected
fig, ax = plt.subplots(figsize = (8,8))
ax.bar(PL_series.index, PL_series.values)
ax.set_title('Used Programming Languages')
ax.set_xlabel('Programming Languages')
ax.set_ylabel('Frequency')
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

* Python is the most used programming language according to the survey's analysis
* SQL is in second place
* R is in third place
* Most of the remaining languages are approximately equal in number of users
* Julia and Swift have few users so far

In [None]:
# Pie chart of the share of each programming language among all selections
fig, ax = plt.subplots(figsize = (14,14))
ax.pie(PL_series, labels = PL_series.index, autopct = "%1.1f%%")
ax.legend(PL_series.index)
plt.show()

In [None]:
# Custom palette: 15 shades of blue, from darkest to lightest
cust_color = [
    '#003653', '#07415f', '#104c6b', '#175776', '#1f6282',
    '#266e8f', '#2d7a9b', '#3586a7', '#3c93b3', '#449fbf',
    '#4caccb', '#54b9d7', '#5cc6e3', '#65d4ee', '#6ee1fa',
]

In [None]:
# Bar chart and pie chart of the used programming languages, side by side
fig, axes = plt.subplots(1, 2, figsize=(20,8))

# Left panel: bar chart on a light grey background with a dashed grid
axes[0].bar(PL_series.index, PL_series.values, color = cust_color)
axes[0].set_xlabel('Programming Language', size=20)
axes[0].set_ylabel('Frequency', size=20)
axes[0].tick_params(labelrotation = 90, labelsize=15)
axes[0].patch.set_facecolor('#F2F2F2')
axes[0].grid(color=cust_color[3], alpha=0.5, linestyle='--')
# Keep only the bottom and left borders of the plot area
axes[0].spines['bottom'].set_color('black')
axes[0].spines['left'].set_color('black')
axes[0].spines['top'].set_visible(False)
axes[0].spines['right'].set_visible(False)

# Right panel: pie chart with every wedge slightly separated from the centre
axes[1].pie(PL_series, labels = PL_series.index, autopct = "%1.1f%%", colors = cust_color, explode=[.04] * len(PL_series));

plt.suptitle('Programming Language Distribution', size=36)
plt.show()

In [None]:
# Count the answers for the language recommended to learn first (Q8), most frequent first
recommended_counts = df['Q8'].value_counts()
Rec_PL = recommended_counts.sort_values(ascending = False)
Rec_PL

In [None]:
# Pie chart of the share of each language recommended to learn first
fig, ax = plt.subplots(figsize = (14,14))
ax.pie(Rec_PL, labels = Rec_PL.index, autopct = "%1.1f%%")
ax.legend(Rec_PL.index)
plt.show()

* Most participants recommended Python as the first programming language to learn
* Some participants recommended R or SQL too

In [None]:
# Plotting the Distribution of the Recommended Programming Language to learn first using Pie Chart after Ignoring small values
# NOTE: the percentages shown are shares of the plotted top 5 languages only, not of all respondents.
# .iloc makes the "first five rows" selection explicitly positional on this label-indexed Series.
top_rec_pl = Rec_PL.iloc[:5]
plt.figure(figsize = (14,14))
plt.pie(top_rec_pl, labels = top_rec_pl.index, autopct="%1.1f%%")
# The legend lists exactly the plotted wedges; the full index has more labels than wedges.
plt.legend(top_rec_pl.index)
plt.show()

In [None]:
# Collect the columns whose name starts with 'Q9' (Question No. 9)
is_q9_column = df.columns.str.startswith('Q9')
IDE = df.columns[is_q9_column]
IDE

In [None]:
# Display only the columns that belong to Question No. 9
df.loc[:, IDE]

In [None]:
# Analyzing the Data to reach the Distribution of the used IDE's
# For every Question 9 column, record its most frequent value and how often it occurs.
# (Presumably each column holds a single answer option, so that value is the option's
# label - verify against the survey schema.)
IDE_dict = {}
for column in IDE:
    value_counts = df[column].value_counts()
    if value_counts.empty:  # Column has no answers at all: nothing to record
        continue
    # .iloc[0] is explicitly positional. Plain [0] on a label-indexed Series relies on a
    # deprecated positional fallback and is a label lookup when the index holds integers.
    IDE_dict[value_counts.index[0]] = value_counts.iloc[0]
IDE_dict

In [None]:
# Turn the IDE -> count mapping into a Series ordered from most to least used
IDE_series = pd.Series(IDE_dict)
IDE_series = IDE_series.sort_values(ascending = False)
IDE_series

In [None]:
# Pie chart of the share of each IDE among all selections
fig, ax = plt.subplots(figsize = (10,10))
ax.pie(IDE_series, labels = IDE_series.index, autopct = "%1.1f%%")
ax.legend(IDE_series.index)
plt.show()

* Jupyter Notebook is the most used IDE, chosen by about a quarter of the total participants
* Visual Studio Code is in second place with almost 15% of the total participants

In [None]:
# Plotting the Distribution of the Top 5 used IDE's using Pie Chart
# NOTE: the percentages shown are shares of the plotted top 5 IDEs only, not of all selections.
# .iloc makes the "first five rows" selection explicitly positional on this label-indexed Series.
top_ide = IDE_series.iloc[:5]
plt.figure(figsize = (10,10))
plt.pie(top_ide, labels = top_ide.index, autopct = "%1.1f%%")
# The legend lists exactly the plotted wedges; the full index has more labels than wedges.
plt.legend(top_ide.index)
plt.show()

In [None]:
# Count the respondents per computing platform (Q11), from most to least frequent
platform_counts = df['Q11'].value_counts()
Platform = platform_counts.sort_values(ascending = False)
Platform

In [None]:
# Bar chart of the number of respondents per computing platform
fig, ax = plt.subplots(figsize = (5,5))
ax.bar(Platform.index, Platform.values)
ax.set_title('Computing Platform Distribution')
ax.set_xlabel('Computing Platform')
ax.set_ylabel('Frequency')
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

In [None]:
# Plotting the Distribution of the Top 4 used Computing Platform using Pie Chart
# NOTE: the percentages shown are shares of the plotted top 4 platforms only, not of all respondents.
# .iloc makes the "first four rows" selection explicitly positional on this label-indexed Series.
top_platform = Platform.iloc[:4]
plt.figure(figsize = (10,10))
plt.pie(top_platform, labels = top_platform.index, autopct = "%1.1f%%")
# The legend lists exactly the plotted wedges; the full index has more labels than wedges.
plt.legend(top_platform.index)
plt.show()

* More than half of the total participants use a laptop
* More than a quarter of the total participants use a PC

In [None]:
# Collect the columns whose name starts with 'Q14' (Question No. 14)
is_q14_column = df.columns.str.startswith('Q14')
VL = df.columns[is_q14_column]
VL

In [None]:
# Display only the columns that belong to Question No. 14
df.loc[:, VL]

In [None]:
# Analyzing the Data to reach the Distribution of the used Visualization Libraries
# For every Question 14 column, record its most frequent value and how often it occurs.
# (Presumably each column holds a single answer option, so that value is the option's
# label - verify against the survey schema.)
VL_dict = {}
for column in VL:
    value_counts = df[column].value_counts()
    if value_counts.empty:  # Column has no answers at all: nothing to record
        continue
    # .iloc[0] is explicitly positional. Plain [0] on a label-indexed Series relies on a
    # deprecated positional fallback and is a label lookup when the index holds integers.
    VL_dict[value_counts.index[0]] = value_counts.iloc[0]
VL_dict

In [None]:
# Turn the library -> count mapping into a Series ordered from most to least used
VL_series = pd.Series(VL_dict)
VL_series = VL_series.sort_values(ascending = False)
VL_series

In [None]:
# Bar chart of how often each visualization library was selected
fig, ax = plt.subplots(figsize = (8,8))
ax.bar(VL_series.index, VL_series.values)
ax.set_title('Visualization Libraries')
ax.set_xlabel('The Library')
ax.set_ylabel('Frequency')
ax.tick_params(axis = 'x', labelrotation = 90)
plt.show()

In [None]:
# Pie chart of the share of each visualization library among all selections
fig, ax = plt.subplots(figsize=(8,8))
ax.pie(VL_series, labels = VL_series.index, autopct = "%1.1f%%")
ax.legend(VL_series.index)
plt.show()

* About a third of the participants use Matplotlib, making it the most used visualization library
* Seaborn is the second most used one with 22.5% of the total participants

In [None]:
# Plotting the Distribution of the Top 5 used Visualization Libraries using Pie Chart
# NOTE: the percentages shown are shares of the plotted top 5 libraries only, not of all selections.
# .iloc makes the "first five rows" selection explicitly positional on this label-indexed Series.
top_vl = VL_series.iloc[:5]
plt.figure(figsize=(8,8))
plt.pie(top_vl, labels = top_vl.index, autopct = "%1.1f%%")
# The legend lists exactly the plotted wedges; the full index has more labels than wedges.
plt.legend(top_vl.index)
plt.show()