In [26]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Notebook-wide Plotly setup: use the JupyterLab renderer and apply the
# dark template to every Plotly Express figure created below.
pio.renderers.default = "jupyterlab"
px.defaults.template = "plotly_dark"

In [27]:
# Load the raw responses; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [28]:
# Original headers of df, in positional order:
#   'Timestamp', 'معدلك', 'خبرتك بالأشهر', 'شغلك له علاقة بتخصصك؟',
#   'مسماك الوظيفي', 'نوع جهتك', 'جهتك وهل تنصح فيها  (إختياري) ',
#   'راتبك لإقرب الف (فقط الرقم)', 'تفكر تطلع من جهتك',
#   'لا تنسى دعاء الريح  ❤️'

# Replace them positionally with short English identifiers.
# The assignment is positional, so the order below must match the order above.
df.columns = [
    "timestamp",
    "gpa",
    "experience",
    "related_to_major",
    "job_title",
    "company_type",
    "company_name",
    "salary",
    "thinking_to_leave",
    "duaa",
]

In [29]:
# Canonical job title -> raw spellings that should collapse into it.
# Aliases are written without surrounding whitespace and are matched
# case-insensitively, so variants such as "software engineer " or
# "Software Engineer " no longer need one entry each.
JOB_TITLE_ALIASES = {
    "Software Engineer": [
        "Software Engineer",
        "مهندس برمجيات",
        "Java developer",
        "مطور",
        "Software Developer",
        "مبرمج",
        # NOTE(review): appears to mean "graduate program"; kept under
        # Software Engineer to preserve the existing categorisation.
        "برنامج خريجين",
        # NOTE(review): "Aa" looks like a placeholder answer; kept under
        # Software Engineer to preserve the existing categorisation.
        "Aa",
    ],
    "Data Scientist": ["Data Scientist", "محلل بيانات"],
    "QA": ["QA", "Software Quality Engineer"],
    "Business Analyst": ["Business Analyst", "BA"],
    "Product": ["Product specialist"],
    "Other": ["أخصائي حوكمة ومخاطر والتزام", "مسؤول امن المعلومات"],
}

# Flattened lookup keyed by the case-folded alias.
_CANONICAL_BY_ALIAS = {
    alias.casefold(): canonical
    for canonical, aliases in JOB_TITLE_ALIASES.items()
    for alias in aliases
}


def normalize_job_title(raw_title):
    """Return the canonical job title for a single raw response value.

    Leading/trailing whitespace is removed and the lookup ignores case.
    Strings with no known alias are returned stripped but otherwise
    unchanged; non-string values (e.g. NaN) are returned as-is.
    """
    if not isinstance(raw_title, str):
        return raw_title
    trimmed = raw_title.strip()
    return _CANONICAL_BY_ALIAS.get(trimmed.casefold(), trimmed)


df["job_title"] = df["job_title"].map(normalize_job_title)

In [30]:
# Summary statistics (mean, median, standard deviation) restricted to GPA and salary
summary_stats = ["mean", "median", "std"]
df[["gpa", "salary"]].agg(summary_stats)

Unnamed: 0,gpa,salary
mean,4.408095,12.619048
median,4.42,13.0
std,0.358394,2.459191


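- Generally new grads earn about 13k (median 13k, mean ≈ 12.6k)
- Generally new grads have a GPA of about 4.4
- The spread in GPA is small (std ≈ 0.36, about 8% of the mean), while the spread in salary is noticeably larger (std ≈ 2.5k, roughly 20% of the mean), which suggests GPA is not the only factor in determining salary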

In [31]:
# Correlation between GPA and salary, read from the pairwise correlation matrix
gpa_salary_corr_matrix = df[["gpa", "salary"]].corr()
correlation_gpa_salary = gpa_salary_corr_matrix.loc["gpa", "salary"]
print(f"Correlation between gpa and salary: {correlation_gpa_salary}")

Correlation between gpa and salary: 0.3644804300956211


This supports the point mentioned earlier. There is a positive correlation (≈ 0.36) between GPA and salary, which means that higher GPAs tend to go with higher salaries
However, the correlation is not that strong, which means that there are other factors that affect the salary of a new grad

In [32]:
# Salary distribution per job title: violin plot with every response drawn as a point
fig = px.violin(
    data_frame=df,
    x="job_title",
    y="salary",
    points="all",
    color_discrete_sequence=["#1f77b4"],
).update_layout(
    title="Salary by Job Title",
    xaxis_title="Job Title",
    yaxis_title="Salary",
)
fig.show()

Generally, data scientists earn the most, followed by software engineers and then the other roles. This is expected, as data scientists usually fill niche roles and are in high demand. QA engineers earn the least

If I were to pick a job based on salary alone, I'd pick data scientist. However, when considering other factors such as the availability of jobs, I'd pick software engineer, as it's the most common role and has a high salary

In [33]:
# Salary distribution per company type: violin plot with every response drawn as a point
violin_options = dict(
    x="company_type",
    y="salary",
    points="all",
    color_discrete_sequence=["lightskyblue"],
)
fig = px.violin(df, **violin_options)
fig.update_layout(
    title="Salary by Company type",
    xaxis_title="Company type",
    yaxis_title="Salary",
)
fig.show()

- Semi-governmental companies pay new grads the most
- Private companies' salary distribution is more spread out but competes with the semi-governmental companies
- There aren't enough data points to draw a conclusion about governmental jobs

In [34]:
# GPA distribution per company type: violin plot with every response drawn as a point
layout_labels = {
    "title": "GPA by Company type",
    "xaxis_title": "Company type",
    "yaxis_title": "GPA",
}
fig = px.violin(
    df,
    x="company_type",
    y="gpa",
    points="all",
    color_discrete_sequence=["lightskyblue"],
)
fig.update_layout(**layout_labels)
fig.show()

There isn't a clear relationship between GPA and company type. All company types have a similar distribution of GPA

In [35]:
# Response counts: company type (rows) x thinking_to_leave answer (columns).
# The gradient is computed per row (axis=1).
leave_by_company_type = pd.crosstab(
    index=df["company_type"],
    columns=df["thinking_to_leave"],
)
leave_by_company_type.style.background_gradient(cmap="Blues", axis=1)

thinking_to_leave,لا,ممكن,نعم
company_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
حكومي,1,0,1
خاص,4,6,1
شبه حكومي,5,1,2


Usually people are satisfied with semi-governmental companies and don't want to leave, followed by private companies and then governmental companies (with only two governmental responses, that last ranking is tentative)

In [36]:
# Response counts: experience bracket (rows) x thinking_to_leave answer (columns).
# The gradient is computed per row (axis=1).
leave_by_experience = pd.crosstab(
    index=df["experience"],
    columns=df["thinking_to_leave"],
)
leave_by_experience.style.background_gradient(cmap="Blues", axis=1)


thinking_to_leave,لا,ممكن,نعم
experience,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
أقل من 6 اشهر,1,5,1
بين 6 اشهر وسنه,7,1,3
بين سنه وسنه ونصف,2,1,0


After the first 6 months of working, people are generally satisfied with their jobs and don't want to leave

In [37]:
# Response counts: job title (rows) x thinking_to_leave answer (columns).
# The gradient is computed per row (axis=1).
leave_by_job_title = pd.crosstab(
    index=df["job_title"],
    columns=df["thinking_to_leave"],
)
leave_by_job_title.style.background_gradient(cmap="Blues", axis=1)

thinking_to_leave,لا,ممكن,نعم
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Business Analyst,1,1,0
Data Scientist,2,0,1
Other,2,0,0
Product,0,1,0
QA,0,2,0
Software Engineer,5,3,3


No job has a clear relationship with job satisfaction. Every job has a somewhat similar distribution of job satisfaction