# Bioinformatics survey - skills analysis

In [1]:
from typing import Dict, Set

import pandas as pd 
import numpy as np

import sys
import os

## Define path to data and constant variables

In [2]:
# paths
data_path = "./data"
# constant variables
SURVEY_CSV = "BioinformaticsSurvey2022.csv"

## Define functions used throughout the analysis

In [3]:
def compute_average_score(
    df: pd.DataFrame, elements: Set, term: str
) -> Dict[str, Dict[str, float]]:
    assert isinstance(df, pd.DataFrame)
    assert isinstance(elements, set)
    assert isinstance(term, str)
    score_dict = {}
    for e in elements:
        subset_e = df[df[term] == e]
        cols = subset_e.columns.tolist()[2:]
        score_dict[e] = {}
        for col in cols:
            avg_score = np.mean(
                [
                    val 
                    for val in subset_e[col].tolist() 
                    if str(val) != "nan" and not isinstance(val, str)
                ]
            )
            score_dict[e][col] = avg_score
    return score_dict

## Start the analysis

In [4]:
survey = pd.read_csv(os.path.join(data_path, SURVEY_CSV))
survey.head()

Unnamed: 0,Timestamp,How old are you?,What are your pronouns?,Where are you from?,Where in Italy are you studying/did you study?,Are you an off-site student?,What is your current position?,Is bioinformatics the main focus of your studies?,What is your current degree area? Please select the closest answer that applies.,Have you ever heard about bioinformatics?,...,"On a scale of 1 (very improbable) to 5 (most likely), where would you like to work when you finish your training as a bioinformatician? [Science Communication]","On a scale of 1 (very improbable) to 5 (most likely), where would you like to work when you finish your training as a bioinformatician? [Teaching]","On a scale of 1 (very improbable) to 5 (most likely), where would you like to work when you finish your training as a bioinformatician? [Entrepreneurship]","On a scale of 1 (very improbable) to 5 (most likely), where would you like to work when you finish your training as a bioinformatician? [Editorial and publishing services]",Where in the world do you see yourself working? Select one or multiple answers.,Do you think that a mentoring program could help you in better design your career path?,How did you come across this survey?,Had you heard about ISCB before this survey?,Had you heard about RSG-Italy before this survey?,Your comments/advice:
0,2022/03/24 4:42:29 pm CET,23-26,He/him,"Northern-West Italy (Valle d'Aosta, Liguria, L...","Northern-West Italy (Valle d'Aosta, Liguria, L...",No,Master's student,Yes,,,...,1.0,1.0,1.0,1.0,Italy;Europe;North America,"Yes, I feel confused about job opportunities",RSG-Italy Telegram channel,Yes,Yes,
1,2022/03/24 4:44:54 pm CET,23-26,He/him,"Northern-East Italy (Trentino-Alto Adige, Vene...","Northern-East Italy (Trentino-Alto Adige, Vene...",Yes,Master's student,No,Computer Science,"Yes, I'm familiar with the term",...,,,,,,,RSG-Italy Telegram channel,No,Yes,"In my opinion, there should be more bachelors ..."
2,2022/03/24 4:52:42 pm CET,23-26,He/him,"Northern-West Italy (Valle d'Aosta, Liguria, L...","Northern-East Italy (Trentino-Alto Adige, Vene...",No,PhD student (either in academia or industry),Yes,,,...,4.0,3.0,3.0,3.0,Italy;Europe;North America,"No, I think I will figure it out as I gain exp...",From friends/colleagues,Yes,Yes,
3,2022/03/24 4:54:39 pm CET,23-26,She/her,"Southern Italy (Abruzzo, Molise, Campania, Pug...","Northern-East Italy (Trentino-Alto Adige, Vene...",Yes,Master's student,Yes,,,...,1.0,2.0,2.0,1.0,Italy;Europe;North America;Oceania,"Yes, I feel confused about job opportunities",RSG-Italy Telegram channel,Yes,Yes,
4,2022/03/24 5:08:20 pm CET,27-30,He/him,"Northern-East Italy (Trentino-Alto Adige, Vene...","Northern-East Italy (Trentino-Alto Adige, Vene...",No,PhD student (either in academia or industry),Yes,,,...,1.0,1.0,3.0,1.0,Europe;North America,"No, I think I will figure it out as I gain exp...",RSG-Italy Telegram channel,Yes,Yes,


In [5]:
# subset the survey to consider only skills data
survey_skills = survey.iloc[:, [25, 26] + list(range(36, 62))]
# rename skill columns
cols = survey_skills.columns.tolist()
skill_cols = cols[2:]
columns = [col.split("[")[1].replace("]", "") for col in skill_cols]
survey_skills.columns = ["University", "Degree class"] + columns
survey_skills.head()


Unnamed: 0,University,Degree class,"Bioinformatics algorithms (Genome assembly, alignment, variant calling, …)",Analysis of genomics data,Analysis of transcriptomics data,Analysis of proteomics data,Analysis of metabolomics data,Structural bioinformatics,Database development,Microscopy and imaging data analysis,...,SQL/NoSQL database development,High Performance Computing cluster usage,LINUX environment usage,"Version control systems usage (GitHub, GitLab, …)","Workflow languages (Nextflow, Snakemake, WDL, …)","Containers usage (Docker, Singularity, …)","Package management systems (Conda, …)","IDE usage (Visual Studio Code, Pycharm, RStudio, …)",Statistical/Benchmarking analyses,Data FAIRness
0,University of Milano La Statale,"LM-8 Biotecnologie industriali (Trento, Milano)","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet",5.0,...,"I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet"
1,,,,,,,,,,,...,,,,,,,,,,
2,University of Verona,Other,3,4,2,4,1,2,3,2.0,...,3,4,5,2,1,2,2,1,3,2
3,University of Trento,"LM-8 Biotecnologie industriali (Trento, Milano)",5,5,4,4,3,4,1,2.0,...,"I do not know, I have not finished my studies yet",1,2,1,1,1,1,5,3,1
4,University of Milano La Statale,L-02 Biotecnologie (Rome La Sapienza),2,2,2,2,2,2,2,2.0,...,2,3,3,2,2,2,2,2,2,2


In [6]:
# remove rows with all nan values
survey_skills.dropna(axis=0, how="all", inplace=True)
survey_skills.reset_index(drop=True, inplace=True)
survey_skills.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,University,Degree class,"Bioinformatics algorithms (Genome assembly, alignment, variant calling, …)",Analysis of genomics data,Analysis of transcriptomics data,Analysis of proteomics data,Analysis of metabolomics data,Structural bioinformatics,Database development,Microscopy and imaging data analysis,...,SQL/NoSQL database development,High Performance Computing cluster usage,LINUX environment usage,"Version control systems usage (GitHub, GitLab, …)","Workflow languages (Nextflow, Snakemake, WDL, …)","Containers usage (Docker, Singularity, …)","Package management systems (Conda, …)","IDE usage (Visual Studio Code, Pycharm, RStudio, …)",Statistical/Benchmarking analyses,Data FAIRness
0,University of Milano La Statale,"LM-8 Biotecnologie industriali (Trento, Milano)","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet","do not know, I have not finished my studies yet",5,...,"I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet","I do not know, I have not finished my studies yet"
1,University of Verona,Other,3,4,2,4,1,2,3,2,...,3,4,5,2,1,2,2,1,3,2
2,University of Trento,"LM-8 Biotecnologie industriali (Trento, Milano)",5,5,4,4,3,4,1,2,...,"I do not know, I have not finished my studies yet",1,2,1,1,1,1,5,3,1
3,University of Milano La Statale,L-02 Biotecnologie (Rome La Sapienza),2,2,2,2,2,2,2,2,...,2,3,3,2,2,2,2,2,2,2
4,University of Trento,"LM-8 Biotecnologie industriali (Trento, Milano)",4,3,2,1,1,3,2,1,...,1,2,4,2,1,1,4,2,2,1


In [None]:
# change 

In [7]:
# compute average score per affiliation
universities = set(survey_skills["University"].tolist())
compute_average_score(survey_skills, universities, "University")

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'University of Milano La Statale': {'Bioinformatics algorithms (Genome assembly, alignment, variant calling, …)': nan,
  'Analysis of genomics data': nan,
  'Analysis of transcriptomics data': nan,
  'Analysis of proteomics data': nan,
  'Analysis of metabolomics data': nan,
  'Structural bioinformatics': nan,
  'Database development': nan,
  'Microscopy and imaging data analysis': nan,
  'Network & systems biology': nan,
  'Phylogenetics': nan,
  'Drug design & development': nan,
  'Mathematical modelling': nan,
  'Software & tools development': nan,
  'Statistics and probability': nan,
  'Artificial intelligence (machine learning and deep learning)': nan,
  'Programming languages (R, Bash, Python, C++, …)': nan,
  'SQL/NoSQL database development': nan,
  'High Performance Computing cluster usage': nan,
  'LINUX environment usage': nan,
  'Version control systems usage (GitHub, GitLab, …)': nan,
  'Workflow languages (Nextflow, Snakemake, WDL, …)': nan,
  'Containers usage (Docker, S