diff --git a/docs/about.md b/docs/about.md index 1d92585..51196d5 100644 --- a/docs/about.md +++ b/docs/about.md @@ -1,9 +1,13 @@ --- +title: RAP Community of Practice + +tags: + - RAP CoP website hide: - navigation --- -# RAP Community of Practice +# > **This material is maintained by the [NHS England RAP Community of Practice](mailto:england.rapchampions@nhs.net)**. diff --git a/docs/example_RAP_CoP_page.md b/docs/example_RAP_CoP_page.md index 067101e..0857647 100644 --- a/docs/example_RAP_CoP_page.md +++ b/docs/example_RAP_CoP_page.md @@ -1,113 +1,121 @@ -# Page Template - -## Some introductory subtitle - -!!! tip "TLDR" - - **very brief** summary of the main findings - - any key links i.e. to forms or other things people need fill in - - try to keep it to just three - -??? question "Why should we care?" - - Brief summary of why this is important - - any key links of background - - We can have a bigger section on this below - -??? success "Pre-requisites" - * Some information on what someone might need to be familiar with before they can use this page - - |Pre-requisite | Importance | Note | - |--------------|------------|------| - |[Some link to some other guide we have](https://nhsdigital.github.io/rap-community-of-practice/)|Necessary/Helpful|Any comment we have on this| - |some other guide|Helpful|another note| - -!!! info inline end - XKCD comics can also be great at grabbing attention: - - ![An amusing comic about marketing](https://imgs.xkcd.com/comics/immune_system.png "An amusing comic about marketing") - -**Don't forget to update the `mkdocs.yml` file to add this page, so it appears in the nav bar!** - -Here we need some bit explaining the background of the thing the page is talking about - -- keep it brief -- make it clear what it is and what the benefit is -- don't go into details of the methods, but perhaps highlight some of the key approaches described below - -## First subtitle of the main content - -Talk about the issue break it down into steps. 
- -We might even include a little diagram: - -```mermaid -graph LR - A[Have an idea] --> B{Make a page}; - B -->|Pull Request| C[Colleagues Review]; - C --> D[Feedback]; - D --> B; - B ----->|Approved| E[Publish!]; -``` - -Consider linking to other pages and try and extract the general concept from language specific implementations, i.e. we could have a pager about "functions", and then link to specific pages on how to do functions in Python and R. - -Also, have a look on the following pages to see if they have guidance we could link to, or adapt -- [Quality Assurance of Code for Analytics](https://best-practice-and-impact.github.io/qa-of-code-guidance/intro.html) -- [Turing Way](https://the-turing-way.netlify.app/welcome.html) -- [Central RAP Guidance from GSS](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/) - -## General subsection template - -Write some content here - -### General subsubsection template - -This might be on some specific aspect of the subsection - -#### General subsubsubsection template - -We might need to get even more specific, but probably wont use this as much! - -## Further subtitles - -You can include code snippets `inline` or in blocks: - -```Python - print("hello world") -``` - -You might also want to hide large code snippets: - -??? example "Some big code snippet" - ```Python - print("HA, I lied, it's only a small code snippet") - ``` - -!!! note - Admonition blocks can be helpful to bring out key points - - See [mkdocs guidance]() - -## Further subsections - -Continue to work through the subject, but we don't have to make the pages long - a short page can be just as useful! 
- -You can include pictures, however referencing them requires a few steps back in the directory tree (see below): - -![image-alt-text](images/add_file.PNG "Some random picture") - -You can also have tabs: - -=== "Tab 1" - We can put whatever we want in here - ```Python - def somefunc(a): - return None - ``` - -=== "Tab 2" - And in here something completely different, such as a diagram - ![alt text](images/branch_info.JPG "Some random picture") - -## Further Reading - -- Provide any useful links people might need to further their learning +--- +title: Add your title here + +# Uncomment and add your tags here +# tags: +# - +--- + +# + +## Some introductory subtitle + +!!! tip "TLDR" + - **very brief** summary of the main findings + - any key links i.e. to forms or other things people need fill in + - try to keep it to just three + +??? question "Why should we care?" + - Brief summary of why this is important + - any key links of background + - We can have a bigger section on this below + +??? success "Pre-requisites" + * Some information on what someone might need to be familiar with before they can use this page + + |Pre-requisite | Importance | Note | + |--------------|------------|------| + |[Some link to some other guide we have](https://nhsdigital.github.io/rap-community-of-practice/)|Necessary/Helpful|Any comment we have on this| + |some other guide|Helpful|another note| + +!!! 
info inline end + XKCD comics can also be great at grabbing attention: + + ![An amusing comic about marketing](https://imgs.xkcd.com/comics/immune_system.png "An amusing comic about marketing") + +**Don't forget to update the `mkdocs.yml` file to add this page, so it appears in the nav bar!** + +Here we need some bit explaining the background of the thing the page is talking about + +- keep it brief +- make it clear what it is and what the benefit is +- don't go into details of the methods, but perhaps highlight some of the key approaches described below + +## First subtitle of the main content + +Talk about the issue and break it down into steps. + +We might even include a little diagram: + +```mermaid +graph LR + A[Have an idea] --> B{Make a page}; + B -->|Pull Request| C[Colleagues Review]; + C --> D[Feedback]; + D --> B; + B ----->|Approved| E[Publish!]; +``` + +Consider linking to other pages and try and extract the general concept from language specific implementations, i.e. we could have a page about "functions", and then link to specific pages on how to do functions in Python and R. + +Also, have a look at the following pages to see if they have guidance we could link to, or adapt +- [Quality Assurance of Code for Analytics](https://best-practice-and-impact.github.io/qa-of-code-guidance/intro.html) +- [Turing Way](https://the-turing-way.netlify.app/welcome.html) +- [Central RAP Guidance from GSS](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/) + +## General subsection template + +Write some content here + +### General subsubsection template + +This might be on some specific aspect of the subsection + +#### General subsubsubsection template + +We might need to get even more specific, but probably won't use this as much! + +## Further subtitles + +You can include code snippets `inline` or in blocks: + +```Python + print("hello world") +``` + +You might also want to hide large code snippets: + +??? 
example "Some big code snippet" + ```Python + print("HA, I lied, it's only a small code snippet") + ``` + +!!! note + Admonition blocks can be helpful to bring out key points + + See [mkdocs guidance](https://squidfunk.github.io/mkdocs-material/reference/admonitions/) + +## Further subsections + +Continue to work through the subject, but we don't have to make the pages long - a short page can be just as useful! + +You can include pictures, however referencing them requires a few steps back in the directory tree (see below): + +![image-alt-text](images/add_file.PNG "Some random picture") + +You can also have tabs: + +=== "Tab 1" + We can put whatever we want in here + ```Python + def somefunc(a): + return None + ``` + +=== "Tab 2" + And in here something completely different, such as a diagram + ![alt text](images/branch_info.JPG "Some random picture") + +## Further Reading + +- Provide any useful links people might need to further their learning diff --git a/docs/glossary.md b/docs/glossary.md index 4fe3b9a..d9a7f7a 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -1,25 +1,12 @@ --- +title: Glossary +tags: + - Glossary hide: - navigation --- -## RAP - -RAP stands for Reproducible Analytical Pipelines. The term comes from UK public sector data scientists - you can [read the ONS description here](https://datasciencecampus.ons.gov.uk/capability/data-science-campus-faculty/reproducible-analytical-pipeline-journey/#:~:text=Reproducible%20Analytical%20Pipelines%20are%20programs,impressive%20efficiencies%20in%20your%20teams.). We also have a page on [why RAP is important](introduction_to_RAP/why_RAP_is_important.md) - -## Pipeline - -A data pipeline is a series of steps or processes that are used to collect, process, and transform data from various sources into a format that can be easily used. The pipeline typically includes data ingestion, data cleaning, and some analysis. A good pipeline can automate work as well as improving the quality of the outcomes. 
- -## Virtual Environments - -Virtual environments are a way to ensure that you have maximum control over the code that you're writing and how it will run. Many software packages interact with one another, and not always in a good or predictable way. Virtual environments allow you to develop code like it is being done on a completely clean, separate machine. In a virtual environment, you can install whatever packages you want, at whatever version, without worrying about affecting the other software or projects you might have on your computer. - -Best practice recommends that we create a different virtual environment for each project that we work on. - -## Venv - -`venv` is a particular package for managing virtual environments in Python, which comes pre-installed with python. We highly recommend using this package, although others are available. See [our page](training_resources/python/virtual-environments/venv.md) for advice on using it. +# ## Conda @@ -35,3 +22,21 @@ You can read more about git on our [introduction to Git](training_resources/git/ An IDE (Integrated Development Environment) is a piece of software you can use to write code. You can write code anywhere that you can write plain text, but IDEs are designed to help with the process. It doesn't affect how the code will run, and you can move the code safely between IDEs. IDEs are packed with useful features like autocompletion, test suites, git integration, linting and more. We recommend Visual Studio Code, but you can also try PyCham, Spyder, or Eclipse - to name a few. + +## Pipeline + +A data pipeline is a series of steps or processes that are used to collect, process, and transform data from various sources into a format that can be easily used. The pipeline typically includes data ingestion, data cleaning, and some analysis. A good pipeline can automate work as well as improving the quality of the outcomes. + +## RAP + +RAP stands for Reproducible Analytical Pipelines. 
The term comes from UK public sector data scientists - you can [read the ONS description here](https://datasciencecampus.ons.gov.uk/capability/data-science-campus-faculty/reproducible-analytical-pipeline-journey/#:~:text=Reproducible%20Analytical%20Pipelines%20are%20programs,impressive%20efficiencies%20in%20your%20teams.). We also have a page on [why RAP is important](introduction_to_RAP/why_RAP_is_important.md) + +## Venv + +`venv` is a particular package for managing virtual environments in Python, which comes pre-installed with python. We highly recommend using this package, although others are available. See [our page](training_resources/python/virtual-environments/venv.md) for advice on using it. + +## Virtual Environments + +Virtual environments are a way to ensure that you have maximum control over the code that you're writing and how it will run. Many software packages interact with one another, and not always in a good or predictable way. Virtual environments allow you to develop code like it is being done on a completely clean, separate machine. In a virtual environment, you can install whatever packages you want, at whatever version, without worrying about affecting the other software or projects you might have on your computer. + +Best practice recommends that we create a different virtual environment for each project that we work on. 
\ No newline at end of file diff --git a/docs/images/JS_network.png b/docs/images/JS_network.png new file mode 100644 index 0000000..e66be2c Binary files /dev/null and b/docs/images/JS_network.png differ diff --git a/docs/images/chi-onwurah-pmq-rap.png b/docs/images/chi-onwurah-pmq-rap.png new file mode 100644 index 0000000..561bb17 Binary files /dev/null and b/docs/images/chi-onwurah-pmq-rap.png differ diff --git a/docs/images/clipart_copy.png b/docs/images/clipart_copy.png new file mode 100644 index 0000000..9e6710d Binary files /dev/null and b/docs/images/clipart_copy.png differ diff --git a/docs/images/clipart_magglass.png b/docs/images/clipart_magglass.png new file mode 100644 index 0000000..e0ddb72 Binary files /dev/null and b/docs/images/clipart_magglass.png differ diff --git a/docs/images/clipart_map.png b/docs/images/clipart_map.png new file mode 100644 index 0000000..299c483 Binary files /dev/null and b/docs/images/clipart_map.png differ diff --git a/docs/images/code_review.jpg b/docs/images/code_review.jpg new file mode 100644 index 0000000..83866f3 Binary files /dev/null and b/docs/images/code_review.jpg differ diff --git a/docs/images/copy_image.png b/docs/images/copy_image.png new file mode 100644 index 0000000..e89f7d5 Binary files /dev/null and b/docs/images/copy_image.png differ diff --git a/docs/images/flowers.png b/docs/images/flowers.png new file mode 100644 index 0000000..4295dc5 Binary files /dev/null and b/docs/images/flowers.png differ diff --git a/docs/images/magnifying_glass.png b/docs/images/magnifying_glass.png new file mode 100644 index 0000000..4e2b076 Binary files /dev/null and b/docs/images/magnifying_glass.png differ diff --git a/docs/images/manual_version_control.jpg b/docs/images/manual_version_control.jpg new file mode 100644 index 0000000..d7ba8be Binary files /dev/null and b/docs/images/manual_version_control.jpg differ diff --git a/docs/images/network.png b/docs/images/network.png new file mode 100644 index 0000000..f2299b9 
Binary files /dev/null and b/docs/images/network.png differ diff --git a/docs/images/the-rap-journey-long-with-text.png b/docs/images/the-rap-journey-long-with-text.png new file mode 100644 index 0000000..03b91bc Binary files /dev/null and b/docs/images/the-rap-journey-long-with-text.png differ diff --git a/docs/images/your-country-needs-you-to-rap.jpg b/docs/images/your-country-needs-you-to-rap.jpg new file mode 100644 index 0000000..91b889e Binary files /dev/null and b/docs/images/your-country-needs-you-to-rap.jpg differ diff --git a/docs/implementing_RAP/accessibility-how-to.md b/docs/implementing_RAP/accessibility-how-to.md index dfeccae..ad971e7 100644 --- a/docs/implementing_RAP/accessibility-how-to.md +++ b/docs/implementing_RAP/accessibility-how-to.md @@ -1,4 +1,11 @@ -# Guidance on Accessibility +--- +title: Guidance on Accessibility + +tags: + - Accessibility +--- + +# > _The power of the Web is in its universality. Access by everyone regardless of disability is an essential aspect._
**Tim Berners-Lee, W3C Director and inventor of the World Wide Web** diff --git a/docs/implementing_RAP/code-review.md b/docs/implementing_RAP/code-review.md index 6ee4f89..b01e6bd 100644 --- a/docs/implementing_RAP/code-review.md +++ b/docs/implementing_RAP/code-review.md @@ -1,4 +1,12 @@ -# Code review +--- +title: Code review + +tags: + - Code reviews + - Workflow +--- + +# [![link to code quality comic image](https://imgs.xkcd.com/comics/code_quality.png)](https://xkcd.com/1513) diff --git a/docs/implementing_RAP/coding-best-practice.md b/docs/implementing_RAP/coding-best-practice.md index 2a95410..ce817e0 100644 --- a/docs/implementing_RAP/coding-best-practice.md +++ b/docs/implementing_RAP/coding-best-practice.md @@ -1,10 +1,14 @@ --- -#hide table of contents whitespace -hide: toc ---- +title: Coding Best Practice + +tags: + - Coding tips -# Coding Best Practice +hide: + - toc +--- +# You can find guidance on coding best practice at these pages: General Code Development diff --git a/docs/implementing_RAP/example-pipeline.md b/docs/implementing_RAP/example-pipeline.md index 4784383..ddc16fd 100644 --- a/docs/implementing_RAP/example-pipeline.md +++ b/docs/implementing_RAP/example-pipeline.md @@ -1,4 +1,17 @@ -# Example Pipeline (Python) +--- +title: Example Pipeline (Python) + +tags: + - Python + - Thin slice strategy + - Process mapping + - Project structure + - Loose coupling + - Open-source + - Coding tips +--- + +# !!! 
tip "TLDR" - This is an [example of a pipeline](https://github.com/NHSDigital/RAP_example_pipeline_python) made in Python with RAP principles in mind diff --git a/docs/implementing_RAP/how-to-publish-your-code-in-the-open.md b/docs/implementing_RAP/how-to-publish-your-code-in-the-open.md index 3b90285..b8c6cf2 100644 --- a/docs/implementing_RAP/how-to-publish-your-code-in-the-open.md +++ b/docs/implementing_RAP/how-to-publish-your-code-in-the-open.md @@ -1,4 +1,15 @@ -# How to Publish your Code in the Open +--- +title: How to Publish your Code in the Open + +tags: + - Publishing code + - Transparency + - Workflow + - GitHub + - GitHub topics +--- + +# In NHS Digital we have committed to publishing more and more of our code over time to **improve the transparency** of our analytical work. @@ -72,4 +83,4 @@ Once a new publication's repository is published on GitHub, feel free to update - [The benefits of coding in the open](https://gds.blog.gov.uk/2017/09/04/the-benefits-of-coding-in-the-open/) - [Open source repositories by the Government Digital Service](https://github.com/alphagov) -*NHS England is not affiliated with any of these websites or companies.* \ No newline at end of file +*NHS England is not affiliated with any of these websites or companies.* diff --git a/docs/implementing_RAP/notebooks_versus_ide_development.md b/docs/implementing_RAP/notebooks_versus_ide_development.md index f095300..6cecf2c 100644 --- a/docs/implementing_RAP/notebooks_versus_ide_development.md +++ b/docs/implementing_RAP/notebooks_versus_ide_development.md @@ -1,4 +1,19 @@ -# Development tools +--- +title: Development tools + +tags: + - Workflow + - Notebooks + - IDEs + - Jupyter + - Databricks + - Version control + - Git + - Testing + - Unit testing +--- + +# ## What is this guide for? 
diff --git a/docs/implementing_RAP/process_mapping.md b/docs/implementing_RAP/process_mapping.md index d5602ed..530d5d5 100644 --- a/docs/implementing_RAP/process_mapping.md +++ b/docs/implementing_RAP/process_mapping.md @@ -1,4 +1,13 @@ -# Processing Mapping +--- +title: Process Mapping + +tags: + - Process mapping + - Preparing for RAP + - RAP project management +--- + +# ## How to Describe your Processes @@ -130,4 +139,4 @@ There are loads of tools for process mapping though, and some options be [Miro][ [7]: ../our_RAP_service/thin-slice-strategy.md [8]:https://miro.com/ [9]:https://app.diagrams.net/ -[10]:https://www.lucidchart.com/pages/ \ No newline at end of file +[10]:https://www.lucidchart.com/pages/ diff --git a/docs/implementing_RAP/quality-assuring-analytical-outputs.md b/docs/implementing_RAP/quality-assuring-analytical-outputs.md index 9706974..a7985e4 100644 --- a/docs/implementing_RAP/quality-assuring-analytical-outputs.md +++ b/docs/implementing_RAP/quality-assuring-analytical-outputs.md @@ -1,4 +1,13 @@ -# Quality Assuring Analytical Outputs +--- +title: Quality Assuring Analytical Outputs + +tags: + - Preparing for RAP + - RAP project management + - Quality assurance +--- + +# ## RAP Quality Assurance Plan @@ -12,8 +21,6 @@ You can also create the QA log to audit if the QA plan has been followed – tra Here is a quality assurance checklist which we adapted from [ONS's Quality Assurance of Code for Analysis and Research](https://best-practice-and-impact.github.io/qa-of-code-guidance/checklist_higher.html) and updated to apply in Data Science team for some previous projects. You can select the relevant steps for your project depending on its complexity and the required level of quality assurance. -> (you can check the boxes below by adding an 'x' in between the square braces: `- [ ]` becomes `- [x]` in your markdown) - ### Governance and IG - [ ] Do we have an approved commission? 
diff --git a/docs/implementing_RAP/rap-readiness.md b/docs/implementing_RAP/rap-readiness.md index 923f17f..f63b912 100644 --- a/docs/implementing_RAP/rap-readiness.md +++ b/docs/implementing_RAP/rap-readiness.md @@ -1,4 +1,12 @@ -# Ready for RAP? +--- +title: Ready for RAP? + +tags: + - Preparing for RAP + - RAP project management +--- + +# RAPs' yield long-term benefits for the teams that use them. They are robust against errors, staff turnover, and changing outputs; they reduce human effort, computing time, and complexity; they save money through using open-source software, increasing re-usability, and lowering resource requirements. diff --git a/docs/implementing_RAP/skills_for_rap/git_for_rap.md b/docs/implementing_RAP/skills_for_rap/git_for_rap.md new file mode 100644 index 0000000..1329756 --- /dev/null +++ b/docs/implementing_RAP/skills_for_rap/git_for_rap.md @@ -0,0 +1,52 @@ +--- +title: Git Basics for RAP + +tags: + - Git + - Preparing for RAP + - Version control +--- + +# + +Before you embark on your RAP journey, we think it's best for you to have some basic knowledge and ability in using version control and specifically Git and GitHub. + +??? expand "Essential Git Skills for RAP Checklist" + **I can...** + + - [ ] Clone/Create a repository + - [ ] Pull changes + - [ ] Make a branch + - [ ] Checkout a branch + - [ ] Check the status of a branch + - [ ] Add changes to the staging area + - [ ] Commit changes + - [ ] Push changes + - [ ] Submit a Pull/Merge request + +??? expand "Test your Git Skills" + **Want to try out your Git skills or get some practice? Try our Git test:** + + - [ ] Create a repository in your GitHub account with a blank README.md file + - [ ] Add your name to the README.md file by creating a branch, adding your changes, committing, and pushing them. + - [ ] Create a Pull Request and approve it, merging your changes into the main branch. + + +=== "**Git Master**" + If you're a Git master and feel confident in your skills, great! 
Offer to buddy up with a Git Newbie to help them on their journey and get everyone quickly up to speed. + + +=== "**Git Apprentice**" + Used Git before, but you're not sure if your skills are up to par with what's needed for RAP? Try our checklist above of what we consider the basic Git ability needed and if you come across something you don't understand, go through our [list of common Git commands](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/introduction-to-git/#common-git-commands) to learn more. Practice your Git skills by doing our Git Test above. + + +=== "**Git Newbie**" + Wondering what on earth is a Git, and what does it have to do with pipelines? Read on to quickly get up to speed on all things Git! Once you've read through these pages, try out our Git Test above to put your new skills to the test. + + - [What is Git and Version Control?](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/introduction-to-git/) + - [Git Quick Start Guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/quick_start_guides/git_quick_start_guide/) + - [GitHub Quick Start Guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/quick_start_guides/github_quick_start_guide/) + - [Clone, Stage, Commit Walkthrough](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/git_walkthroughs/committing_work_walkthrough/) + - [Branching Walkthrough](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/git_walkthroughs/working_with_branches_walkthrough/) + - [Pull and Merge Request Walkthrough](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/git_walkthroughs/pull_and_merge_requests_walkthrough/) + - Feel free to reach out and ask for help if you're confused, the NHS England RAP MS Teams page (internal to NHS England) has lots of people willing to give support, or check out some other options 
listed on our [Support](/support) page. \ No newline at end of file diff --git a/docs/implementing_RAP/skills_for_rap/python_for_rap.md b/docs/implementing_RAP/skills_for_rap/python_for_rap.md new file mode 100644 index 0000000..1bf9d58 --- /dev/null +++ b/docs/implementing_RAP/skills_for_rap/python_for_rap.md @@ -0,0 +1,59 @@ +--- +title: Python for RAP + +tags: + - Python + - Preparing for RAP + - Coding tips +--- + +# + +Before you embark on your RAP journey, we think it's best for you to have some basic skills in the open source language that you are going to build the pipeline in. If that language is Python, read on to find out if you have the skills needed already. + +We have some guidance on our site for some of the more intermediate skills, linked in the checklist, but to learn the basics we recommend checking out the links at the bottom of the page. + + +??? expand "Essential Python Skills for RAP Checklist" + **I can...** + + - [ ] Use an Integrated Development Environment, like VSCode or Spyder, to run Python files + - [ ] Create print statements + - [ ] Use built-in data types (strings, ints, floats, lists, arrays) to declare variables + - [ ] Perform basic arithmetic operations and string manipulation + - [ ] Create comments in my code to explain why I'm performing certain operations + - [ ] Use conditional logic in if/else/elif statements + - [ ] Use loops (for/while) for iterative processes + - [ ] Import packages, like numpy or pandas + - [ ] Set up [virtual environments](/training_resources/python/virtual-environments/why-use-virtual-environments/) to manage my packages + - [ ] Perform [simple data analysis operations](/training_resources/python/basic-python-data-analysis-operations/) using pandas, pyspark or numpy + - [ ] Read in data + - [ ] Extract column names and types + - [ ] Filter a dataframe based on a certain condition + - [ ] Join two dataframes together + - [ ] Group data based on specific criteria + - [ ] Export data + - [ ] Wrap 
code up in a [function](/training_resources/python/python-functions/) so it can be reused + +??? expand "Desirable Python Skills for RAP Checklist" + **I can...** + + - [ ] Use [config files](/training_resources/python/config-files/) + - [ ] [Handle errors](/training_resources/python/logging-and-error-handling/) effectively + - [ ] Write and perform [unit tests](/training_resources/python/unit-testing/) + - [ ] [Refactor code](/training_resources/refactoring-guide/) to improve it + + +=== "**Python Master**" + If you've used Python before and are confident in your abilities - great! Be aware that some in your team might not have the skills you do and might need some extra help with the basics. Offer to buddy up with some of the Python Newbies if they're struggling so everyone can quickly get up to scratch. Check out the Desirable Skills checklist above in case there is something you could brush up on. + +=== "**Python Apprentice**" + Have you got some familiarity with Python but you're not sure if you know enough for RAP? Try out the essential skills checklist above - you will learn a lot from implementing RAP, but having these basic skills will allow everyone to get off to a flying start. You probably know more than you think! + +=== "**Python Newbie**" + Never even heard of Python? No worries, there is extensive Python training out there to get you off on the right foot. Don't be afraid to reach out to some Python Masters in your team if you need help understanding something - there's no such thing as a stupid question! Read our [intro to Python](/training_resources/python/intro-to-python/) guide to gain a high-level overview, and then we recommend checking out some of the resources below. Come back and tick off what you know on the checklist so you can tailor your learning to the important stuff! 
+ + - [Kaggle introduction to Python](https://www.kaggle.com/learn/python) + - [Codecademy Python Courses](https://www.codecademy.com/catalog/language/python) + - [Govt Analysis Function Introduction to Python](https://analysisfunction.civilservice.gov.uk/training/introduction-to-python/) + - [freeCodeCamp](https://www.freecodecamp.org), which has a course tailored to learning [data analysis with Python](https://www.freecodecamp.org/learn/data-analysis-with-python/) diff --git a/docs/implementing_RAP/skills_for_rap/r_for_rap.md b/docs/implementing_RAP/skills_for_rap/r_for_rap.md new file mode 100644 index 0000000..b8763ee --- /dev/null +++ b/docs/implementing_RAP/skills_for_rap/r_for_rap.md @@ -0,0 +1,54 @@ +--- +title: R for RAP + +tags: + - R + - Preparing for RAP + - Coding tips +--- + +# + +Before you embark on your RAP journey, we think it's best for you to have some basic skills in the open source language that you are going to build the pipeline in. If that language is R, read on to find out if you have the skills needed already. + +Unlike for Python, we don't have much training material for R, as we mostly use Python, but [check out the links](/training_resources/R/) we have for some great guidance written by others. + + +??? 
expand "Essential R Skills for RAP Checklist" + **I can...** + + - [ ] Use an Integrated Development Environment, like RStudio or Visual Studio Code, to run R files + - [ ] Create print statements + - [ ] Use built-in data types (numeric, integer, character, logical) to declare variables + - [ ] Perform basic arithmetic operations and string manipulation + - [ ] Create comments in my code to explain why I'm performing certain operations + - [ ] Use conditional logic in if/else/else if statements + - [ ] Use loops (for/while) for iterative processes + - [ ] Import libraries, like dplyr or ggplot2 + - [ ] Perform simple data analysis operations + - [ ] Read in data + - [ ] Extract column names and types + - [ ] Filter a dataframe based on a certain condition + - [ ] Join two dataframes together + - [ ] Group data based on specific criteria + - [ ] Export data + - [ ] Wrap code up in a function so it can be reused + +??? expand "Desirable R Skills for RAP Checklist" + **I can...** + + - [ ] Use config files + - [ ] Handle errors effectively + - [ ] Write and perform unit tests + - [ ] [Refactor code](/training_resources/refactoring-guide/) to improve it + - [ ] Use package management tools like Packrat or Pacman + + +=== "**R Master**" + If you've used R before and are confident in your abilities - great! Be aware that some in your team might not have the skills you do and might need some extra help with the basics. Offer to buddy up with some of the R Newbies if they're struggling so everyone can quickly get up to scratch. Check out the Desirable Skills checklist above in case there is something you could brush up on. + +=== "**R Apprentice**" + Have you got some familiarity with R but you're not sure if you know enough for RAP? Try out the essential skills checklist above - you will learn a lot from implementing RAP, but having these basic skills will allow everyone to get off to a flying start. You probably know more than you think! 
+ +=== "**R Newbie**" + Never even heard of R? No worries, there is extensive R training out there to get you off on the right foot. Don't be afraid to reach out to some R Masters in your team if you need help understanding something - there's no such thing as a stupid question! Check out our [links to R training](/training_resources/R/) and come back and tick off what you know on the checklist so you can tailor your learning to the important stuff! diff --git a/docs/implementing_RAP/technical-workflow.md b/docs/implementing_RAP/technical-workflow.md index c639691..38aa2d3 100644 --- a/docs/implementing_RAP/technical-workflow.md +++ b/docs/implementing_RAP/technical-workflow.md @@ -1,4 +1,16 @@ -# Technical Working Setup & Best Practices +--- +title: Technical Working Setup & Best Practices +summary: What's the best way to streamline your working practices and tools to improve your efficiency? Read on for some tips... +tags: + - Workflow + - Project structure + - Git + - Linting + - Conda + - Anaconda +--- + +# > **Note:** This guide (and most of this repository) is written primarily for Python development but some parts are generally applicable to other languages. diff --git a/docs/implementing_RAP/tidy-data.md b/docs/implementing_RAP/tidy-data.md index 84abca4..f032672 100644 --- a/docs/implementing_RAP/tidy-data.md +++ b/docs/implementing_RAP/tidy-data.md @@ -1,4 +1,11 @@ -# Tidy Data +--- +title: Tidy Data + +tags: + - Tidy data +--- + +# Adopting inconsistent data formats leads to a huge amount of wasted effort and can actually lead to very complex code. By adopting **tidy data format** for your work you can both improve your service to users and simplify your own production pipeline. 
diff --git a/docs/implementing_RAP/tools.md b/docs/implementing_RAP/tools.md index 0311b39..c19cd60 100644 --- a/docs/implementing_RAP/tools.md +++ b/docs/implementing_RAP/tools.md @@ -1,4 +1,16 @@ -# Workflow tools explained +--- +title: Workflow tools explained + +tags: + - Workflow + - Git + - Python + - VS Code + - Anaconda + - Conda +--- + +# Getting data science tools configured to your needs and working together is a core part of any data science project. Learning how to troubleshoot problems with these tools quickly is an important skill. @@ -223,12 +235,8 @@ To work with this integration, you must install the Jupyter package in your base You can then create and run Jupyter-like code cells, defined within Python code using a `# %%` comment: ```Python -# %% -msg = "Hello World" print(msg) -# %% -msg = "Hello again" print(msg) ``` diff --git a/docs/implementing_RAP/updating-your-published-code.md b/docs/implementing_RAP/updating-your-published-code.md index bcf7aab..1344be7 100644 --- a/docs/implementing_RAP/updating-your-published-code.md +++ b/docs/implementing_RAP/updating-your-published-code.md @@ -1,4 +1,12 @@ -# How to update your published code +--- +title: How to update your published code + +tags: + - Publishing code + - Transparency +--- + +# ## Introduction diff --git a/docs/implementing_RAP/when-to-stop-coding.md b/docs/implementing_RAP/when-to-stop-coding.md index 798d7eb..2c42095 100644 --- a/docs/implementing_RAP/when-to-stop-coding.md +++ b/docs/implementing_RAP/when-to-stop-coding.md @@ -1,4 +1,14 @@ -# When to Stop Coding? +--- +title: When to Stop Coding? + +tags: + - Coding tips + - Refactoring + - Levels of RAP + - Testing +--- + +# !!! 
tip "TLDR" diff --git a/docs/introduction_to_RAP/RAP_in_health.md b/docs/introduction_to_RAP/RAP_in_health.md index 75fe8c6..3861aec 100644 --- a/docs/introduction_to_RAP/RAP_in_health.md +++ b/docs/introduction_to_RAP/RAP_in_health.md @@ -1,4 +1,11 @@ -#RAP in Health +--- +title: RAP in Health +tags: + - RAP Playbook + - Policy +--- + +# When applying RAP to health data, unique challenges may arise, particularly concerning data safety. To support those currently engaged or interested in RAP within healthcare, NHS England has launched the [Health RAP Playbook]. This resource provides valuable advice and guidance tailored for individuals working specifically in health analytics. @@ -18,4 +25,4 @@ It's worth noting that the [Health RAP Playbook] is currently in its **alpha tes The team behind the playbook encourage users to [provide feedback] on the resource in order to improve the site. [Health RAP Playbook]: https://nhsengland.github.io/Health-RAP-Playbook-Alpha/ -[provide feedback]: mailto:datascience@nhs.net \ No newline at end of file +[provide feedback]: mailto:datascience@nhs.net diff --git a/docs/introduction_to_RAP/gov-policy-on-rap.md b/docs/introduction_to_RAP/gov-policy-on-rap.md new file mode 100644 index 0000000..1ce0347 --- /dev/null +++ b/docs/introduction_to_RAP/gov-policy-on-rap.md @@ -0,0 +1,109 @@ +# What does government policy say about RAP? + +!!! tip "TLDR" + - RAP is not just a good idea - it's fast becoming the **required** way of working with data + - The government, the Civil Service, and the NHS have all released documents pushing for the RAP way of working + - These include The Goldacre Report, Data Saves Lives, and the Civil Service RAP Strategy + +??? 
success "Pre-requisites" + + |Pre-requisite | Importance | Note | + |--------------|------------|------| + |[What is RAP?](what_is_RAP.md)|Helpful|Knowledge of RAP principles will help you understand why the government is pushing for it| + +![Image showing a military officer pointing at you. The caption reads: "YOUR COUNTRY NEEDS YOU - TO RAP"](../images/your-country-needs-you-to-rap.jpg) +{: .align-centre} + +The government wants you to adopt RAP principles in your analytical work. + +This is highlighted in a number of policy, strategy, review, and other documents pushing for the use of RAP in the UK public sector. + +Read on to find out how RAP is not just a good idea - it's fast becoming the **required** way of working with data. + +## The Goldacre review + +[Better, broader, safer: using health data for research and analysis](https://www.gov.uk/government/publications/better-broader-safer-using-health-data-for-research-and-analysis/better-broader-safer-using-health-data-for-research-and-analysis) aka "The Goldacre Review" is a 2022 report led by Prof. Ben Goldacre, offering guidance on how the NHS can better use, manage, store, and access its data. The 220-page report urges the adoption of RAP working principles, and offers detailed recommendations on how to do so. The report was fully supported by the Secretary of State for Health. + +**Key quotes:** + +The NHS can and should rapidly adopt RAP working practices, both for service analysis and for research. +{: .pquote .pquote--blue .pquote--inline .pquote--serif} + +- The principles of RAP are excellent, well thought through, and reflect a strong basic **minimum standard**. +- Promote and resource “Reproducible Analytical Pipelines” (RAP, a set of best practices and training created in GDS and ONS) as the minimum standard for academic and NHS data analysis. +- Data Controllers should require RAP and open code sharing from data users. 
+- TREs (Trusted Research Environments) themselves should be built on principles of RAP and open code. + +## Parliamentary Questions + +On 10 May 2022, Labour MP for Newcastle Upon Tyne Central [tabled a written question for parliament](https://questions-statements.parliament.uk/written-questions/detail/2022-05-10/243) on the use of RAP to the Department of Health and Social Care: + +*"To ask the Secretary of State for Health and Social Care, what steps he taking to promote and resource reproducible analytical pipelines as the minimum standard for academic and NHS data analysis, as recommended in the Goldacre review."* + +The question was answered on 24 May 2022 by Gillian Keegan, Conservative MP for Chichester, who replied: + +*"We are currently considering the recommendations of the Goldacre review. However, many of the review’s recommendations are aligned with existing programmes, such as facilitating Reproducible Analytical Pipelines (RAP) within the National Health Service and promoting RAP through NHS analyst communities."* + +![Screenshot of a parliamentary question in which Chi Onwurah, Labour MP for Newcastle Upon Tyne Central asks "To ask the Secretary of State for Health and Social Care, what steps he taking to promote and resource reproducible analytical pipelines as the minimum standard for academic and NHS data analysis, as recommended in the Goldacre review." The reply from Gillian Keegan, Conservative MP for Chichester was: "We are currently considering the recommendations of the Goldacre review. However, many of the review’s recommendations are aligned with existing programmes, such as facilitating Reproducible Analytical Pipelines (RAP) within the National Health Service and promoting RAP through NHS analyst communities." 
](../images/chi-onwurah-pmq-rap.png){width="650"} +{: .align-centre} + +## Data saves lives + +[Data saves lives: reshaping health and social care with data](https://www.gov.uk/government/publications/data-saves-lives-reshaping-health-and-social-care-with-data/data-saves-lives-reshaping-health-and-social-care-with-data), is a policy document released in 2022 laying out the government's plan to use data in healthcare. + +**Key quotes:** + +Recommendation 7: promote and resource ‘Reproducible Analytical Pathways’ (RAP, a set of best practices and training created in ONS) as the minimum standard for academic and NHS data analysis +{: .pquote .pquote--blue .pquote--inline .pquote--serif} + +- ...we will explore the impact and benefits of modern, open working methods of data management and analysis such as Reproducible Analytical Pipelines (RAPs), which is particularly recommended by the Goldacre review +- As we foster an increasingly open culture, **we will progressively ask for more open-source ways of working** in our procurement and contracts, with clear policies that build towards open by default across the NHS. +- We will begin to make new source code that we produce or commission **open and reusable by default** (with clear exceptions) and publish it under appropriate licences to encourage further innovation +- All accredited NHS secure data environments must adhere to a policy of open-working, support code-sharing and facilitate use of technology that supports this, such as RAPs. +- Recommendation 17: embrace modern, open working methods for NHS data analysis by **committing to Reproducible Analytical Pipelines (RAP) as the core working practice** that must be supported by all platforms and teams; make this a core focus of NHS analyst training. 
+ +## Secure Data Environment policy guidelines + +The [Secure data environment for NHS health and social care data](https://www.gov.uk/government/publications/secure-data-environment-policy-guidelines/secure-data-environment-for-nhs-health-and-social-care-data-policy-guidelines) policy guidelines expand on the recommendations made in Data Saves Lives pertaining to how secure data environments should be implemented. It includes RAP in its definition of the "Five Safes" - specifically under Safe Projects, as publishing our code is needed for the public to understand how we're using their data. + +**Key quotes:** + +Code developed in secure data environments must be published in the open unless there is a specific rationale for not doing so. +{: .pquote .pquote--blue .pquote--inline .pquote--serif} + +- Secure data environments must support open working, ensuring that code developed in these environments is reusable [for example] using the Reproducible Analytical Pipelines (RAP) strategy. + +## Civil service RAP strategy + +The Civil Service RAP team produced a [strategy report](https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/) outlining why and how government departments should adopt RAP working principles. + +**Key quotes:** + +Embedding RAP as the default approach to analysis in government is an essential step on the way to digital transformation of analysis. +{: .pquote .pquote--blue .pquote--inline .pquote--serif} + +- ...we know that giving analysts the capability needed for RAP **improves efficiency** and the **quality** of their products. +- We have shown that RAP delivers improved efficiency and quality for analysis. RAP forms a core part of digital transformation and supports the delivery of other government initiatives. + +## Government open source guidance + +One of the key RAP principles is transparency - sharing and reusing code. 
This speeds up the development of analytical processes, as you're not continually reinventing the wheel. If an existing solution exists elsewhere, why write one again from scratch? + +In 2017, the government released guidance on this in a document called [When code should be open or closed](https://www.gov.uk/government/publications/open-source-guidance/when-code-should-be-open-or-closed). Spoiler alert: it should almost always be open. + +**Key quotes:** + +- You should keep some data and code closed, including: keys and credentials, algorithms used to detect fraud, unreleased policy. +- **You should open all other code.** + +## Proposed NHS open source policy + +Colleagues in the NHS created a [proposed policy](https://github.com/nhsx/open-source-policy/blob/main/open-source-policy.md) to inform why, how and when staff across NHS England should publish their code openly. + +**Key quotes:** + +All new source code that we produce or commission should be open and reusable by default: such that anyone can freely access, use, modify, and share the relevant code for any purpose. +{: .pquote .pquote--blue .pquote--inline .pquote--serif} + +- It is almost impossible for staff across the NHS to communicate either best practice or effective solutions to common problems without a framework in place. Open source code offers that framework. +- Any open code can be reused by our developers to **reduce costs**, avoid duplication of effort, generally **increase staff efficiency**, make system changes more quickly and pursue the best approaches, not just those locally available. 
diff --git a/docs/introduction_to_RAP/history_of_RAP.md b/docs/introduction_to_RAP/history_of_RAP.md index bc5159b..bffcf7c 100644 --- a/docs/introduction_to_RAP/history_of_RAP.md +++ b/docs/introduction_to_RAP/history_of_RAP.md @@ -1,5 +1,13 @@ -# The History of RAP -We thought it was worth laying out some of the key moments in RAP history - thanks to colleagues from NHS England and the Government Data Science Community for filling in the gaps! +--- +title: The History of RAP + +tags: + - History + - Policy +--- + +# + ## Pre-RAP - The promotion of open code **2017** @@ -25,6 +33,7 @@ We thought it was worth laying out some of the key moments in RAP history - than * [RAP Companion](https://ukgovdatascience.github.io/rap_companion/) * [Introduction to RAP course by the Govt Analysis Function](https://analysisfunction.civilservice.gov.uk/training/introduction-to-reproducible-analytical-pipelines-rap/) * [ONS Data Science Campus - RAP journey training course](https://datasciencecampus.ons.gov.uk/capability/data-science-campus-faculty/reproducible-analytical-pipeline-journey/) + * **Matthew Upson** publishes his article [Why Government needs sustainable software too](https://www.software.ac.uk/blog/why-government-needs-sustainable-software-too) on the Software Sustainability Institute's website. **2020** : Flourishing of blog posts in 2020s, e.g. 
[Rappers Delight](https://dataingovernment.blog.gov.uk/2020/03/24/rappers-delight/) @@ -51,4 +60,4 @@ We thought it was worth laying out some of the key moments in RAP history - than **2023** : [ONS publishes their RAP implementation plan, followed by other departments](https://analysisfunction.civilservice.gov.uk/support/reproducible-analytical-pipelines/departmental-rap-plans/) -: The [Health RAP Playbook (Alpha)](https://nhsengland.github.io/Health-RAP-Playbook-Alpha/) is launched \ No newline at end of file +: The [Health RAP Playbook (Alpha)](https://nhsengland.github.io/Health-RAP-Playbook-Alpha/) is launched diff --git a/docs/introduction_to_RAP/levels_of_RAP.md b/docs/introduction_to_RAP/levels_of_RAP.md index 0f2d077..b822c2d 100644 --- a/docs/introduction_to_RAP/levels_of_RAP.md +++ b/docs/introduction_to_RAP/levels_of_RAP.md @@ -1,4 +1,12 @@ -# Levels of RAP +--- +title: Levels of RAP + +tags: + - Levels of RAP + - Preparing for RAP +--- + +# To help analyst teams implement RAP, we've created a maturity framework we call the 'levels of RAP'. diff --git a/docs/introduction_to_RAP/what-is-open-source.md b/docs/introduction_to_RAP/what-is-open-source.md index 8500fa5..065d08e 100644 --- a/docs/introduction_to_RAP/what-is-open-source.md +++ b/docs/introduction_to_RAP/what-is-open-source.md @@ -1,4 +1,11 @@ -# What is open source? +--- +title: What is open source? + +tags: + - Open-source +--- + +# !!! tip "TLDR" diff --git a/docs/introduction_to_RAP/what_is_RAP.md b/docs/introduction_to_RAP/what_is_RAP.md index f4f9286..0482f9e 100644 --- a/docs/introduction_to_RAP/what_is_RAP.md +++ b/docs/introduction_to_RAP/what_is_RAP.md @@ -1,146 +1,240 @@ -# What is RAP? +--- +title: What is RAP? -In a nutshell, a Reproducible Analytical Pipeline (RAP) is a system that automates the process of ingesting, processing, modelling, and reporting data. 
+tags: + - Automation + - Loose coupling + - Separation of concerns + - Transparency + - Open-source + - Version control + - Git + - Code reviews + - Testing +--- -But you’ll also hear the term “RAP” used to describe the principles and working practices that enable us to build pipelines in this way. +# -## The non-RAP way +!!! tip "TLDR" -To explain what RAP is, it might be useful to look at the current state of play that we are trying to move beyond with RAP. -Imagine a publication process that doesn’t fit the description in the section above. It might involve: + RAP is a way of creating analytical processes that are: -1. Running queries against a data store to retrieve, join, and clean the data -2. Exporting the data and importing it into another statistical or analysis package -3. Creating some statistics/outputs -4. Copying these to an Excel/Word/PDF report + - More efficient + - More robust + - More transparent -Of course, every publication will be different, and there will often be multiple strands of work going on that converge on the final report. But the gist is, a processes requiring a lot of manual steps, and usually using proprietary software. +RAP (Reproducible Analytical Pipelines) is a set of principles and working practices that help you create faster, more robust, and more transparent analytical processes. -This has several downsides such as: +You can learn more about the history of RAP and why it was devised here: [The History of RAP](history_of_RAP.md). -1. **Time**: manual steps are time consuming, not only to carry out, but because they introduce a point where further manual quality assurance (QA) is needed -2. **Potential for error**: manual steps introduce the possibility of human error -3. **Scalability**: If you need to add a new table or statistic to the publication, the process will take longer to complete +With RAP, we automate and streamline the process of ingesting, processing, modelling, and reporting data. 
-## RAP solves these problems (and others!) +## The RAP Journey -Imagine if the whole process described above, from retrieving data to creating the finished publication, could be completed in one click. That’s what RAP aims to do. +You can think of the implementing RAP like this: -In many cases, a RAP approach can: +![numbered illustration of the RAP process with the following images at each step. 1 - a straw hut. 2 - a blue print of a modern house. 3 - the fully built house. 4 - a pile of bricks from the house. 5 - other buildings that were built using the bricks](../images/the-rap-journey-long-with-text.png) -- Significantly speed up the process -- Reduce the risk of error by removing manual processing steps -- Increase robustness and improve QA through automated testing -- Increase transparency by making code public -- Create a modular, re-usable codebase that is easy to extend or modify -- Build a library of code that is easily shareable across projects +Many teams reach Step 3 and stop there, but it's important to continue on to Steps 4 and 5 - sharing and reusing code - to gain the full benefits of RAP. -We go into more detail on the benefits of RAP on our [“Why RAP is important”](./why_RAP_is_important.md) page. +## RAP Principles -Let’s now have a look at the tools, principles, and working practices used to achieve this. +The RAP way of working isn't fixed and will evolve in line with new technologies and practices. Below are some of the core RAP principles at this time: -## RAP Principles +### Automation + +Anything that you do manually is time consuming, costly and prone to human error. For example, say you have some data that you export from a database table into Excel, and then you use vlookups to pull some reference data into it from another file. In this case, why not upload the reference data to the database, and update your script to do this automatically? Then that's one job you'll never have to do again! 
+ +One of the first things you should do is identify any manual steps in your process, and automate them if possible. [Process mapping](../implementing_RAP/process_mapping.md) is a good way to do this. + +**Advantages:** -The RAP way of working isn't fixed and will evolve in line with new technologies and practices. But here are some of the core RAP principles at this time: +- **Faster processes**: Much faster to get the computer to do all the work! +- **More resource**: Free up analyst time to work on other things +- **Accuracy**: Reduces the risk of human error -## Modular, re-usable code +### Modular, re-usable code + +Code should be written in modular blocks that are essentially independent of each other. + +A non-software example might help explain this concept - imagine that your mouse was hard-wired into your computer. If you wanted a new one, it'd be a lot of work to change it, and a lot could go wrong in the process! As it is, your computer and your mouse are separate components with a standardised interface, meaning you can easily chop and change them. + +It's possible to write code in a similar way. This is sometimes called "loose coupling". ![image three overlapping circles with the words "Tight coupling" below them, next to three circles which do not overlap and the words "Loose coupling" below them](../images/tight-loose-coupling.png) -Code should be written in modular blocks that are essentially independent of each other. This is sometimes called "loose coupling". +Say you have process that imports some data, generates some counts, applies a suppression rule, and then exports the output. + +If you do this all in one script, we would say that these four processes are tightly coupled. They are not separate modules, and each part cannot run without the other parts. 
-There are many advantages of this: +A better way would be to move these four processes into separate [functions](../training_resources/python/python-functions.md), which would be contained and organised in a logical [project structure](../training_resources/python/project-structure-and-packaging.md): -- If a particular manipulation or calculation happens multiple times in a process, we can wrap it into a function and then call that function each time, rather than repeating the same bit of code in several places -- These blocks can then be re-used in other projects -- When processes are written in this way, it's much easier to "slot in" other people's code +* `import_data()` +* `generate_counts()` +* `apply_suppression()` +* `export_data()` -## Transparency – publishing code +Note that each function just does one thing - this is called "separation of concerns". + +Then we simply pass the output of each function to the next. None of the functions depend on any other to work. For example, the `generate_counts()` function doesn't depend on `import_data()`. It doesn't even know that it exists. We could use a completely different `import_data()` function, or just pass a static CSV file. As long as it receives the data, it will work. So we say these functions are loosely coupled. + +**Advantages:** + +- **Avoid repetition**: For example if you need to suppress counts in multiple places in your script, you can just call the function each time - no need to copy and paste the same code in several places +- **Re-usability**: Easier to re-use code in other work, speeding up the development of future projects +- **Integration**: Easier to "slot in" other people's functions - no need to reinvent the wheel if a solution already exists! 
+- **Faster on-boarding**: Code is easier to understand, so new users can get up to speed faster +- **Easier to test**: Testing is much easier to implement, improving the robustness and accuracy of the process +- **Load balancing**: In cloud architecture, it's easier to scale out computing resources for intensive modules and scale back in for the lighter ones + +### Transparency + +The Government's [Digital Service Standard](https://www.gov.uk/service-manual/service-standard) 12th principle states that all publicly funded code should be open, reusable and available under appropriate licences. To achieve this transparency, we can save our code to public-facing repositories like GitHub. ![image of some python code with an arrow going from the code and pointing to an image of a cloud](../images/publishing-code.png) -Publishing code is not appropriate in all situations (where it can affect a system’s security, for instance), and precautions must be taken to ensure that no sensitive information is published. But in many cases, there is no risk to publishing the code used to produce a publication. +Learn more in our guide on [publishing your code](../implementing_RAP/how-to-publish-your-code-in-the-open.md). 
-By making our code public, we: +**Advantages:** -- Improve public trust through transparency of our processes -- Enable more people to check code, report mistakes, and offer suggestions -- Help other teams improve their own processes by re-using our code +- **Improve public trust**: through transparency of our processes +- **Identify mistakes**: More eyes on the code makes it more likely that mistakes will be spotted +- **Feedback**: Readers of the code are able to suggest and make improvements to it +- **Collaboration**: Increases collaboration and knowledge sharing +- **Faster on-boarding**: No time is wasted on requesting permissions or access to code repositories +- **Consistent standards**: Easier to share and align on standards across the health sector +- **Code quality**: Knowing that your code will be published, tends to improve the quality of the code that you write +- **Re-usability**: Help other teams, internal and external, improve their processes by re-using our code +- **Alignment with government policy**: A [number] of [government] [policy] [documents] and [reviews] are asking for this -## Use open-source tools +### Use open-source tools ![image showing the logos for Python, R, and Apache Spark](../images/python-r-spark-logos.PNG) -Rather than proprietary software packages, use open-source software and programming languages like Python and R. Open-source basically means it's freely available for anyone to use. This brings several advantages: +Rather than proprietary software packages, use open-source software and programming languages like Python and R. Open-source basically means it's freely available for anyone to use. + +**Advantages**: - **Free**: open-source tools are completely free to use - **Support**: they tend to have a very active community of helpful people to seek advice and support from - **Reusability**: anyone can run open-source code, you don’t need to be a fellow user of a proprietary system. 
This makes it easier to share work across teams and organisations -- **Other people's code**: Reusability goes both ways! We can take advantage of the libraries, packages, and other code made by others +- **Use other people's code**: Reusability goes both ways! We can take advantage of the libraries, packages, and other code made by others - **Flexibility**: open-source programming languages can cover many bases – data connections, querying, processing, producing statistics, creating visualisations, building reports, and so on. This reduces the number of points where data is moved from one system to another -## Version control +### Version control + +When you're working with code, you'll often want to keep track of the changes you make. That way, you can always go back to your previous version if your current changes don't work out. Doing this manually can lead to situations like the following: + +![image of a woman working at a computer looking frustrated, with documents flying through the air](../images/manual_version_control.jpg) -![image showing the git logo](../images/git-logo.png) +Version control software, such as Git, fixes this. It enables you to keep a detailed history of code changes, and easily roll back to previous versions. It also makes for easy collaboration, even when multiple people are working on the same file. -Version control systems have become essential to anyone who writes code as part of their job. Where there’s code, there’s version control, and a program called “Git” is the most popular tool, although there are others. +Version control systems have become essential to anyone who writes code as part of their job. If you need some help getting started, have a look at our [git guidance](../training_resources/git/introduction-to-git.md). 
-Version control systems: +**Advantages:** -- Create an audit trail of changes made to the process, logging what changes were made when and by whom -- Enable users to revert changes, or roll back to previous versions, such as when a bug is found -- Make collaboration on coding projects much easier -- Make it easier to peer-review code for quality and reliability -- Enable code to be shared more easily, using cloud services like GitHub or GitLab. -- Enable us to automate certain parts of the development workflow, for example, auto-formatting code to make sure it is written in a consistent +- **Auditability**: Create an audit trail of changes made, logging what changes were made when and by whom +- **Roll backs**: Enable users to revert changes, or roll back to previous versions +- **Enhanced collaboration**: Makes collaboration on coding projects much easier +- **Code quality**: Makes it easier to peer-review code for quality and reliability +- **Transparency**: Enables code to be shared more easily, using cloud services like GitHub or GitLab. +- **Automation**: Automate parts of the development workflow, for example, running tests or auto-formatting code -## Good coding practices +### Good coding practices -In RAP projects we should aim to write high-quality code that follows a logical structure, and is well-commented and documented. This makes it easier to: +In RAP projects we should aim to write high-quality code that follows a logical structure, and is well-commented and documented. You can learn more about good coding practices in our [Python programming guide](../training_resources/python/intro-to-python.md) and our [refactoring guide](../training_resources/refactoring-guide.md). 
-- Read and understand the code – not just for third parties but also new team members -- Extend and modify the process -- Share our work with other teams -- Integrate code written by other teams into ours, should they have a better way of doing something +**Advantages:** -## Testing +- **Readability**: Easier to read and understand the code – not just for third parties but also new team members +- **Extensibility**: Easier to extend and modify the process +- **Integration**: Easier to integrate code written by other people + +### Testing + +Testing in a coding context means setting up automated procedures that check the code to find any mistakes, making sure it works well and does what it's supposed to do. ![illustration of a computer monitor with a magnifying glass over it, revealing a bug](../images/testing.jpg) -To help ensure the robustness and accuracy of the pipeline, we can set up automated tests within the code. +This is one reason we should write code in modular functions. Each function will have a singular purpose, so we can test them in isolation, and when working together as a whole. If you find it difficult to know where to start in writing tests for your project, that may be a sign that your code isn't modular enough (see the section above on [writing modular code](#modular-re-usable-code)) -Each function within the code should have its own unit test, to make sure it does what we expect it to do. Our unit tests can include a range of different input values, including edge cases, and some things we might not expect it to receive (such as sending text to a function that processes numbers). This is another reason to write code in the modular blocks described earlier - they are easier to test. +The key here is that the tests are automated. In one click we can test the entire codebase. It's like a safety net, helping us feel confident that changes to one area of the code haven't had a knock-on effect anywhere else. 
-Once we've completed the pipeline, we can also perform backtests. This is where we compare the results of the new pipeline with the known results from a previous run of the old process. In this way, we can make sure it's producing exactly the right output before we switch over. +To learn more, see our [guidance on testing](../training_resources/python/unit-testing.md). -## Peer Review +**Advantages**: -When changes are made to the code, these should almost always be reviewed by another team member, even if the new code is passing all the tests that have been created for it. +- **Spotting mistakes**: Testing helps find and fix errors or bugs +- **Enhanced reliability**: Ensures that the process operates reliably under different conditions +- **Accuracy**: By reducing the risk of errors, we help ensure that the figures we release are accurate +- **Time saving**: It is usually easier to identify and fix issues early in development -This is because even when code is working correctly, there may be a way to make the code more efficient, or to make it easier to read. Furthermore, the tests will need to be checked to ensure that they are truly testing what they are supposed to. +### Peer Review -This is another reason to use version control systems like Git and an accompanying cloud service like GitHub - they make the peer review and approval process much easier. +Peer review involves collaborative examination of code and outputs by team members to ensure quality, correctness, and adherence to best practices. -## The RAP Journey +![image of a man in an office at a desk, looking at code on his computer](../images/code_review.jpg) -You can think of the implementing RAP like this: +When changes are made to the code, these should almost always be reviewed by another team member - even if the new code is passing all the tests that have been created for it. 
+ +This is because even when code is working correctly, there may be a way to make the code more efficient, or to make it easier to read. Furthermore, the tests themselves should be checked to ensure that they are truly testing what they are supposed to. + +This is another reason to use version control systems like Git and an accompanying cloud service like GitHub - they make the peer review and approval process much easier. + +Learn more in our page on [code reviews](../implementing_RAP/code-review.md). The page on [publishing your code](../implementing_RAP/how-to-publish-your-code-in-the-open.md) also describes a recommended code review process when making your code public. + +**Advantages**: + +- **Accuracy**: Increases accuracy and performance of code +- **Identify mistakes**: Reduces risk of error +- **Code quality**: Ensure code is written to a high standard +- **Training**: Useful development exercise to help team members learn good practices + +## The non-RAP way + +Let's compare this to the current state of play that we are trying to move beyond with RAP. Imagine a publication process that doesn’t use the principles above. It might involve: + +1. Manually running queries against a data store to retrieve, join, and clean the data +2. Exporting the data and manually importing it into another statistical or analysis package +3. Creating some statistics/outputs +4. Manually copying these to an Excel/Word/PDF report + +Of course, every publication will be different, and there will often be multiple strands of work going on that converge on the final report. But the gist is a process requiring a lot of manual steps, and usually using proprietary software. -![numbered illustration of the RAP process with the following images at each step. 1 - a straw hut. 2 - a blue print of a modern house. 3 - the fully built house. 4 - a pile of bricks from the house. 
5 - other buildings that were built using the bricks](../images/rap-building-blocks.png) +This has several downsides such as: + +1. **Time consuming**: manual steps are time consuming, not only to carry out, but because they introduce a point where further manual quality assurance (QA) is needed +2. **Potential for error**: Fiddly manual steps introduce the possibility of human error. Lack of tests means we can't be completely sure how future changes affect the old code +3. **Scalability**: If you need to add a new table or statistic to the publication, the process will take longer to complete +4. **Difficult to extend**: Code written in a non-modular way using proprietary systems is harder to extend when changes need to be made + +## RAP solves these problems (and others!) + +Imagine if the whole process described above, from retrieving data to creating the finished publication, could be completed in one click. That’s what RAP aims to do. + +In many cases, a RAP approach can: -These stages represent: +- Significantly speed up the process +- Reduce the risk of error by removing manual processing steps +- Increase robustness and improve QA through automated testing +- Increase transparency by making code public +- Create a modular, re-usable codebase that is easy to extend or modify +- Build a library of code that is easily shareable across projects -1. The existing manual pipeline -2. [Create a blueprint](../implementing_RAP/process_mapping.md) for the new RAP-ified pipeline -3. Build the new pipeline -4. Identify reusable components -5. Build future pipelines with the help of these reusable compenents +We'll go into more detail on the benefits of RAP on our [“Why RAP is important”](./why_RAP_is_important.md) page. ## It’s not all or nothing! -It’s not necessary to make all these changes at once, or at all! +It’s not necessary to make all these changes at once! You can move along the RAP journey in stages, and you’ll see benefits with each step. 
Start simple and build up over time. -We have a page on [Levels of RAP](./levels_of_RAP.md) which explains more about this, and some guides on the skills and training a team might need along the journey. +We have a page on [Levels of RAP](./levels_of_RAP.md) which explains more about this, with three levels - Baseline, Silver, and Gold. Successful implementation of RAP often begins by starting with Baseline and building up from there. In the beginning, you’ll probably be best focusing on areas that reduce manual work for your analysts, freeing up their time so they can focus on other things. + +[number]: https://www.gov.uk/government/publications/data-saves-lives-reshaping-health-and-social-care-with-data/ +[government]: https://www.gov.uk/government/publications/secure-data-environment-policy-guidelines/ +[documents]: https://analysisfunction.civilservice.gov.uk/policy-store/reproducible-analytical-pipelines-strategy/ +[reviews]: https://www.gov.uk/government/publications/better-broader-safer-using-health-data-for-research-and-analysis/better-broader-safer-using-health-data-for-research-and-analysis/ +[policy]: https://www.gov.uk/government/publications/better-broader-safer-using-health-data-for-research-and-analysis/better-broader-safer-using-health-data-for-research-and-analysis/ diff --git a/docs/introduction_to_RAP/why_RAP_is_important.md b/docs/introduction_to_RAP/why_RAP_is_important.md index 9914a1d..5db5357 100644 --- a/docs/introduction_to_RAP/why_RAP_is_important.md +++ b/docs/introduction_to_RAP/why_RAP_is_important.md @@ -1,5 +1,12 @@ -# Why RAP is important +--- +title: Why RAP is important +tags: + - Benefits of RAP + - Policy +--- + +# Scientific research has increasingly relied on code to conduct complex statistical analyses in recent years. As code has become ubiquitous, new ways of working have emerged to ensure that scientific findings are rigorous and reproducible. 
These practices have been applied to government analytic work under the banner of **Reproducible Analytical Pipelines (RAP)**. RAPs bring together a number of good practices to help ensure all published statistics meet the highest standards of transparency and reproducibility. diff --git a/docs/our_RAP_service/building_team_capability.md b/docs/our_RAP_service/building_team_capability.md index 8c1683c..72d8b3e 100644 --- a/docs/our_RAP_service/building_team_capability.md +++ b/docs/our_RAP_service/building_team_capability.md @@ -1,4 +1,11 @@ -# How to prepare my team for RAP +--- +title: How to prepare my team for RAP + +tags: + - Preparing for RAP +--- + +# [![Comic image of team chat](https://imgs.xkcd.com/comics/team_chat.png)](https://xkcd.com/1782) diff --git a/docs/our_RAP_service/index.md b/docs/our_RAP_service/index.md index 53803ba..edfe6eb 100644 --- a/docs/our_RAP_service/index.md +++ b/docs/our_RAP_service/index.md @@ -1,4 +1,11 @@ -# Our RAP Service +--- +title: Our RAP Service + +tags: + - RAP engagements +--- + +# Following the recommendations in the [Overcoming Barriers to RAP](https://osr.statisticsauthority.gov.uk/publication/reproducible-analytical-pipelines-overcoming-barriers-to-adoption/) report, we have set up a central RAP team to coordinate efforts and set standards across NHS Digital. diff --git a/docs/our_RAP_service/programme-level-reporting.md b/docs/our_RAP_service/programme-level-reporting.md index b077bbb..e8a9efb 100644 --- a/docs/our_RAP_service/programme-level-reporting.md +++ b/docs/our_RAP_service/programme-level-reporting.md @@ -1,4 +1,11 @@ -# Programme level reporting +--- +title: Programme level reporting + +tags: + - RAP engagements +--- + +# The RAP team at NHS England offers a service to support teams to reach their RAP goals. We try to avoid being put in situations where we are chasing teams to complete RAP work - the dynamic works better if teams choose to approach us. 
Nevertheless, it is important for us to track the engagements that we take on so that we can account for our work. diff --git a/docs/our_RAP_service/rap-pre-engagement-questionnaire.md b/docs/our_RAP_service/rap-pre-engagement-questionnaire.md index 8fd2c74..079b4aa 100644 --- a/docs/our_RAP_service/rap-pre-engagement-questionnaire.md +++ b/docs/our_RAP_service/rap-pre-engagement-questionnaire.md @@ -1,4 +1,12 @@ -# RAP Pre-engagement Questionnaire +--- +title: RAP Pre-engagement Questionnaire + +tags: + - Preparing for RAP + - RAP engagements +--- + +# !!! tip "TLDR" - These questions will help you plan out your RAP engagement. diff --git a/docs/our_RAP_service/service-design-and-user-research.md b/docs/our_RAP_service/service-design-and-user-research.md index 004390f..8793ec3 100644 --- a/docs/our_RAP_service/service-design-and-user-research.md +++ b/docs/our_RAP_service/service-design-and-user-research.md @@ -1,4 +1,9 @@ -# Designing a RAP service +--- +title: Designing a RAP service + +tags: + - RAP engagements +--- As the roll-out of RAP has progressed in NHS Digital, we have come to recognise that we are offering a service. This is quite a change in direction for a team that previously was focused on development. We've had to quickly reframe our approach to rolling out RAP in NHS England to reflect this change. @@ -144,6 +149,8 @@ Thank your participant for sharing their experience, knowledge, and time with yo --- +# + **Note-taking tips** It is important to make a record of the conversation that takes place. 
diff --git a/docs/our_RAP_service/support-models.md b/docs/our_RAP_service/support-models.md index 4391a9a..a7f93f1 100644 --- a/docs/our_RAP_service/support-models.md +++ b/docs/our_RAP_service/support-models.md @@ -1,4 +1,12 @@ -# RAP squad support models +--- +title: RAP squad support models + +tags: + - RAP engagements + - RAP champions +--- + +# Every team approaches RAP from a different starting position - different BAU pressures, different team make-up and coding proficiency, different delivery cadence. We have spent a lot of time trying to figure out how best to support teams to engage with RAP. One of our main take-aways is that the support model of the RAP team should be tailored to the context of the team being supported. diff --git a/docs/our_RAP_service/thin-slice-strategy.md b/docs/our_RAP_service/thin-slice-strategy.md index eddd262..9bf3e31 100644 --- a/docs/our_RAP_service/thin-slice-strategy.md +++ b/docs/our_RAP_service/thin-slice-strategy.md @@ -1,16 +1,32 @@ -# Thin-slice strategy for building RAP pipelines +--- +title: Thin-slice strategy for building pipelines + +tags: + - Implementing RAP + - Thin slice strategy + - Preparing for RAP + - RAP project management +--- + +!!! tip "TLDR" + The thin slice strategy consists of two stages, creating a version of the pipeline that outputs a single component and then expanding it to cover the entire product. + + - **Stage 1:** mapping out the current pipeline, identifying the smallest output and working to replicate it using open source methods. + - **Stage 2:** From that, the re-useable components are identified and the thin-slice is scaled out to cover the entire output. 
+ - **Benefits:** This process allows your team to identify pipeline issues early in the project while progressing quickly on the project, obtaining small wins while building up comprehensive understanding of the pipeline + -The RAP team at NHS England often adopts a 'thin-slice' strategy when we work with teams to migrate legacy pipelines. This guide explains the rationale behind the approach and gives a rough outline of how it plays out in practice +The RAP team at NHS England often adopts a 'thin-slice' strategy for migrating legacy pipelines. -This approach is inspired by DevOps practices in software development where rather than tackling the whole project at once, you build the smallest possible bit of functionality end-to-end. +Inspired by DevOps practices in software development, the 'thin-slice' strategy involves building the smallest functional component end-to-end rather than tackling the whole project at once. -By building something end-to-end you get a lot of benefits: +By building something end-to-end you get a lot of benefits such as: -- You get to quickly demonstrate progress - good for morale and convincing leaders to keep the project going +- **Quick Progress Demonstration:** The approach allows for quick demonstration of progress, boosting morale and convincing leaders to support the ongoing project. -- You get to identify major problems at the beginning of the project - e.g. the database doesn't work or the firewall is blocking access. By uncovering these things early you are more likely to be able to solve them and avoid unpleasant surprises right before the end +- **Early Issue Identification:** Building the end-to-end process helps identify major problems at the project's outset, such as database issues or the firewall blocking access, enabling timely solutions and preventing last-minute surprises. -- You get to understand the shape of the problem. When you first start on a project, you don't know what you are doing. 
If you work through the project in a linear sequence - then you finally understand how all the parts fit together right at the very end. That leads to poor design. By building something end-to-end you can see how all the parts fit together and make radical improvements and iterations at very low cost. +- **Comprehensive Problem Understanding:** When working through the project in a linear sequence, you may not understand how all parts fit together until the very end, leading to poor pipeline design. Building end-to-end, you can identify re-usable parts, make radical improvements and iterations at very low cost. This strategy is particularly relevant for the work we are trying to do here. We are trying to achieve a number of competing goals and the thin-slice helps us to mediate between them. For example: @@ -26,62 +42,45 @@ Depending on which of these you want to prioritise - do it quickly, do it well, ## Thin-slice pipeline -In reality, we want some strategy that enables us to mediate between these competing demands to achieve all three outcomes - a high-quality pipeline that is developed quickly and with analysts feeling like they have improved their skills each time. - -The thin-slice strategy is the best way I've come up with to do this. It fits well with the support model the RAP team offers - where we work alongside a team who understand the data well. The typical workflow looks something like this: - -1. **Minimal outputs.** We identify some minimal outputs that we will try to replicate for the thin-slice. For example, we will reproduce the national numerator and denominator for one measure. - -2. **Reverse engineer the process.** We look at the existing code and work backwards to identify the minimal input data needed to calculate those outputs. - -3. **Replicate the target outputs.** We try to replicate those outputs as quickly as possible. At this phase we don't focus on code quality. Instead, we try to understand the logic of the process. 
The goal is just to recreate those numbers. Since we are only dealing with a small subset of the overall publication, we avoid getting totally swamped in complexity. Nevertheless - even this simplified thin-slice can be a substantial challenge. This phase forces us to grapple with a lot of the complexities of the process - e.g., funny joins, complex derivations, or logic that is spread across multiple sections of code. This is the most uncertain part of the process - but that is the whole point. We want to tackle the hard stuff up front. - -4. **Refactor and improve.** - Once we can accurately calculate the target outputs, we move into a more interesting, iterative mode. We step back and review the end-to-end data flow. We ask ourselves how it can be simplified or reorganised to be easier to maintain. We look for sections of code that could be made into reusable functions. We set up unit tests and regression tests where appropriate. We discuss on style and naming conventions. Eventually the whole team converges on a consensus. +In reality, we want some strategy that enables us to mediate between these competing demands to achieve all three outcomes - a high-quality pipeline that is developed quickly and with analysts feeling like they understand and own it. - - This **design phase** is iterative. We might review and improve the code several times before we decide to move forward. We often draw the analogy with writing the text for a publication. You would not publish the first draft of your publication - instead you go through several rounds of review and rewriting. Why would we treat our code any differently? +The thin-slice strategy balances these outcomes and fits well with the support model the RAP team offers - where we work alongside a team who understand the data well. The typical workflow looks something like this: - - In short - we give ourselves the time and space to make this thin-slice as good as we can possibly make it. 
Because this is such a small sliver of the overall project, it does not take long to make these improvements. Usually you only need to change a tens of lines of code rather than thousands. +### **Stage 1** +| Phases | Description | +| :------------: | ---------- | +| **Map out the processes** ![Treasure Map](../images/clipart_map.png)| Visually map out the current process by dissecting the existing code, identifying each step in the pipeline. Here we can pinpoint redundant elements of the process, obtaining quick wins when it comes to optimising our pipeline. Have a look at our [Process Mapping](../implementing_RAP/process_mapping.md) page for some guidance on how to achieve this. | +| **Identify the minimal outputs** ![Network with smallest element highlighted](../images/network.png)| Identify the minimal outputs that we will be replicating for the thin-slice. For example, we will reproduce the national numerator and denominator for one measure. | +| **Replicate the target minimal output** ![Image showing a document being duplicated](../images/copy_image.png)| Replicate those outputs as quickly as possible, focusing more on obtaining the numerical output than writing the most optimal code, getting to grips with the logic of the process. Since we are only dealing with a small subset of the overall publication, we avoid getting overwhelmed with the nuances of the pipeline.

This phase forces us to grapple with a lot of the complexities of the process - e.g., funny joins, complex derivations, or logic that is spread across multiple sections of code. By grappling with the tough parts early, we pave the way for a more efficient and robust development process.| - - By doing this design phase collaboratively, we gain a few very important benefits: +!!! tip + While replicating the minimal output we have regular talks with stakeholders to check this is following their needs and providing what is necessary. - - First, it gives both teams a chance to contribute different elements. The publication team bring an expert understanding why the process works a certain way while the RAP team bring experience of similar processes across multiple projects. - - Next, because the publication team is actively contributing to overall design decisions, they end up owning the code. They understand every decision that went into the final makeup of the code. This is totally different to the situation where a super-advanced coder writes some opaque, complex code and then just throws it over the fence. In that situation you are terrified to change anything in case you break it. - - Finally, because we all do this together, we emerge from this process with a shared understanding of how we are going to write code. - - - Once our improvements to the thin-slice pipeline start to tail off, we should think about how we expand the thin-slice to include all of the other fields and breakdowns in the publication. - -5. **Scaling out from the thin-slice.** - We now move out of the thin-slice phase to start building out the rest of the pipeline/publication. We try to choose the next fields strategically. E.g., we tackle a breakdown (a group-by) next, then a complex derived field, and then a field that requires joining some reference data. Again - the goal is to drive out any tricky bits as early in the process as possible. 
- - Once we are sure that the whole team is working effectively, we add each of the remaining fields and breakdowns to the backlog. Analysts pick up these tickets and tackle them in sequence. - - The tickets are considered complete when the new field gives the same outputs as the historic outputs and when another analyst has completed peer review on the code. - - This phase tends to go remarkably quickly. This is because (1) we already tackled all the hard bits up front and (2) everyone is working from a high-quality template - the thin-slice code. This is also a really good phase for cementing the learning in the team since now each person is working individually but repeating the same logic multiple times as they add lots of fields. +### **Stage 2** +| Phases | Description | +| :------------: | ---------- | +| **Identify the re-useable components and improve the thin-slice** ![Magnifying Glass](../images/magnifying_glass.png) |Once the target output has been achieved, we can turn our focus to improving how this is done. We do this with the principles described in our [Refactoring Guide](../training_resources/refactoring-guide.md). As this is just a sliver of the overall project it does not take long, but provides an excellent baseline from which to build the rest of the project.

It is essential that this is done collaboratively between members of the RAP team and the publication team. The publication team bring an expert understanding of why the process works a certain way, while the RAP team bring experience of having worked with similar processes across multiple projects. Because the publication team are actively contributing to the decisions, they end up with a feeling of ownership over the code and an understanding of how it works. | +| **Scale out the thin slice** ![Single flower to Bouquet](../images/flowers.png) |We now start tackling the rest of the outputs by expanding the thin-slice. This would be done strategically, perhaps tackling a breakdown first, then a complex derived field and then a field which requires joining some reference data.

The remaining fields and breakdowns are added to the backlog, and analysts pick up these tickets in turn. Tickets are considered done when the new field gives the same value as the historic output and the code has been peer-reviewed by another analyst.

This phase tends to go remarkably quickly. This is because we already tackled all the hard bits up front and everyone is working from the high quality thin-slice template that they helped to build. | --- -## FAQs +# -- What is the difference between a 'thin-slice' and an MVP? - - - The thin-slice is the minimal piece of functionality that allows us to build an end-to-end pipeline. This early phase of development is helpful for training a team and improving code quality. The thin-slice is therefore motivated by helping the delivery team to deliver effectively. It does not represent something to be delivered to customers. - - By contrast, a minimal viable product (MVP) is the most basic thing that a team could deliver to customers. MVP has a really peculiar meaning in the context of a pipeline migration since the 'minimal' thing to be delivered will typically be the full publication. +## FAQs +??? question "What is the difference between a 'thin-slice' and a minimal viable product (MVP)?" + The thin-slice denotes the essential functionality required to construct a complete end-to-end pipeline. This initial development phase serves as a valuable training opportunity for the team and contributes to enhancing the overall code quality. It is driven by the goal of enabling the delivery team to work efficiently and is not intended as a deliverable for customers. -- What happens if rebuilding the pipeline uncovers errors in the existing pipeline (and hence published statistics)? + By contrast, an MVP is the most basic thing that a team could deliver to customers. MVP has a really peculiar meaning in the context of a pipeline migration since the 'minimal' thing to be delivered will typically be the full publication. - - It is a near certainty that some minor errors will be discovered as part of any migration. This is particularly the case for building RAP pipelines since you are adding additional tests and safeguards to spot errors. +??? 
question "What happens if rebuilding the pipeline uncovers errors in the existing pipeline (and hence published statistics)?" + It is a near certainty that some minor errors will be discovered as part of any migration. This is particularly the case for building RAP pipelines since you are adding additional tests and safeguards to spot errors. We consider this a natural and important opportunity to improve the publication. We sit down with the team and discuss where the issue may have arisen and how we can prevent it in the future. Likewise, if there is an opportunity to make a substantial improvement to an existing publication we address that in collaboration with the team. We do not hold too tight to the idea of replicating historical outputs but instead aim for a pragmatic approach. -- What happens if we discover more problems after we have moved out of the thin-slice phase? - - - You will almost certainly discover more problems after the thin-slice phase. Hopefully the high code quality that comes from the thin-slice phase will make it more easy to resolve the problem, e.g., by refactoring the code. - -- How would this strategy work in a situation where you were working on a brand new project, rather than migrating a legacy pipeline? +??? question "What happens if we discover more problems after we have moved out of the thin-slice phase?" + You will almost certainly discover more problems after the thin-slice phase. However, the high code quality that comes from the thin-slice phase and the team understanding of how the code works will make it easier to resolve the problem, e.g., by refactoring the code. - - The same logic would apply. By identifying a very small piece of functionality and trying to implement it end-to-end, you will get a better sense of the problem. +??? question "How would this strategy work in a situation where you were working on a brand new project, rather than migrating a legacy pipeline?" + The same logic would apply. 
By identifying a very small piece of functionality and trying to implement it end-to-end, you will get a better sense of the problem and how to fix it. diff --git a/docs/our_RAP_service/typical-engagement-flow.md b/docs/our_RAP_service/typical-engagement-flow.md index 6f370ba..ba34ce8 100644 --- a/docs/our_RAP_service/typical-engagement-flow.md +++ b/docs/our_RAP_service/typical-engagement-flow.md @@ -1,4 +1,11 @@ -# Typical Engagement Flow +--- +title: Typical Engagement Flow + +tags: + - RAP engagements +--- + +# Many people ask about what a typical engagement with the RAP team at NHS England looks like. Since every team is different and since the service is still in a formative state, there really is no typical engagement. diff --git a/docs/site_info/acknowledgements.md b/docs/site_info/acknowledgements.md index f1ecc8a..964ca07 100644 --- a/docs/site_info/acknowledgements.md +++ b/docs/site_info/acknowledgements.md @@ -1,4 +1,12 @@ -# Acknowledgements +--- +title: Acknowledgements + +tags: + - RAP champions + - RAP CoP website +--- + +# It's taken a lot of work to make the NHS England RAP Community of Practice and further the cause of RAP within NHS England more generally. diff --git a/docs/site_info/rap-release-workflow.md b/docs/site_info/rap-release-workflow.md index d6ea2fd..4162ed7 100644 --- a/docs/site_info/rap-release-workflow.md +++ b/docs/site_info/rap-release-workflow.md @@ -1,4 +1,11 @@ -# RAP Release Workflow +--- +title: RAP Release Workflow + +tags: + - RAP CoP website +--- + +# !!! 
tip "TLDR" diff --git a/docs/site_info/repo_traffic_information.md b/docs/site_info/repo_traffic_information.md index c552fb4..225565e 100644 --- a/docs/site_info/repo_traffic_information.md +++ b/docs/site_info/repo_traffic_information.md @@ -1,4 +1,11 @@ -# GitHub Repository Traffic Statistics +--- +title: GitHub Repository Traffic Statistics + +tags: + - RAP CoP website +--- + +# We store the files that make up this website in a public GitHub repository, [RAP Community of Practice](https://github.com/NHSDigital/rap-community-of-practice), and GitHub has a feature that allows you to view and analyse the traffic coming into your repository. We have compiled this information and displayed it for you below. @@ -8,4 +15,4 @@ Be sure to hover over the graphs for more information and click and drag to zoom ## Average Views by Day of the Week - \ No newline at end of file + diff --git a/docs/site_info/website_traffic_information.md b/docs/site_info/website_traffic_information.md index 8a74e68..f73e3d3 100644 --- a/docs/site_info/website_traffic_information.md +++ b/docs/site_info/website_traffic_information.md @@ -1,3 +1,11 @@ +--- +title: Website traffic information +tags: + - RAP CoP website +--- + +# + ## How we collect data We use [Usermaven](https://usermaven.com/) as a cookie-less web traffic analytics tool to anonymously track our website vistors. This data helps us understand how the site is being used and how we can improve it. No personally identifiable information is stored by us or Usermaven and it is GDPR and CCPA compliant. 
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 69c7540..8800b18 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -20,6 +20,24 @@ margin: 0 0 1.25em; } +.md-typeset h1.header-with-tags { + margin-bottom: 6px; +} + +.md-typeset div[role="doc-subtitle"] { + font-size: 1.1em; + font-style: italic; + line-height: 1.3rem; + margin-bottom: 15px; + color: #888; +} + + +.md-typeset table:not([class]) th { + font-size: .85rem; + padding: .9375em 1em; +} + .md-footer { background-color: #f6f8f8 !important; } @@ -96,6 +114,132 @@ html { scroll-behavior: smooth; } +.md-typeset h2 { + font-size: 1.6rem; + /* font-weight: bold; */ + margin-bottom: 0em; + line-height: 1.2rem; +} + +.md-typeset h3 { + font-size: 1.5rem; + margin-bottom: 0em; + line-height: 1.2rem; +} + iframe { border:none; -} \ No newline at end of file +} + +.md-typeset .md-tags { + margin-bottom: 30px; +} + +.md-typeset .md-tags a.md-tag { + border-radius: 0px; + background-color: #005eb8; + color: white; + border: 2px #005eb8 solid; + transition: 0.4s; +} + +.md-typeset .md-tags:focus a.md-tag { + background-color: #fff; +} + +.md-typeset .md-tags a.md-tag:hover { + color: #005eb8; + background-color: white; +} +.align-centre { + display: block; + text-align: center; + margin-left: auto; + margin-right: auto; +} + + +p.pquote{ + position: relative; + --padding: 0.6rem; + padding: var(--padding); + color: var(--text-color, black); + font-weight: 600; + font-style: italic; + background-color: var(--bg-color, white); + display: grid; + gap: 1rem; + background-size: 0.5rem 0.5rem; + border-radius: 0.25rem; + box-shadow: 0.3rem 0.3rem 1rem rgb(0 0 0 / .5) +} + +p.pquote::before{ + --qHeight: 2rem; + content: ""; + margin-left: calc(var(--padding) * -.15); + margin-top: calc(var(--padding) * -.15); + height: var(--qHeight); + width: calc(var(--qHeight) * 1.1); + background-image: + radial-gradient( + circle at bottom right, + transparent calc(var(--qHeight) / 4 - 
1px), + var(--accent-color, black) calc(var(--qHeight) / 4) calc(var(--qHeight) / 2), + transparent calc(var(--qHeight) / 2 + 1px) + ), + linear-gradient(var(--accent-color, black), var(--accent-color, black)); + background-size: calc(var(--qHeight) / 2) calc(var(--qHeight) / 2); + background-position: top left, bottom left; + background-repeat: space no-repeat +} + +p.pquote--card { + max-width: 25rem; + margin-inline: auto +} + +p.pquote--left { + max-width: 25rem; + text-align: left; + margin-inline: 0px +} + +p.pquote--card::before{ + --qHeight: 3rem; +} + +p.pquote--blue { + --bg-color: #41B6E6; + --accent-color: #003087; +} + +p.pquote--red { + --accent-color: red; +} + +p.pquote--serif{ + padding-top: 50px; +} + +p.pquote--serif::before{ + background: none; + content: "“"; + color: var(--accent-color); + display: block; + font-size: 4rem; + line-height: 4rem; + margin: none; + font-family: Serif; + font-style: normal; + position:absolute; + top: .4rem; + left: .4rem; +} + + +p.pquote--inline{ + padding-left: 3rem; + padding-top: 1.2rem; + padding-bottom: 1rem; +} diff --git a/docs/support.md b/docs/support.md index 720e6d6..8b78432 100644 --- a/docs/support.md +++ b/docs/support.md @@ -1,9 +1,13 @@ --- +title: Support + +tags: + - RAP engagements hide: - navigation --- -# Support +# If your team is embarking upon a RAP journey, you should understand [why RAP are important][1] and think about which [levels of RAP][2] that you want to aim for. diff --git a/docs/tags.md b/docs/tags.md new file mode 100644 index 0000000..45669e3 --- /dev/null +++ b/docs/tags.md @@ -0,0 +1,3 @@ +!!! info inline end + - Each page on this site is tagged + - Use this tag index to quickly find all the content we have on the topics you are most interested in! 
\ No newline at end of file diff --git a/docs/training_resources/R/README.md b/docs/training_resources/R/README.md index d567172..6d93909 100644 --- a/docs/training_resources/R/README.md +++ b/docs/training_resources/R/README.md @@ -1,9 +1,13 @@ --- -#hide table of contents whitespace +title: R + +tags: + - R + - R Studio hide: toc --- -# R +# We do not have many materials for R since the majority of our users use Python. diff --git a/docs/training_resources/R/git_with_RStudio.md b/docs/training_resources/R/git_with_RStudio.md index d129d02..d879074 100644 --- a/docs/training_resources/R/git_with_RStudio.md +++ b/docs/training_resources/R/git_with_RStudio.md @@ -1,5 +1,14 @@ -# Git with RStudio +--- +title: Git with RStudio +tags: + - R + - R Studio + - Git + - Version control +--- + +# ## Overview This page is intended as a starting point for R users who are new to using Git. It shows you the basic commands and the workflow to use Git on your own, in conjunction with using RStudio desktop or RStudio Cloud. @@ -290,4 +299,4 @@ If you are happy to merge the two branches, then select the type of Pull Request [3]: ../git/introduction-to-git.md#common-git-commands [4]: ../git/quick_start_guides/gitlab_quick_start_guide.md [5]: ../git/introduction-to-git.md#gitignore -[6]: ../git/quick_start_guides/gitlab_quick_start_guide.md \ No newline at end of file +[6]: ../git/quick_start_guides/gitlab_quick_start_guide.md diff --git a/docs/training_resources/git/git_walkthroughs/committing_work_walkthrough.md b/docs/training_resources/git/git_walkthroughs/committing_work_walkthrough.md index 61ff538..150499a 100644 --- a/docs/training_resources/git/git_walkthroughs/committing_work_walkthrough.md +++ b/docs/training_resources/git/git_walkthroughs/committing_work_walkthrough.md @@ -1,4 +1,12 @@ -# Committing Work +--- +title: Committing Work + +tags: + - Git + - Version control +--- + +# !!! 
tip "TLDR" diff --git a/docs/training_resources/git/git_walkthroughs/pull_and_merge_requests_walkthrough.md b/docs/training_resources/git/git_walkthroughs/pull_and_merge_requests_walkthrough.md index 6c2507f..3d38f5a 100644 --- a/docs/training_resources/git/git_walkthroughs/pull_and_merge_requests_walkthrough.md +++ b/docs/training_resources/git/git_walkthroughs/pull_and_merge_requests_walkthrough.md @@ -1,4 +1,12 @@ -# Pull and Merge Requests +--- +title: Pull and Merge Requests + +tags: + - Git + - Version control +--- + +# !!! tip "TLDR" diff --git a/docs/training_resources/git/git_walkthroughs/working_with_branches_walkthrough.md b/docs/training_resources/git/git_walkthroughs/working_with_branches_walkthrough.md index 2340567..79efc2b 100644 --- a/docs/training_resources/git/git_walkthroughs/working_with_branches_walkthrough.md +++ b/docs/training_resources/git/git_walkthroughs/working_with_branches_walkthrough.md @@ -1,4 +1,12 @@ -# Working with Branches +--- +title: Working with Branches + +tags: + - Git + - Version control +--- + +# !!! tip "TLDR" diff --git a/docs/training_resources/git/githooks.md b/docs/training_resources/git/githooks.md index 3249303..380eabf 100644 --- a/docs/training_resources/git/githooks.md +++ b/docs/training_resources/git/githooks.md @@ -1,4 +1,12 @@ -# Git Hooks +--- +title: Git Hooks + +tags: + - Git + - Version control +--- + +# !!! 
tip "TLDR" diff --git a/docs/training_resources/git/github_gitlab_terminology.md b/docs/training_resources/git/github_gitlab_terminology.md new file mode 100644 index 0000000..9e100bb --- /dev/null +++ b/docs/training_resources/git/github_gitlab_terminology.md @@ -0,0 +1,16 @@ +# GitHub and GitLab: Terminology + +[GitHub](/training_resources/git/quick_start_guides/github_quick_start_guide/) and [GitLab](/training_resources/git/quick_start_guides/gitlab_quick_start_guide/) are both web-based platforms designed to facilitate version control and collaboration on software development projects using the Git version control system. They differ mainly in terms of their feature sets, hosting options, and target audiences. + + +These differences also include the terminology used on each platform. Sometimes, especially if someone uses both GitLab and GitHub, they may mistakenly use a term common to one platform when talking about another and end up confusing those around them who are less familiar. We've put together a handy list of features both platforms offer under different names so users can easily understand the equivalent features between GitLab and GitHub, reducing confusion and facilitating smoother communication and collaboration across both platforms. + +| GitHub | GitLab | What does it mean? | +| -------- | ------- | ------- | +|**Pull request (PR)**|**Merge request (MR)**|This is a way of combining two branches - often a "working branch" and the "main" branch. It does this in a clever way that allows you to weave the changes into the original.| +|**Repository**|**Projects**|This is a code "repository" in which you will place the code for a single project / product / piece of work. 
It's probably the most important part of GitHub/Lab.| +|**Organisation**|**Groups**|These enable users to organise repositories, manage access permissions, and facilitate collaboration among teams or communities, albeit with different emphases and additional features.| +|**Collaborator**|**Member**|Users with explicit access to a repository or project, enabling them to contribute and manage content.| +|**Action**|**CI/CD**|This is a way of automating your codebase - i.e. you can make it so that after each pull-request, your code gets tested, or that it gets shipped out somewhere, or that it gets deployed to a production space. The sky is the limit!| +|**Package Registry**|**Packages & Registries**|This is a way of storing a compressed and ready-to-roll version of your codebase called a "package".| +|**Gists**|**Snippets**|Allows for the sharing of single files or snippets of code, often for demonstration or quick sharing purposes.| \ No newline at end of file diff --git a/docs/training_resources/git/introduction-to-git.md b/docs/training_resources/git/introduction-to-git.md index 76a1e15..9f07578 100644 --- a/docs/training_resources/git/introduction-to-git.md +++ b/docs/training_resources/git/introduction-to-git.md @@ -1,4 +1,14 @@ -# Introduction to Git +--- +title: Introduction to Git + +tags: + - Git + - Version control + - GitHub + - GitLab +--- + +# !!! tip "TLDR" diff --git a/docs/training_resources/git/making-code-discoverable.md b/docs/training_resources/git/making-code-discoverable.md index 0a512fd..7ae2d55 100644 --- a/docs/training_resources/git/making-code-discoverable.md +++ b/docs/training_resources/git/making-code-discoverable.md @@ -1,4 +1,13 @@ -# Making Code Discoverable using Github Topics. +--- +title: Making Code Discoverable using Github Topics. + +tags: + - Git + - Version control + - GitHub topics +--- + +# [comment]: <> (this is a mkdocs material style admonition - it will look better on the website) !!! 
tip "TLDR" @@ -55,4 +64,4 @@ When applying topics to your code: ## Using topics to find useful repos (and code) -You can search for repos by topic within github using the search bar (e.g., [as seen here](https://github.com/search?q=topic%3Anhs&type=repositories), with tips on github search syntax [here](https://docs.github.com/en/search-github/github-code-search/understanding-github-code-search-syntax)) or [you can use this helpful website](https://nhsengland.github.io/open-health-statistics/github-topics.html) which gathers the repos and topics from the various NHS organisations on GitHub. \ No newline at end of file +You can search for repos by topic within github using the search bar (e.g., [as seen here](https://github.com/search?q=topic%3Anhs&type=repositories), with tips on github search syntax [here](https://docs.github.com/en/search-github/github-code-search/understanding-github-code-search-syntax)) or [you can use this helpful website](https://nhsengland.github.io/open-health-statistics/github-topics.html) which gathers the repos and topics from the various NHS organisations on GitHub. diff --git a/docs/training_resources/git/quick_start_guides/git_quick_start_guide.md b/docs/training_resources/git/quick_start_guides/git_quick_start_guide.md index 5126b64..6411273 100644 --- a/docs/training_resources/git/quick_start_guides/git_quick_start_guide.md +++ b/docs/training_resources/git/quick_start_guides/git_quick_start_guide.md @@ -1,4 +1,12 @@ -# Git Quick Start Guide +--- +title: Git Quick Start Guide + +tags: + - Git + - Version control +--- + +# !!! 
tip "TLDR" diff --git a/docs/training_resources/git/quick_start_guides/github_quick_start_guide.md b/docs/training_resources/git/quick_start_guides/github_quick_start_guide.md index e6e65d8..aeb76a5 100644 --- a/docs/training_resources/git/quick_start_guides/github_quick_start_guide.md +++ b/docs/training_resources/git/quick_start_guides/github_quick_start_guide.md @@ -1,4 +1,13 @@ -# GitHub Quick Start Guide +--- +title: GitHub Quick Start Guide + +tags: + - Git + - Version control + - GitHub +--- + +# !!! tip "TLDR" diff --git a/docs/training_resources/git/quick_start_guides/gitlab_quick_start_guide.md b/docs/training_resources/git/quick_start_guides/gitlab_quick_start_guide.md index 1fa861f..32850f6 100644 --- a/docs/training_resources/git/quick_start_guides/gitlab_quick_start_guide.md +++ b/docs/training_resources/git/quick_start_guides/gitlab_quick_start_guide.md @@ -1,4 +1,13 @@ -# GitLab Quick Start Guide +--- +title: GitLab Quick Start Guide + +tags: + - Git + - Version control + - GitLab +--- + +# !!! info diff --git a/docs/training_resources/git/using-git-collaboratively.md b/docs/training_resources/git/using-git-collaboratively.md index c4a41ea..f705d95 100644 --- a/docs/training_resources/git/using-git-collaboratively.md +++ b/docs/training_resources/git/using-git-collaboratively.md @@ -1,4 +1,12 @@ -# Using Git Collaboratively +--- +title: Using Git Collaboratively + +tags: + - Git + - Version control +--- + +# !!! 
tip "TLDR" diff --git a/docs/training_resources/pyspark/logging-and-error-handling.md b/docs/training_resources/pyspark/logging-and-error-handling.md index 7c4a3a0..a422a32 100644 --- a/docs/training_resources/pyspark/logging-and-error-handling.md +++ b/docs/training_resources/pyspark/logging-and-error-handling.md @@ -1,9 +1,20 @@ --- -#hide table of contents whitespace -hide: toc +title: Logging and error handling in PySpark + +tags: + - PySpark + - Logging + - Error handling + - Spark + - Coding tips + +hide: + - toc --- -# Logging and error handling in PySpark +# + + !!! Info See our documentation on [logging and error handling in Python][1] to find out more. diff --git a/docs/training_resources/pyspark/pyspark-style-guide.md b/docs/training_resources/pyspark/pyspark-style-guide.md index bff7602..b95040e 100644 --- a/docs/training_resources/pyspark/pyspark-style-guide.md +++ b/docs/training_resources/pyspark/pyspark-style-guide.md @@ -1,4 +1,18 @@ -# PySpark style guide +--- +title: PySpark style guide + +tags: + - PySpark + - Spark + - Coding tips + - Code style + - SQL + +hide: + - toc +--- + +# ## Introduction @@ -230,11 +244,7 @@ def group_by_and_count_column(data: df, column_name: str): Returns: groups from column and count “”” - # Group by CCG Code and count number of records per CCG - df_count = df.groupBy(df.column_name).count() - # sort by CCG Code descending order - result = df_count.sort(desc("count")) return result ``` diff --git a/docs/training_resources/python/backtesting.md b/docs/training_resources/python/backtesting.md index b10b595..77c0258 100644 --- a/docs/training_resources/python/backtesting.md +++ b/docs/training_resources/python/backtesting.md @@ -1,5 +1,13 @@ -# Back testing +--- +title: Back testing +tags: + - Testing + - Backtesting + - Quality assurance +--- + +# ## What is back testing and why do I care? 
Now that you are writing code in a reproducible manner, and perhaps using Python instead of another language, it is important that the code still produces the same results as the old code. Mistakes can easily be made in translating from one code base to another. diff --git a/docs/training_resources/python/basic-python-data-analysis-operations.md b/docs/training_resources/python/basic-python-data-analysis-operations.md index 9fbf681..60411ce 100644 --- a/docs/training_resources/python/basic-python-data-analysis-operations.md +++ b/docs/training_resources/python/basic-python-data-analysis-operations.md @@ -1,4 +1,14 @@ -# Basic Python Data Analysis operations +--- +title: Basic Python Data Analysis operations + +tags: + - Python + - Data analysis + - Pandas + - Numpy +--- + +# Python offers many ways to achieve multiple calculations, computations and operations. For data analysis and data science overall, [Pandas](https://pandas.pydata.org/) is the most commonly used package or library to perform these operations, along with [NumPy](https://numpy.org/). @@ -18,8 +28,6 @@ df - dataframe ```py df = pd.read_csv('your_file.csv') -# or if required to edit headers for example: - df = pd.read_csv('your_file.csv', header=..., na_values=..., sep=..., etc) ``` @@ -76,8 +84,6 @@ You will soon notice after importing your data from the .sav file that the colum ```py df.columns = df.columns.str.lower() -# or -df.columns = df.columns.str.upper() ``` ### Extracting the required columns @@ -85,11 +91,7 @@ df.columns = df.columns.str.upper() To select a column: ```py -# columns to keep -to_keep = ["column 1", "column 2", "column 3", ...] 
-# create the new table -filtered_df = df[to_keep] ``` ### Filter where a variable is not null/missing @@ -97,38 +99,24 @@ filtered_df = df[to_keep] To filter rows based on some specific criteria: ```py -# not null -new_df = df[df["my_column"].notnull()] ``` ### Joins ```py -# a left join, one column to join on -joined_df = df.merge(other_df, how="left", on="my_column") -# inner join, on multiple columns -joined_df = df.merge(other_df, how="inner", on=["column 1", "column 2"]) ``` ### Add a new column ```py -# create new table with a new column that adds 5 to each value of another selected column -new_df = df.assign(new_column=df["my column"] + 5) ``` ### Sorting variables ```py -# ascending order can be False or True -df.sort_values(by="my column", ascending=False) -# if you want to see missing values first, assign na_position -df.sort_values(by="my column", ascending=False, na_position="first") -# sort by multiple columns -df.sort_values(by=["my column 1", "my column 2", ...]) ``` ### Transposing columns @@ -136,14 +124,8 @@ df.sort_values(by=["my column 1", "my column 2", ...]) There's a few ways to transpose columns: ```py -# set the index of columns -df.set_index(["my column 1", "my column 2", "my column 3", ...], inplace=True) -# using pandas transpose to transpose rows with columns and vice versa -df_transposed = df.T -# using pandas stack() to transpose non-index columns into a single new column -df = df.stack().reset_index() ``` To set the name of the axis for the index or columns you can use `rename_axis()`: @@ -155,16 +137,10 @@ df = df.stack().rename_axis().reset_index() ### Grouping by variables ```py -# group by one column -new_df = df.groupby("my_column") - -# group by multiple columns # list of columns to group by grouped = ["column 1", "column 2", "column 3", ...]] -# return new table with grouped columns -new_df = df.groupby(grouped) ``` ### Aggregations @@ -178,12 +154,8 @@ new_df = df.groupby(grouped).agg(total_sum=("column to be summarised", 
"sum"), t ### Creating totals per row and per column ```py -# total per column, adds a new row "Column Total" -# this will sum all numeric row values for each column df.loc["Column Total"] = df.sum(numeric_only=True, axis=0) -# total per row, creates a new column "Row Total" -# this will sum all numeric column values for each row df.loc[:, "Row Total"] = df.sum(numeric_only=True, axis=1) ``` @@ -192,14 +164,8 @@ df.loc[:, "Row Total"] = df.sum(numeric_only=True, axis=1) When creating different aggregations/groupings which are saved in different dataframes, you can then combine these aggregations into one table. For example, suppose you have calculated the totals for age and gender in different dataframes and you wish to append these results to the final output dataframe. ```py -# list the final output dataframe to store its aggregations -list_df = [df] -# append the calculated totals -list_df.append(calc_totals_df) -# concatenate into a single dataframe -output_df = pd.concat(list_df, axis=0) ``` ### Creating derivations @@ -207,8 +173,6 @@ output_df = pd.concat(list_df, axis=0) To create a derivation based on the equivalent CASE WHEN SQL operation, there are several ways to do this in python: ```py -# pandas package CASE WHEN -# create the age 11 to 15 derivation df.loc[df["age"] < 0, "age11_15"] = df["age"] df.loc[(df["age"] > 0) & (df["age"] < 11), "age11_15"] = 11 df.loc[(df["age"] > 10) & (df["age"] < 16), "age11_15"] = df["age"] @@ -218,8 +182,6 @@ df.loc[df["age"] > 14, "age11_15"] = 15 This results in creating a new column "age11_15" in the existing dataframe, based on the CASE WHEN conditions we applied for the new derivation. 
```py -# NumPy package CASE WHEN -# create the age 11 to 15 derivation age11_15 = np.select( [ df['age'] == 10, # WHEN @@ -232,8 +194,6 @@ age11_15 = np.select( default=df['age'] # ELSE assign "age" column values ) -# assign the result to a new column -df["age11_15"] = age11_15 ``` In the first bracket you assign the "WHEN" part of the condition, second bracket the "THEN", and "default=..." represents the "ELSE" part. @@ -243,24 +203,14 @@ The NumPy option is faster and more efficient whereas Pandas is user friendlier ### Apply a column order ```py -# create a list of the column headers in a specific order -column_order = ["column 1", "column 2", "column 3", ...] -# apply list to dataframe -df = df[column_order] ``` ### Exporting the output ```py -# write output to a .csv -df.to_csv("output.csv", ... ) -# write output to an excel workbook -df.to_excel("output.xlsx", sheet_name="Sheet_name_1", ... ) -# write multiple sheets from different dataframes -with pd.ExcelWriter("output.xlsx") as writer: df1.to_excel(writer, sheet_name="Sheet_name_1") df2.to_excel(writer, sheet_name="Sheet_name_2") ``` diff --git a/docs/training_resources/python/config-files.md b/docs/training_resources/python/config-files.md index b025782..a93a6bd 100644 --- a/docs/training_resources/python/config-files.md +++ b/docs/training_resources/python/config-files.md @@ -1,4 +1,14 @@ -# Using Config Files +--- +title: Using Config Files + +tags: + - Python + - Coding tips + - Config files + - Project structure +--- + +# !!! tip "TLDR" @@ -178,8 +188,6 @@ elif config['report_type'] == 'monthly': df_report_data = get_monthly_data(config) ``` -# Over to you - As you were reading through this, did any ideas pop into your head about your own projects? Any values you keep having (or forgetting!) to change? Any bits of code you sometimes need to comment out? If so, you've got a prime candidate for using a config file! So give it a try - implement the above steps in your project and see what you think. 
Good luck! diff --git a/docs/training_resources/python/handling-file-paths.md b/docs/training_resources/python/handling-file-paths.md index 63bcddf..aa6659a 100644 --- a/docs/training_resources/python/handling-file-paths.md +++ b/docs/training_resources/python/handling-file-paths.md @@ -1,4 +1,15 @@ -# Handling file paths +--- +title: Handling file paths + +tags: + - Python + - Coding tips + - Pathlib + - File paths + - Pandas +--- + +# ## What is pathlib? @@ -45,19 +56,13 @@ operations For example, you can access the current working directory with the `cwd` attribute. ```python -# Print the current working directory (cwd) -print("CWD:", pathlib.Path.cwd()) ``` Pass strings to Path constructor to create a Path object ```python -# . is the current directory -cwd_path = pathlib.Path(".") print("CWD (again):", cwd_path) -# Use resolve to get the absolute path! -cwd_abspath = cwd_path.resolve() print("Absolute CWD:", cwd_abspath) ``` @@ -69,8 +74,6 @@ The following examples show how pathlib makes it easier to extract specific attr #### Example: absolute path to the current file ```python -# Note: __file__ is a global Python variable -this_file_path = pathlib.Path(__file__) print("Path to file:", this_file_path) ``` @@ -152,8 +155,6 @@ import pandas as pd import pyreadstat # needed to parse sav files in spss import pathlib2 # This is just a backwards compatible pathlib! -# https://realpython.com/python-pathlib/ - # Add parameters BASE_DIR = pathlib2.Path(r"\\\Publication\RAP") PUPIL_DIR = BASE_DIR / "Inputs" / "PupilData" diff --git a/docs/training_resources/python/intro-to-python.md b/docs/training_resources/python/intro-to-python.md index 3ed5ff4..f7e0acd 100644 --- a/docs/training_resources/python/intro-to-python.md +++ b/docs/training_resources/python/intro-to-python.md @@ -1,4 +1,15 @@ -# Intro to Python +--- +title: Intro to Python + +tags: + - Coding tips + - Python + - Virtual environments + - VS Code + - PySpark +--- + +# !!! 
tip "TLDR" - Python is a general-purpose, open-source programming language, good for data analysis, available on many data platforms diff --git a/docs/training_resources/python/logging-and-error-handling.md b/docs/training_resources/python/logging-and-error-handling.md index f555013..53890d8 100644 --- a/docs/training_resources/python/logging-and-error-handling.md +++ b/docs/training_resources/python/logging-and-error-handling.md @@ -1,4 +1,15 @@ -# Logging and error handling +--- +title: Logging and error handling + +tags: + - Coding tips + - Python + - Error handling + - Logging + - PySpark +--- + +# Logging and error handling are two concepts that will improve the reliability and maintainability of your code. These big topics could each be given their own chapter but here we try to show how the combination of simple logging and simple error handling can be easy to implement while offering substantial benefits. @@ -102,8 +113,6 @@ This is bad practice as instead of handling the specific errors the code could t ```python try: - # Some problematic code that could raise different kinds of exceptions -except ValueError as e: print('Found a value error!') print(repr(e)) exit() @@ -121,8 +130,6 @@ Alternatively if we really did want to handle all of those exceptions in the sam ```python try: - # Some problematic code that could raise different kinds of exceptions -except (ValueError, ZeroDivisionError, KeyError) as e: print('Found an error!') print(repr(e)) exit() @@ -165,8 +172,6 @@ As a general rule of thumb avoid using the generic `Exception` class at all. 
It ```python try: - # Some problematic code that could raise different kinds of exceptions -except Exception: print('Found an error!') exit() ``` @@ -200,8 +205,6 @@ def divide_two_numbers(a: float, b: float) -> float: print('Division failed because of: ' + repr(e)) raise ZeroDivisionError -# In use: -a = 1.0 b = 0 try: result = divide_two_numbers(a, b) @@ -220,8 +223,6 @@ Doing this raises a new ZeroDivisionError, which loses the stack trace of the or ```python except ZeroDivisionError: - # Do stuff - raise ``` #### Don't let the program continue if it can't diff --git a/docs/training_resources/python/project-structure-and-packaging.md b/docs/training_resources/python/project-structure-and-packaging.md index 5782e5a..1fc28ed 100644 --- a/docs/training_resources/python/project-structure-and-packaging.md +++ b/docs/training_resources/python/project-structure-and-packaging.md @@ -1,5 +1,15 @@ -# Project and Package structuring - +--- +title: Project and Package structuring + +tags: + - Project structure + - Python + - Packaging + - Pip + - Anaconda +--- + +# !!! tip "TLDR" - Generally you should use a standard repo structure - this page describes how and why - [RAP Python Package template](https://github.com/NHSDigital/rap-package-template) comes with sections for the different bits of your code, testing and prepares you for making your code into a Python package. diff --git a/docs/training_resources/python/python-functions.md b/docs/training_resources/python/python-functions.md index a4b51bb..c3b5e67 100644 --- a/docs/training_resources/python/python-functions.md +++ b/docs/training_resources/python/python-functions.md @@ -1,4 +1,15 @@ -# Python functions +--- +title: Python functions + +tags: + - Python + - Functions + - Docstrings + - Coding tips + - PySpark +--- + +# > We are currently updating our minimal Python RAP package template, which is freely available to use via Github: [RAP package template](https://github.com/NHSDigital/rap-package-template). 
@@ -11,13 +22,9 @@ In Python, the standard library provides many useful built-in function such as l Python functions are defined by using the `def` keyword. Here is an example of a Python function which converts the argument `temp` which is a value representing a temperature in Fahrenheit, into Celsius: ```python - # Define the function - def fahrenheit_to_celsius(temp): new_temp = (temp - 32) * (5/9) return new_temp - # Use the function - fahrenheit_to_celsius(77) >>> 25.0 ``` @@ -66,20 +73,14 @@ For example, we take the following function to implement bubble sort. A sorting If we had three arrays we wanted to sort (arr1, arr2, arr3) and we didn't use a function we would have to repeat our code three times. This produces messy code that is hard to read. Can you spot the errors in the code below? ```python - # Sort array 1 - n = len(arr1) for i in range(n-1): for j in range(0, n-i-1): if arr1[j] > arr1[j + 1] : arr1[j], arr1[j + 1] = arr1[j + 1], arr1[j] - # Sort array 2 - n = len(arr2) for i in range(m-1): for j in range(0, n-i-1): if arr2[j] > arr2[j + 1] : arr2[i], arr2[j + 1] = arr2[j + 1], arr2[j] - # Sort array 3 - n = len(arr3) for j in range(n-1): for i in range(0, n-i-1): if arr3[j] > arr3[j + 1] : @@ -89,12 +90,8 @@ If we had three arrays we wanted to sort (arr1, arr2, arr3) and we didn't use a Writing repetitive non-functional code is bad practice. Copy pasting processes like this and changing variable names can result in mistakes and will make a project excessively verbose. By using the defined function before, all of these issues are fixed with no functionality lost: ```python - # Sort array 1 - bubblesort(arr=arr1) # Sort array 2 bubblesort(arr=arr2) - # Sort array 3 - bubblesort(arr=arr3) ``` Re-usability is not the only benefit of functions, and just because a process is only done once does not mean that it should not be placed inside the body of a function instead. 
@@ -120,22 +117,14 @@ cholesterol_numerator = ( Using functions can better elucidate the steps of a process. A long process typically has several logical steps to it each of which can be delivered by a block of code. If we attempt to write out all the steps of the process in a long sequence of code this will be very difficult to read, and identifying the separate steps of the process is difficult. Below is an example from the diabetes RAP that uses functions, functions aren't being used here to cut down on code re-use but rather to make the steps of the delivered process more clear: ```python - # Step 3: Identify the best record for each person - best_record = identify_best_record(record_scores=record_scores) # Step 4: Use the best record to cut down the record_scores table, creating the golden record golden_record = create_golden_record_table(record_scores=record_scores, best_record=best_record) - # Step 5: Derive additional fields for the golden record table - enriched_golden_record = enrich_golden_record(golden_record=golden_record, hes_diabetes=hes_diabetes, imd_scores=imd_scores) - # Step 6: Output the golden record table to SQL so we can avoid recalculating next time - write_df_to_SQL(df_to_write = enriched_golden_record, target_table = golden_record_sql, mode = 'overwrite', database = params['work_db']) - # Step 7: after attain the golden_record (either by saved table or generate new one), we build final table - final_table = produce_aggregates(golden_record) ``` It is important to note that overuse of functions like this can be a bad thing. Breaking down a process into too many functions will make code harder to read. An extreme example can demonstrate this: @@ -165,20 +154,14 @@ This code delivers a function bubblesort which has the same functionality as the Another benefit is that using functions allows us to adapt to new changes through the use of arguments. 
For example lets say in our initial bubble sort example it was decided that calling the arrays `arr1`, `arr2`, and `arr3` was not descriptive enough. The lead on the project decides that instead these arrays should be renamed to `array1`, `array2`, and `array3`. Without using a function we would have to change every occurrence of these variables in our long messy code: ```python -# Sort array 1 -n = len(array1) for i in range(n-1): for j in range(0, n-i-1): if array1[j] > array1[j + 1] : array1[j], array1[j + 1] = array1[j + 1], array1[j] -# Sort array 2 -n = len(array2) for i in range(n-1): for j in range(0, n-i-1): if array2[j] > array2[j + 1] : array2[j], array2[j + 1] = array2[j + 1], array2[j] -# Sort array 3 -n = len(array3) for i in range(n-1): for j in range(0, n-i-1): if array3[j] > array3[j + 1] : @@ -188,24 +171,16 @@ for i in range(n-1): This is bad practice. It is time consuming and could also result in mistakes. If instead we used a defined function we would only have to change the name of the variable passed as an argument: ```python -# Sort array 1 -bubblesort(arr=array1) # Sort array 2 bubblesort(arr=array2) -# Sort array 3 -bubblesort(arr=array3) ``` Another example could be seen in the diabetes RAP. 
Lets say that in step 5 we instead want to use some `new_hes_diabetes` data, we can simply update that one argument without having to rewrite the whole logical step of the process: ```python -# OLD Step 5: Derive additional fields for the golden record table -golden_record = enrich_golden_record(golden_record=golden_record, hes_diabetes=hes_diabetes, imd_scores=imd_scores) -# NEW Step 5: Derive additional fields for the golden record table -golden_record = enrich_golden_record(golden_record=golden_record, # change: hes_diabetes=new_hes_diabetes, imd_scores=imd_scores) diff --git a/docs/training_resources/python/unit-testing-field-definitions.md b/docs/training_resources/python/unit-testing-field-definitions.md index ac3812b..a710b45 100644 --- a/docs/training_resources/python/unit-testing-field-definitions.md +++ b/docs/training_resources/python/unit-testing-field-definitions.md @@ -1,4 +1,15 @@ -# Unit testing field definitions +--- +title: Unit testing field definitions + +tags: + - Unit testing + - Testing + - Field Definitions + - Python + - PySpark +--- + +# This section focuses on one specific application of unit testing that is very relevant to analysts; testing the definitions of different fields (columns). One of the biggest burdens on stats teams is to maintain accurate definitions of fields over time as the specifications change. This drift in definitions is also one of the biggest sources of errors in stats publications. In the worst cases you may find that a field is defined in dozens of locations across a code-base. Each of these needs to be updated each time a change happens - leading to burden and risk. 
diff --git a/docs/training_resources/python/unit-testing.md b/docs/training_resources/python/unit-testing.md index d4db64c..826aabf 100644 --- a/docs/training_resources/python/unit-testing.md +++ b/docs/training_resources/python/unit-testing.md @@ -1,4 +1,15 @@ -# Unit testing +--- +title: Unit testing + +tags: + - Testing + - Unit testing + - Test Driven Development (TDD) + - Python + - PySpark +--- + +# Tests are functions which make logical assertions. If all assertions are correct then the test passes, if at least one assertion is incorrect then the test fails. Tests are a useful metric for deciding if an application has met its requirements. diff --git a/docs/training_resources/python/using-f-strings-sql-queries.md b/docs/training_resources/python/using-f-strings-sql-queries.md index 9ecc233..5a3e847 100644 --- a/docs/training_resources/python/using-f-strings-sql-queries.md +++ b/docs/training_resources/python/using-f-strings-sql-queries.md @@ -1,4 +1,13 @@ -# Using Python f-strings to run SQL queries +--- +title: Using Python f-strings to run SQL queries + +tags: + - Python + - SQL + - Pandas +--- + +# !!! tip "TLDR" - [Python can be used to run SQL strings](#parametrising-sql-queries-using-python-f-strings), and they can be parametrised using [f-strings](#option) to make "dynamic SQL" @@ -54,8 +63,6 @@ import sqlalchemy as sa # read in sqlalchemy package import pandas as pd # read in pandas import pyodbc # import odbc drivers for SQL -# Python function -def get_df_from_sql(query, server, database) -> pd.DataFrame: """ Use sqlalchemy to connect to the NHSD server and database with the help of mssql and pyodbc packages @@ -96,8 +103,6 @@ def create_sql_query(database, table, year, code_value) -> str: ``` Notice how we inserted a standard SQL query into a Python f-string format and assigned it to the variable `query`. 
The when we call this function, all we need to do is insert the arguments for `database`, `table`, `year` and `code_value` and assign the result to a `variable`. ```py -# call function and assign to variable for arguments: database = NHS, table = patients, year = 2020, code_value = 3) -sql_query_for_publication = create_sql_query("NHS", "patients", "2020", "3") ``` Now that we have our query loaded up with our chosen parameters, we can go ahead and create our `Pandas` dataframe: ```py @@ -135,11 +140,7 @@ def read_in_sql_query(sql_file_path: str, **sql_parameters) -> str: ``` We can call this function to create our query, from the `.sql` file: ```py -# location of sql folder containing SQL scripts -sql_file_path = r'src\sql_scripts\my_sql_query.sql' -# call function -sql_parameters = {database: "NHS", table: "patients", year: "2020", code_value: "3"} sql_query_for_publication = read_in_sql_file(sql_file_path, **sql_parameters) ``` We can reuse the get_df_from_sql() function from Option 1 to read in the data and create a `Pandas` dataframe: @@ -199,39 +200,23 @@ When run in SQL Server, the stored procedure above will produce an output contai ```py import pyodbc as po -# Connection variables -server = '' database = '' username = '' # username and password might not be required to connect password = '' -# this can be wrapped in a fuction like in the example above using the variables server, database, username, password. -try: # Connection string cnxn = po.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server+';DATABASE='+database+';UID='+username+';PWD=' + password) cursor = cnxn.cursor() - # Prepare the stored procedure execution script and parameter values - storedProc = "Exec PatientsNHSERegion @PatientID = ?, @NHSECode = ?" 
params = ("1234590", 5150) - # Execute Stored Procedure With Parameters - cursor.execute(storedProc, params) - # Iterate the cursor - row = cursor.fetchone() while row: - # Print the row - print(str(row[0]) + " : " + str(row[1] or '') ) row = cursor.fetchone() - # Close the cursor and delete it - cursor.close() del cursor - # Close the database connection - cnxn.close() except Exception as e: print("Error: %s" % e) diff --git a/docs/training_resources/python/virtual-environments/conda.md b/docs/training_resources/python/virtual-environments/conda.md index 840ebbe..d14e78f 100644 --- a/docs/training_resources/python/virtual-environments/conda.md +++ b/docs/training_resources/python/virtual-environments/conda.md @@ -1,4 +1,13 @@ -# Conda environment +--- +title: Conda environment + +tags: + - Python + - Virtual environments + - Anaconda +--- + +# Conda is the environment manager (and package manager) bundled with Anaconda, which is a commonly used distribution of Python, R and other applications. See [these instructions][install-anaconda] on how to install and setup Anaconda. Python packages (and other applications) in Conda are curated by the Anaconda team, however, there is also "conda-forge" which is a "channel" of their package repository which is managed by "the community", i.e. whoever made the packages. diff --git a/docs/training_resources/python/virtual-environments/venv.md b/docs/training_resources/python/virtual-environments/venv.md index b5f2157..5c95556 100644 --- a/docs/training_resources/python/virtual-environments/venv.md +++ b/docs/training_resources/python/virtual-environments/venv.md @@ -1,4 +1,13 @@ -# Venv environment +--- +title: Venv environment + +tags: + - Python + - Virtual environments + - Anaconda +--- + +# !!! 
warning diff --git a/docs/training_resources/python/virtual-environments/why-use-virtual-environments.md b/docs/training_resources/python/virtual-environments/why-use-virtual-environments.md index d766109..6341612 100644 --- a/docs/training_resources/python/virtual-environments/why-use-virtual-environments.md +++ b/docs/training_resources/python/virtual-environments/why-use-virtual-environments.md @@ -1,4 +1,14 @@ -# Virtual environments +--- +title: Virtual environments + +tags: + - Python + - Virtual environments + - Pip + - Anaconda +--- + +# ## What are virtual environments? diff --git a/docs/training_resources/python/visualisation-in-python.md b/docs/training_resources/python/visualisation-in-python.md index 16a8105..26fc849 100644 --- a/docs/training_resources/python/visualisation-in-python.md +++ b/docs/training_resources/python/visualisation-in-python.md @@ -1,4 +1,16 @@ -# Visualisations in Python +--- +title: Visualisations in Python + +tags: + - Data visualisation + - Python + - Matplotlib + - Accessibility + - Pandas + - Numpy +--- + +# Creating visualisations can require a lot of effort. @@ -25,15 +37,11 @@ Using the famous iris dataset we can produce a scatter plot of the sepal length from matplotlib import pyplot as plt import seaborn as sns -# View a list of pre-loaded Seaborn datasets -for dataset in sns.get_dataset_names(): print(dataset) df = sns.load_dataset("iris") # we import iris data print(df.head(3)) # view first 3 rows of the data -# Create a scatter plot -plt.scatter(df['sepal_length'], df['sepal_width'], color='green', marker='s') plt.xlabel('Sepal length') # x axis title/label plt.ylabel('Sepal width') # y axis title/label plt.show() @@ -44,8 +52,6 @@ plt.show() Notice how we set the colour of the data points (color parameter) and the shape (marker parameter = square). 
To edit the transparency degree (alpha parameter) of the points: ``` -# Edit data points transparency with the parameter alpha -plt.scatter(df['sepal_length'], df['sepal_width'], alpha=0.2) plt.xlabel('Sepal length') plt.ylabel('Sepal width') plt.show() @@ -72,18 +78,12 @@ df1 = pd.read_csv(path_1) # Create a Pandas dataframe After loading our Python packages and dummy data we can start working on producing the first plot: ``` -# Create a line chart (Plot 1) -plt.figure(figsize=(10, 5)) # set the figure size plt.plot(df1['Year'], df1['Percent'], label="Sepal", linewidth=2, linestyle='-') # create the plot -# Define labels and ticks -plt.ylabel("Percent", loc='top', rotation="horizontal") # y label title and location plt.xticks(np.arange(1982, 2020, step=2)) # x axis ticks range plt.yticks(np.arange(0, 70, step=10)) # y axis ticks range plt.grid(axis='y') # opting for y axis gridlines -# Create annotations (done here for one annotation to avoid redundancy) -plt.annotate('Jan 2003: Large new health warnings on cigarette packs', xy=(2003, 42), xytext=(2003, 52), size=9, bbox=dict(boxstyle="square", fc='0.95', pad=1, ec="none"), arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=6)) plt.box(False) # remove outer borders @@ -103,13 +103,9 @@ The advantages of a .svg file compared to using a .png file is better outlined t Similarly, to create the third plot (horizontal bar chart) on the publication webpage: ``` -# Create the Pandas dataframe -percent_y = [2, 7, 14, 22, 31] index = ['11 years', '12 years', '13 years', '14 years', '15 years'] df = pd.DataFrame({'Percent': percent_y}, index = index) -# Plot the horizontal bar chart -ax = df.plot.barh() # plot the chart ax.invert_yaxis() # invert the y axis plt.xlabel("Percent", loc='right', rotation="horizontal") # place the x label ax.get_legend().remove() # remove the unnecessary legend diff --git a/docs/training_resources/refactoring-guide.md b/docs/training_resources/refactoring-guide.md index 
bc1e710..5ce6a0a 100644 --- a/docs/training_resources/refactoring-guide.md +++ b/docs/training_resources/refactoring-guide.md @@ -1,13 +1,23 @@ -Note: this is temporarily sitting here until we decide on its future home +--- +title: Refactoring Golden rules + +tags: + - Coding tips + - Loose coupling + - Project structure + - Refactoring +--- -# Refactoring Golden rules +# + +Note: this is temporarily sitting here until we decide on its future home !!! tip "TLDR" - Refactoring means improving your code without changing what it does. - - Your aim is to make the code more readable, more maintabalbe, more reusable, and more efficient. + - Your aim is to make the code more readable, more maintainable, more reusable, and more efficient. - If you don't refactor your code, it can become increasingly more difficult and time-consuming to maintain over time. - - in this article you will learn golden rules on what good code looks like, and examples of "code smells" which indicate when code might need refactoring.``` + - In this article you will learn golden rules on what good code looks like, and examples of "code smells" which indicate when code might need refactoring. ??? success "Pre-requisites" @@ -218,4 +228,4 @@ Thinking about your pipelines, did any of the above rules jump out to you as som If you have any other refactoring golden rules that we haven't covered here, do get in touch and let us know! 
-[coding-best-practice]:../implementing_RAP/coding-best-practice.md \ No newline at end of file +[coding-best-practice]:../implementing_RAP/coding-best-practice.md diff --git a/docs/useful_links.md b/docs/useful_links.md index 495a6be..d6988d6 100644 --- a/docs/useful_links.md +++ b/docs/useful_links.md @@ -1,9 +1,16 @@ --- +title: Useful links + +tags: + - Policy + - Implementing RAP + - Community support + - Useful links hide: - navigation --- -# Useful links +# ## Strategic diff --git a/mkdocs.yml b/mkdocs.yml index b3ae4ad..5eae22f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,8 @@ plugins: - redirects: redirect_maps: "training_resources/git/intro-to-git.md": "training_resources/git/introduction-to-git.md" + - tags: + tags_file: tags.md nav: - Home: index.md - About: about.md @@ -20,6 +22,7 @@ nav: - What is RAP?: introduction_to_RAP/what_is_RAP.md - Why RAP is important: introduction_to_RAP/why_RAP_is_important.md - Levels of RAP: introduction_to_RAP/levels_of_RAP.md + - Government policy on RAP: introduction_to_RAP/gov-policy-on-rap.md - What is open source?: introduction_to_RAP/what-is-open-source.md - The history of RAP: introduction_to_RAP/history_of_RAP.md - RAP in health: introduction_to_RAP/RAP_in_health.md @@ -35,6 +38,10 @@ nav: - Implementing RAP: - Are you ready for RAP?: implementing_RAP/rap-readiness.md + - Technical Skills for RAP: + - Git for Engagements: implementing_RAP/skills_for_rap/git_for_rap.md + - Python for Engagements: implementing_RAP/skills_for_rap/python_for_rap.md + - R for Engagements: implementing_RAP/skills_for_rap/r_for_rap.md - Process Mapping: implementing_RAP/process_mapping.md - Code review: implementing_RAP/code-review.md - Coding best practice: implementing_RAP/coding-best-practice.md @@ -54,6 +61,7 @@ nav: - Using Git collaboratively: training_resources/git/using-git-collaboratively.md - Making code discoverable: training_resources/git/making-code-discoverable.md - Git Hooks: training_resources/git/githooks.md + 
- GitHub vs GitLab Terminology: training_resources/git/github_gitlab_terminology.md - Quick Start Guides: - Git: training_resources/git/quick_start_guides/git_quick_start_guide.md - GitHub: training_resources/git/quick_start_guides/github_quick_start_guide.md @@ -92,6 +100,7 @@ nav: - Support: support.md - Glossary: glossary.md - Useful links: useful_links.md + - Tags: tags.md - Site info: - Acknowledgements: site_info/acknowledgements.md - Website Release Process: site_info/rap-release-workflow.md @@ -113,6 +122,7 @@ theme: - content.tabs.link - navigation.tabs - navigation.indexes + - toc.integrate icon: admonition: : material/alert @@ -121,7 +131,7 @@ extra_css: markdown_extensions: - def_list - pymdownx.tasklist: - # clickable_checkbox: true + clickable_checkbox: true custom_checkbox: true - tables - attr_list diff --git a/overrides/main.html b/overrides/main.html index f3bfdc7..f7ecd46 100644 --- a/overrides/main.html +++ b/overrides/main.html @@ -7,6 +7,15 @@ {% endif %} + +{% if page and page.meta %} +

{{ page.meta.title }}

+ + {% if page.meta.summary %} +
{{ page.meta.summary }}
+ {% endif %} +{% endif %} + {{ super() }}
diff --git a/utils/list-pages-without-tags.py b/utils/list-pages-without-tags.py new file mode 100644 index 0000000..b367513 --- /dev/null +++ b/utils/list-pages-without-tags.py @@ -0,0 +1,28 @@ +# prints list of files that do not have tags defined +# run with: python utils/list-pages-without-tags.py +import os + +# Function to extract tags from a markdown file +def file_has_tags(file_path): + + with open(file_path, 'r') as file: + + for line in file: + + if line.strip().startswith('tags:'): + return True + + return False + +# Get list of markdown files +md_files = [] +for root, dirs, files in os.walk('docs'): + for file in files: + if file.endswith('.md'): + md_files.append(os.path.join(root, file)) + +# Find files without tags +files_without_tags = [] +for file_path in md_files: + if not file_has_tags(file_path): + print(file_path) \ No newline at end of file diff --git a/utils/list-tags.py b/utils/list-tags.py new file mode 100644 index 0000000..c650735 --- /dev/null +++ b/utils/list-tags.py @@ -0,0 +1,56 @@ +# prints all tags appearing in the .md files in /docs to terminal +# run with: python utils/list-tags.py +import os +import re + +# Function to extract tags from a markdown file +def extract_tags_from_file(file_path): + + tags = [] + with open(file_path, 'r') as file: + + inside_tags = False + + for line in file: + # check we have reached the tags property + if line.strip().startswith('tags:'): + inside_tags = True + continue + + # if we're not inside the tags property, keep looping + if not inside_tags: + continue + + # if the first character is not '-' then the line is no longer in the tags property - break + if not line.strip().startswith('-'): + break +# if the second character is '-' then we are on the '---' line which ends the front matter - break + + if line.strip()[1] == '-': + break + + # Extract tag (removing leading "- ") + tag = line.strip()[2:] + tags.append(tag) + + return tags + +# Get list of markdown files +md_files = [] +for root, dirs, 
files in os.walk('docs'): + for file in files: + if file.endswith('.md'): + md_files.append(os.path.join(root, file)) + +# Extract tags from each markdown file +all_tags = [] +for file_path in md_files: + tags = extract_tags_from_file(file_path) + all_tags.extend(tags) + +# Remove duplicates and sort tags +sorted_unique_tags = sorted(set(all_tags)) + +# Print sorted tags list +for tag in sorted_unique_tags: + print(tag) \ No newline at end of file diff --git a/utils/move-header-to-front-matter.py b/utils/move-header-to-front-matter.py new file mode 100644 index 0000000..7695080 --- /dev/null +++ b/utils/move-header-to-front-matter.py @@ -0,0 +1,49 @@ +# loops through all .md files +# Where front matter exists, takes the page header '#' and moves it to the title property of the front matter +# adds a blank h1 '#' at the end of the page (material seems to require this somewhere) +# run with: python utils/move-header-to-front-matter.py +import os + +def move_header_to_front_matter(filepath, header): + lines = open(filepath).read().splitlines() + lines.insert(1, ("title: " + header) ) + lines.append("") + lines.append("# ") + for i, line in enumerate(lines): + if line.strip().startswith("# "): + del lines[i] + del lines[i] + open(filepath, mode='w').write('\n'.join(lines)) + +def get_header(file_path): + + with open(file_path, 'r') as file: + + has_front_matter = False + for line in file: + + if line.strip().startswith('---'): + has_front_matter = True + + # No front matter - return False + if not has_front_matter: + return False + + if line.strip().startswith('# '): + return line[2:] + + +# Get list of markdown files +md_files = [] +for root, dirs, files in os.walk('docs'): + for file in files: + if file.endswith('.md'): + md_files.append(os.path.join(root, file)) + +# move headers to front matter +files_without_tags = [] +for file_path in md_files: + print(file_path) + header = get_header(file_path) + if header: + move_header_to_front_matter(file_path, header) \ No 
newline at end of file