## We have to use different (though similar) functions for different waves because the documentation, located in the *"questionnaire"* folder, is formatted differently from one wave to another, so a single set of regular expressions does not work for all the waves.

<br>
<br>

> `get_description_12(var_name, text_to_scrap)` is the function that scrapes the documentation for waves 1 and 2

In [1]:
import re

def get_description_12(var_name, text_to_scrap):
    """Find the upper-case label that follows a variable name (waves 1 and 2).

    The text is searched for ``var_name`` as a whole word; the first run of
    upper-case letters, digits and spaces that appears after it on the same
    line is returned. When that fails, the search is repeated with the name
    reduced to its "two capitals + three digits" prefix followed by "_".

    Args:
        var_name: name of the variable to look up.
        text_to_scrap: documentation text to search in.

    Returns:
        The captured label when one of the two searches succeeds. Otherwise
        ``None`` when ``var_name`` looks like ``XX000`` or ``XX000_``, and the
        "Impossible to find ..." message for any other name.
    """
    #what must follow the name: a word boundary, a lazy skip, then the label
    label_tail = r"\b.*?([A-Z][A-Z0-9 ]+)"

    def _first_label(name):
        #search the documentation for ``name`` followed by an upper-case label
        return re.search(r"\b" + re.escape(name) + label_tail, text_to_scrap)

    #first attempt: the variable name exactly as given
    hit = _first_label(var_name)
    if hit is None:
        #second attempt: keep only the "XX000" prefix and append an underscore
        short_name = re.sub(r'(^[A-Z]{2}\d{3}).*', r'\1_', var_name)
        hit = _first_label(short_name)

    if hit is not None:
        return hit.group(1)

    #nothing found: a well-formed name gives None, any other name the message
    well_formed = re.match(r'^[A-Z]{2}\d{3}_?$', var_name)
    return None if well_formed else "Impossible to find the name associated with this variable"

<br>
<br>

> `get_description_3(var_name, text_to_scrap)` is the function that scrapes the documentation for wave 3

In [2]:
import re

def get_description_3(var_name, text_to_scrap):
    """Find the upper-case label that follows a variable name (wave 3).

    Pairs of consecutive spaces in the text are first replaced by a single
    space. The text is then searched for ``var_name`` as a whole word and,
    failing that, for its "two capitals + three digits" prefix. In both cases
    the first run of upper-case letters, digits and spaces found after the
    name on the same line is returned.

    Args:
        var_name: name of the variable to look up.
        text_to_scrap: documentation text to search in.

    Returns:
        The captured label, or the "Impossible to find ..." message when
        neither search succeeds.
    """
    not_found = "Impossible to find the name associated with this variable"

    #replace every pair of consecutive spaces by one space (single pass)
    text_to_scrap = re.sub(r'  ',r' ',text_to_scrap)

    #what must follow the name: a word boundary, a lazy skip, then the label
    label_tail = r"\b.*?([A-Z][A-Z0-9 ]+)"

    #try the name as given, then the name without what follows the 3 digits
    candidates = (var_name, re.sub(r'(^[A-Z]{2}\d{3}).*', r'\1', var_name))
    for name in candidates:
        match = re.search(r"\b" + re.escape(name) + label_tail, text_to_scrap)
        if match:
            return match.group(1)

    #the original format check returned this same message on both paths,
    #so it is reduced to a single return
    return not_found

<br>
<br>

> `get_description_45678(var_name, text_to_scrap)` is the function that scrapes the documentation for waves 4, 5, 6, 7 and 8

In [3]:
import re

def get_description_45678(var_name, text_to_scrap):
    """Find the word that follows a variable name (waves 4, 5, 6, 7 and 8).

    ``var_name`` must start with two capital letters and three digits.
    Whatever follows those five characters is replaced by a single "_", and
    the text is searched for that rewritten name. The result is the first run
    of word characters after it, on the same line, that is directly followed
    by whitespace.

    Args:
        var_name: name of the variable to look up.
        text_to_scrap: documentation text to search in.

    Returns:
        The captured word, or the "Impossible to find ..." message when the
        name does not have the expected prefix or nothing is found.
    """
    not_found = "Impossible to find the name associated with this variable"

    #names with and without a trailing underscore were handled by two
    #identical branches; one prefix check covers both
    if not re.match(r'^[A-Z]{2}[0-9]{3}', var_name):
        return not_found

    #normalise the name to "XX000_" before looking it up
    lookup_name = re.sub(r'(^[A-Z]{2}\d{3}).*', r'\1_', var_name)
    description_pattern = r"\b" + re.escape(lookup_name) + r".*?(\w+)\s"
    description_match = re.search(description_pattern, text_to_scrap)
    if description_match:
        return description_match.group(1)
    return not_found

<br>
<br>

> `remove_selected()` is a function that removes the pattern "(SELECTED)" from a text. We use it on the documentation of waves 1 and 2, where this pattern sometimes prevented us from capturing the full description of a variable

In [4]:
import re

#function that removes the "(SELECTED)" pattern from a text
def remove_selected(string):
    """Return ``string`` with every literal "(SELECTED)" occurrence deleted."""
    #the parentheses are escaped so they are matched literally
    marker = re.compile(r"\(SELECTED\)")
    return marker.sub("", string)

<br>
<br>

> `extract_text_from_pdf()` takes the path of a PDF file as argument and returns the text it contains

In [5]:
import PyPDF2

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: path of the PDF file, passed to ``open``.

    Returns:
        One string made of the text extracted from each page, with no
        separator added between pages.
    """
    #the file is opened in binary mode and closed when the block exits
    with open(pdf_file, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)

        #collect the text of each page, then glue the pieces together
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)

<br>
<br>

> `change_case()` puts the first letter in upper case and all the others in lower case for waves 1, 2 and 3;
for the other waves the text is returned unchanged, since their documentation does not use the same syntax

In [6]:
def change_case(string, wave):
    """Normalise the case of a description according to its wave.

    For waves 1, 2 and 3 the first character is upper-cased and every other
    character is lower-cased. For any other wave the input is returned
    unchanged.

    Args:
        string: description to normalise; may be empty or ``None``.
        wave: wave number the description comes from.

    Returns:
        The normalised description. An empty string or ``None`` is returned
        as is for every wave.
    """
    #nothing to change for an empty or missing description; this also avoids
    #indexing into "" or None (get_description_12 can return None)
    if not string:
        return string

    if wave in (1, 2, 3):
        return string[:1].upper() + string[1:].lower()
    return string