### 1. Append rounds
Combine feedback data from round1 and round2

In [4]:
import pandas as pd

# Load the two feedback files
round1_file = "../data/processed/round1_feedback.csv"
round2_file = "../data/processed/round2_feedback.csv"

round1_feedback = pd.read_csv(round1_file)
round2_feedback = pd.read_csv(round2_file)

# Append the two DataFrames
combined_feedback = pd.concat([round1_feedback, round2_feedback], ignore_index=True)

# Save the combined data to a new file
output_file = "../data/processed/combined_feedback.csv"
combined_feedback.to_csv(output_file, index=False)

print(f"Combined feedback saved to {output_file}")


Combined feedback saved to ../data/processed/combined_feedback.csv


### 2. Extract text from PDFs
 Extract text from PDF files and merge into the combined dataset

In [2]:
import os
from PyPDF2 import PdfReader  

# Extract text from the pdfs
def extract_text_from_pdfs(directory):
    pdf_data = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory, filename)
            try:
                reader = PdfReader(pdf_path)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
                pdf_data.append({"Feedback reference": filename[:-4], "text": text})  #Removes the ".pdf" from the filename
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
    return pdf_data

# Path to the folder
pdf_folder = "../data/raw"

# Extract text and turn into a df
pdf_data = extract_text_from_pdfs(pdf_folder)
df = pd.DataFrame(pdf_data)

print(df.head())

  Feedback reference                                               text
0           F2665480   \n \n \n \n \n \nHOW THE EU CAN ACHIEVE LEGAL...
1           F2665494   \n \n \n1 \n \n \nAI EU ACT: Main issues iden...
2           F2665443   \n \nLiberty Global – supplementary position ...
3           F2665331  Feedback provided by AstraZeneca:  \nArtificia...
4           F2665641   \n 1  U.S. Chamber of Commerce Comments conce...


In [4]:
# Merge extracted text with combined feedback
complete_pdfs_df = pd.merge(combined_feedback, df, on="Feedback reference", how="left")
complete_pdfs_df['text'] = complete_pdfs_df['text'].fillna(complete_pdfs_df['Additional Message'])

In [5]:
# Remove unnecessary columns
columns_to_remove = ["Feedback Content", "Feedback Link", "Page", "Submitted by", "Organisation", "Transparency register number", "Initiative", "Additional Message"]
complete_pdfs_df = complete_pdfs_df.drop(columns=columns_to_remove)

In [6]:
# Remove rows where 'text' column is empty
complete_pdfs_df = complete_pdfs_df.dropna(subset=['text'])
print(complete_pdfs_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 0 to 436
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Organization        434 non-null    object
 1   Feedback reference  434 non-null    object
 2   Submitted on        434 non-null    object
 3   User type           420 non-null    object
 4   Organisation size   401 non-null    object
 5   Country of origin   420 non-null    object
 6   text                434 non-null    object
dtypes: object(7)
memory usage: 27.1+ KB
None


###  3. Data Cleaning

In [7]:
import re

# List of regex patterns and their replacements
CLEANING_PATTERNS = [
    (r'\s+', ' '),  # Remove excessive whitespace and line breaks
    (r'[^\x00-\x7F]+', ' '),  # Remove non-ASCII characters
    (r'^\d+\s+', ''),  # Remove leading numbers
    (r'(\b\d+\.\s?)+', ''),  # Remove enumerations like '1. 2. 3.'
    (r'(\b\d+\:\s?)+', ''),  # Remove patterns like '1: 2: 3:'
    (r'(\b\d+\sId\.)+', ''),  # Remove patterns like '9 Id. 10 Id.'
    (r'\bId\.\b', ''),  # Remove dangling 'Id.'
    (r'https?://\S+|www\.\S+', ''),  # Remove URLs
    (r'(\b\d{2,}\.\s?)+', ''),  # Remove enumerations like '10.', '11.'
    (r'\b(I{1,3}|IV|V|VI{0,3}|IX|X)\.\b', ''),  # Remove Roman numerals like 'III.', 'II.'
    (r'\b\d+[a-zA-Z]?\)', ''),  # Remove patterns like '3a)', '4b)', '2)'
    (r'\b\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}\b', ''),  # Remove phone numbers
    (r'\(\d{3}\)\s?\d{3}[-\.\s]?\d{4}\b', ''),  # Remove phone numbers in brackets
    (r'\b(x-axis|y-axis|z-axis|figure\s?\d+|chart\s?\d+|legend|data points)\b', '', re.IGNORECASE),  # Remove graph-related text
    (r'(Table of Contents|CONTENTS).*?(\d+)(\s|$)', '', re.IGNORECASE),  # Remove Table of Contents section
    (r'\.{2,}\s?\d+', ''),  # Remove dot leaders followed by page numbers
    (r'^\s*(I{1,3}|IV|V|VI{0,3}|IX|X)\.\s+.*$', '', re.MULTILINE),  # Remove Roman numeral sections
]


# Data cleaning function
def clean_text(text, patterns):
    """Clean and preprocess text using modular patterns."""
    if not isinstance(text, str):
        return ""  # Handle non-string inputs
    
    for pattern, replacement, *flags in patterns:
        if flags:  # If regex flags are provided
            text = re.sub(pattern, replacement, text, flags=flags[0])
        else:
            text = re.sub(pattern, replacement, text)
    
    # Normalize whitespace after all cleaning
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Example usage with a DataFrame
def process_dataframe(input_df, output_file):
    """Clean the 'text' column of a DataFrame and save it to a CSV file."""
    # Ensure the DataFrame is a copy
    cleaned_df = input_df.copy()
    
    # Apply the cleaning function to the 'text' column
    cleaned_df['text'] = cleaned_df['text'].apply(lambda x: clean_text(x, CLEANING_PATTERNS))
    
    # Save the cleaned DataFrame to a CSV file
    cleaned_df.to_csv(output_file, index=False)
    print(f"Cleaned text data saved to {output_file}")
    return cleaned_df

# Apply cleaning to the DataFrame
cleaned_pdfs_df = process_dataframe(complete_pdfs_df, output_file)

# Preview cleaned text
print(cleaned_pdfs_df['text'].head())


Cleaned text data saved to ../data/processed/combined_feedback.csv
0    September 10, 2020 Consumer Technology Associa...
1    CDT welcomes the opportunity to provide input ...
2    1 June 2020 European Commission's White Paper ...
3    September 2020 EuroCommerce contributions to t...
4    Slovensk aliancia pre inovat vnu ekonomiku Gr ...
Name: text, dtype: object


In [8]:
cleaned_pdfs_df

Unnamed: 0,Organization,Feedback reference,Submitted on,User type,Organisation size,Country of origin,text
0,Consumer Technology Association (United States),F551055,10 September 2020,Business association,Medium (50 to 249 employees),United States,"September 10, 2020 Consumer Technology Associa..."
1,Center for Democracy & Technology (United States),F551054,10 September 2020,Non-governmental organisation (NGO),Small (10 to 49 employees),United States,CDT welcomes the opportunity to provide input ...
2,ETNO - European Telecommunications Network Ope...,F551053,10 September 2020,Business association,Micro (1 to 9 employees),Belgium,1 June 2020 European Commission's White Paper ...
3,EuroCommerce (Belgium),F551052,10 September 2020,Business association,Small (10 to 49 employees),Belgium,September 2020 EuroCommerce contributions to t...
4,Slovak Alliance for Innovation Economy (Slovakia),F551051,10 September 2020,Business association,Micro (1 to 9 employees),Slovakia,Slovensk aliancia pre inovat vnu ekonomiku Gr ...
...,...,...,...,...,...,...,...
432,NEC Laboratories Europe GmbH (Germany),F2256824,05 May 2021,Academic/research Institution,Medium (50 to 249 employees),Germany,Test.
433,Agence du Numérique (AdN) (Belgium),F2256808,05 May 2021,Company/business,Small (10 to 49 employees),Belgium,Please find below our feedback on the AI impac...
434,gauthier lasou (France),F2256463,28 April 2021,EU citizen,,France,Un r glement europ en qui va d finir des exige...
435,SB Science Management UG (haftungsbeschränkt) ...,F2242340,27 April 2021,Company/business,Micro (1 to 9 employees),Germany,"Unfortunately, standardisation measures are st..."


In [124]:
# Replace NaN in 'text' column with an empty string explicitly
cleaned_pdfs_df['text'] = cleaned_pdfs_df['text'].fillna("").astype(str)

# Save cleaned data with UTF-8 encoding
output_file = "../data/processed/cleaned_data.csv"
cleaned_pdfs_df.to_csv(output_file, index=False, encoding='utf-8')
print(f"Data saved to {output_file}")


Data saved to ../data/processed/cleaned_data.csv


In [10]:
cleaned_data = pd.read_csv("../data/processed/cleaned_data.csv")
print(f"Number of observations (rows): {len(cleaned_data)}") 

Number of observations (rows): 434


In [12]:
# Filtering duplicates 
def filter_duplicates(df, output_file):
    df = df.drop_duplicates(subset=['text'])
    df.to_csv(output_file, index=False)
    print(f"Duplicates removed and data saved to {output_file}")
    return df

cleaned_data = filter_duplicates(cleaned_data, "../data/processed/cleaned_data.csv")

Duplicates removed and data saved to ../data/processed/cleaned_data.csv


In [13]:
cleaned_data = pd.read_csv("../data/processed/cleaned_data.csv")
print(f"Number of observations (rows): {len(cleaned_data)}") 

Number of observations (rows): 428


### Language Filtering
Separate English feedback from non-English entries.

In [17]:
from langdetect import detect, DetectorFactory, LangDetectException
import langid
DetectorFactory.seed = 0

def detect_language(text):
    try:
        if isinstance(text, str) and text.strip():
            return langid.classify(text)[0]
    except Exception as e:
        print(f"Language detection failed: {e}")
    return "unknown"

cleaned_data['language'] = cleaned_data['text'].apply(detect_language)
english_feedback_df = cleaned_data[cleaned_data['language'] == 'en']
non_english_feedback_df = cleaned_data[cleaned_data['language'] != 'en']

In [18]:
# Count occurrences of each language
language_counts = cleaned_data['language'].value_counts()
print(language_counts)

language
en         379
de          15
fr          13
pl           5
pt           3
an           3
es           3
ro           1
hu           1
cy           1
it           1
unknown      1
nl           1
sv           1
Name: count, dtype: int64


In [19]:
# Save the English feedback dataset
english_feedback_df.to_csv("../data/processed/feedback_en.csv", index=False)

# Save non-English feedback
non_english_feedback_df.to_csv("../data/processed/feedback_non_en.csv", index=False)

In [21]:
feedback_en = pd.read_csv("../data/processed/feedback_en.csv")

In [22]:
feedback_en

Unnamed: 0,Organization,Feedback reference,Submitted on,User type,Organisation size,Country of origin,text,language
0,Consumer Technology Association (United States),F551055,10 September 2020,Business association,Medium (50 to 249 employees),United States,"September 10, 2020 Consumer Technology Associa...",en
1,Center for Democracy & Technology (United States),F551054,10 September 2020,Non-governmental organisation (NGO),Small (10 to 49 employees),United States,CDT welcomes the opportunity to provide input ...,en
2,ETNO - European Telecommunications Network Ope...,F551053,10 September 2020,Business association,Micro (1 to 9 employees),Belgium,1 June 2020 European Commission's White Paper ...,en
3,EuroCommerce (Belgium),F551052,10 September 2020,Business association,Small (10 to 49 employees),Belgium,September 2020 EuroCommerce contributions to t...,en
4,Slovak Alliance for Innovation Economy (Slovakia),F551051,10 September 2020,Business association,Micro (1 to 9 employees),Slovakia,Slovensk aliancia pre inovat vnu ekonomiku Gr ...,en
...,...,...,...,...,...,...,...,...
374,The Value Engineers (Netherlands),F2324448,12 May 2021,Company/business,Micro (1 to 9 employees),Netherlands,The proposed regulation is an excellent initia...,en
375,NEC Laboratories Europe GmbH (Germany),F2256824,05 May 2021,Academic/research Institution,Medium (50 to 249 employees),Germany,Test.,en
376,Agence du Numérique (AdN) (Belgium),F2256808,05 May 2021,Company/business,Small (10 to 49 employees),Belgium,Please find below our feedback on the AI impac...,en
377,SB Science Management UG (haftungsbeschränkt) ...,F2242340,27 April 2021,Company/business,Micro (1 to 9 employees),Germany,"Unfortunately, standardisation measures are st...",en
