### PRELIMINARY DATA CLEANING AND ANALYSIS

### Option 1 (preferred)

In [None]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load spaCy's large Dutch pipeline once at module level; preprocess_content()
# below calls it to lemmatize the cleaned tokens.
nlp = spacy.load('nl_core_news_lg')

# Define a function for additional cleaning
def clean_text(text):
    """Strip punctuation, digits and isolated ASCII letters from *text*.

    Args:
        text: Raw article text (str).

    Returns:
        The text with punctuation and ASCII digits removed, stand-alone
        single letters dropped and every whitespace run collapsed to one
        space. Leading/trailing whitespace is not stripped.
    """
    # Punctuation becomes a space so the neighbouring words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove every digit. The former date / stand-alone-number / '00' passes
    # ran after this step and could never match, so they are gone.
    text = re.sub(r'[0-9]', '', text)
    # Lookarounds (instead of consuming the surrounding whitespace) also catch
    # adjacent single letters ("a b c") and letters at the start or end.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # Collapse the gaps left behind by the removals above.
    text = re.sub(r'\s+', ' ', text)
    return text

# Tokenization, stopword removal, and lemmatization
def preprocess_content(text):
    """Clean, tokenize, stop-word-filter and lemmatize one article.

    Args:
        text: Raw article text (str).

    Returns:
        The lowercased lemmas of the remaining tokens, joined by single spaces.
    """
    # Build the stop-word set once per call instead of once per token.
    dutch_stopwords = set(stopwords.words('dutch'))
    tokens = word_tokenize(clean_text(text), language='dutch')
    # Tokens still carry their original case here (lowercasing happens on the
    # lemmas below), so compare case-insensitively to also drop e.g. "De".
    kept = [token for token in tokens if token.lower() not in dutch_stopwords]
    lemmas = [token.lemma_.lower() for token in nlp(' '.join(kept))]
    return ' '.join(lemmas)

# Read the articles CSV into a DataFrame
file_path = 'C:/Users/xx/Downloads/Artikelen_Sanders.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Coerce every 'content' cell to str, then run it through preprocess_content
raw_articles = df['content'].astype(str)
df['processed_content'] = raw_articles.apply(preprocess_content)



### Option 2 Using tokenizer from BERTje

In [None]:
# Cleaning using tokenizer from BERTje
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from transformers import AutoTokenizer

# Fetch the NLTK resources used below (tokenizer models and stop-word lists)
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Project-specific stop words added on top of NLTK's Dutch list.
# NOTE(review): 'kommen' may be intended as 'komen' — confirm.
additional_stop_words = ['één', 'kommen', 'zeggen','zullen','moeten','gaan', 'wij']  # Add your stop words here
dutch_stop_words = set(stopwords.words('dutch')) | set(additional_stop_words)

# spaCy Dutch pipeline, used for lemmatization in preprocess_text
nlp = spacy.load('nl_core_news_lg')

# BERTje tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Articles to process
df = pd.read_csv('C:/Users/xx/Downloads/200_random_artikelen.csv')  # Replace with your file path

# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize, lemmatize and stop-word-filter one article.

    Args:
        text: Raw cell value. Anything that is not None/NaN is converted
            with str(); None and NaN yield an empty string.

    Returns:
        The lemmas that are not in dutch_stop_words, joined by single spaces.
    """
    # A missing cell would otherwise be processed as the literal word
    # "none"/"nan" and end up in the cleaned text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).replace("\r\n", " ").strip().lower()

    # Punctuation becomes a space so neighbouring words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove every digit. The former date / stand-alone-number / '00' passes
    # ran after this step and could never match, so they are gone.
    text = re.sub(r'[0-9]', '', text)
    # Lookarounds also catch adjacent single letters ("a b c") and letters at
    # the start or end, which the whitespace-consuming pattern missed.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # Collapse the gaps so spaCy does not emit whitespace-only tokens.
    text = re.sub(r'\s+', ' ', text).strip()

    # Lemmatize with spaCy, then drop stop words (compared on the lemma).
    lemmas = [token.lemma_ for token in nlp(text) if not token.is_space]
    return ' '.join(lemma for lemma in lemmas if lemma not in dutch_stop_words)

# Clean every article ('content' is the raw-text column; rename if yours differs)
raw_column = df['content']
df['cleaned_text'] = raw_column.apply(preprocess_text)

# Encode each cleaned article with the BERTje tokenizer, special tokens included;
# the result is stored per row in 'tokenized_text'
df['tokenized_text'] = df['cleaned_text'].map(
    lambda cleaned: tokenizer.encode(cleaned, add_special_tokens=True)
)

### EXTRACT THE MOST COMMON WORDS IN PROCESSED CONTENT

In [None]:
# RAW DATA: select the rows whose 'content' is shorter than 200 characters
is_short = df['content'].str.len() < 200
short_text_rows = df[is_short]

# Print each selected row's label and text, followed by a separator line
for index, text in short_text_rows['content'].items():
    print(f"Row {index}:")
    print(text)
    print("---")

<strong>FILTER THE ARTICLES WITH LESS THAN 200 SYMBOLS FROM CONTENT</strong>

In [None]:
# PROCESSED DATA: select the rows whose 'processed_content' is shorter than 200 characters
is_short = df['processed_content'].str.len() < 200
short_text_rows = df[is_short]

# Print each selected row's label and text, followed by a separator line
for index, text in short_text_rows['processed_content'].items():
    print(f"Row {index}:")
    print(text)
    print("---")

In [None]:
import pandas as pd
import re

# Assuming 'df' is your DataFrame loaded from 'Artikelen_Sanders.csv'

# Function to find special characters and count their occurrences RAW DATA
def find_and_count_special_characters(text):
    """Count the special characters that occur in *text*.

    A character is "special" when it is not a word character, not whitespace
    and not one of , . ? ! ; : ' " -

    Args:
        text: String to inspect. Non-string values (for example NaN from an
            empty cell) are treated as containing no special characters.

    Returns:
        dict mapping each special character to its number of occurrences.
    """
    from collections import Counter

    if not isinstance(text, str):
        return {}
    pattern = r'[^\w\s,.?!;:\'"-]'
    # Counter tallies in one pass instead of one list.count() scan per
    # distinct character.
    return dict(Counter(re.findall(pattern, text)))

# Tally the special characters of every raw article
df['special_characters_count'] = df['content'].apply(find_and_count_special_characters)

# For each row that has any, print its characters from most to least frequent
for index, char_count in df['special_characters_count'].items():
    if not char_count:
        continue
    by_frequency = sorted(char_count.items(), key=lambda pair: pair[1], reverse=True)
    sorted_chars = dict(by_frequency)
    print(f"Row {index}: {sorted_chars}")

<strong>Function to find special characters and count their occurrences</strong>


In [None]:
import pandas as pd
import re

# Assuming 'df' is your DataFrame loaded from 'Artikelen_Sanders.csv'

# Function to find special characters and count their occurrences
def find_and_count_special_characters(text):
    """Count the special characters that occur in *text*.

    A character is "special" when it is not a word character, not whitespace
    and not one of , . ? ! ; : ' " -

    Args:
        text: String to inspect. Non-string values (for example NaN from an
            empty cell) are treated as containing no special characters.

    Returns:
        dict mapping each special character to its number of occurrences.
    """
    from collections import Counter

    if not isinstance(text, str):
        return {}
    pattern = r'[^\w\s,.?!;:\'"-]'
    # Counter tallies in one pass instead of one list.count() scan per
    # distinct character.
    return dict(Counter(re.findall(pattern, text)))

# Tally the special characters of every article in 'content'
df['special_characters_count'] = df['content'].apply(find_and_count_special_characters)

# For each row that has any, print its characters from most to least frequent
for index, char_count in df['special_characters_count'].items():
    if not char_count:
        continue
    by_frequency = sorted(char_count.items(), key=lambda pair: pair[1], reverse=True)
    sorted_chars = dict(by_frequency)
    print(f"Row {index}: {sorted_chars}")

In [None]:
import pandas as pd
import re

# Assuming 'df' is your DataFrame loaded from 'Artikelen_Sanders.csv'

# Function to find special characters and count their occurrences PROCESSED DATA
def find_and_count_special_characters(text):
    """Count the special characters that occur in *text*.

    A character is "special" when it is not a word character, not whitespace
    and not one of , . ? ! ; : ' " -

    Args:
        text: String to inspect. Non-string values (for example NaN from an
            empty cell) are treated as containing no special characters.

    Returns:
        dict mapping each special character to its number of occurrences.
    """
    from collections import Counter

    if not isinstance(text, str):
        return {}
    pattern = r'[^\w\s,.?!;:\'"-]'
    # Counter tallies in one pass instead of one list.count() scan per
    # distinct character.
    return dict(Counter(re.findall(pattern, text)))

# Tally the special characters of every processed article
df['special_characters_count'] = df['processed_content'].apply(find_and_count_special_characters)

# For each row that has any, print its characters from most to least frequent
for index, char_count in df['special_characters_count'].items():
    if not char_count:
        continue
    by_frequency = sorted(char_count.items(), key=lambda pair: pair[1], reverse=True)
    sorted_chars = dict(by_frequency)
    print(f"Row {index}: {sorted_chars}")

<strong>Function to count symbols in a string</strong>

In [None]:
 #Assuming your DataFrame is already loaded as 'data'
# data = pd.read_csv('your_file.csv')

# Function to count symbols in a string
def count_symbols(text):
    """Return the number of characters in *text*."""
    symbol_total = len(text)
    return symbol_total

# Add a character-count column for the raw text and for the processed text
for source_column, count_column in (
    ('content', 'content_symbol_count'),
    ('processed_content', 'processed_content_symbol_count'),
):
    df[count_column] = df[source_column].apply(count_symbols)

# Columns shown side by side: both texts and their counts
columns_to_display = ['content', 'processed_content', 'content_symbol_count', 'processed_content_symbol_count']

# Print the whole selection; append .head(n) to limit the number of rows
print(df[columns_to_display])

<strong>Find capitalized words </strong>

In [None]:
def find_capitalized_words(text):
    """Return the words in *text* written with an initial capital only.

    A word qualifies when it consists solely of letters, starts with an
    uppercase letter and contains no further uppercase letters (for example
    'Turkse' or the single letter 'A'). Accented letters count as letters, so
    'Één' and 'België' are found; an ASCII-only ``[A-Z][a-z]*`` pattern
    silently skipped them.

    Args:
        text: String to scan.

    Returns:
        List of matching words in order of appearance.
    """
    return [
        word
        for word in re.findall(r'\w+', text)
        if word.isalpha()
        and word[0].isupper()
        and (len(word) == 1 or word[1:].islower())
    ]

# Report the capitalized words found in each row's processed text
for index, processed in df['processed_content'].items():
    capitalized_words = find_capitalized_words(processed)
    print(f"Row {index} capitalized words: {capitalized_words}")

<strong>Convert 'Datum' to datetime</strong>

In [None]:

# Parse 'Datum' as datetimes; errors='coerce' turns unparseable strings into NaT
parsed_dates = pd.to_datetime(df['Datum'], errors='coerce')
df['Datum'] = parsed_dates

# Keep only the rows whose year is not 1990
not_in_1990 = df['Datum'].dt.year != 1990
df = df[not_in_1990]

# Optional: persist the filtered frame (new file, or overwrite via file_path)
#df.to_csv('filtered_dataset.csv', index=False)

#print("Rows with 'datum' in 1990 have been removed and the dataset is saved.")

<strong>EXPLORE THE PROCESSED_CONTENT COLUMN</strong>

In [None]:
# Print the processed text of every row, one blank line between rows.
# NOTE(review): `data` is not defined in the cells above (they use `df`) — confirm
# it is created before this cell runs.
for index, processed in data['processed_content'].items():
    print(f"Row {index}: {processed}\n")

In [None]:
# Index label of the row to print (RAW DATA)
row_index = 42  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'content'])
else:
    print(f"Row index {row_index} is out of range.")

In [None]:
# Index label of the row to print (PROCESSED DATA)
row_index = 42  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'processed_content'])
else:
    print(f"Row index {row_index} is out of range.")

<strong>The OCR mistakes which cause misinterpretation of 'Content'</strong>

In [None]:
# Index label of the row to print (RAW DATA)
row_index = 632  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'content'])
else:
    print(f"Row index {row_index} is out of range.")

<strong>AN ARTICLE TEXT WITH EXAMPLE OF OCR MISTAKES</strong>

In [None]:
# Index label of the row to print (RAW DATA)
row_index = 346  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'content'])
else:
    print(f"Row index {row_index} is out of range.")

## Supershort news item 
Since numbers are removed during cleaning, we need to determine whether the cleaning rules should be changed or whether such constructs are unimportant for our research.

In [None]:
# Index label of the row to print (RAW DATA)
row_index = 295  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'content'])
else:
    print(f"Row index {row_index} is out of range.")

In [None]:
# Index label of the row to print (RAW DATA)
row_index = 40  # Replace with the index of the row you're interested in

# .loc looks rows up by index label, so test label membership. Comparing with
# len(data) only checks a position: once rows have been filtered out it can let
# a missing label through (KeyError) or reject a label that does exist.
if row_index in data.index:
    print(f"Text in row {row_index}:")
    print(data.loc[row_index, 'content'])
else:
    print(f"Row index {row_index} is out of range.")

## Symbol count

In [None]:
 #Function to count symbols in each row
def count_symbols(text):
    """Return the number of characters in *text*."""
    symbol_total = len(text)
    return symbol_total

# Store the character count of every processed article in 'symbol_count'
data['symbol_count'] = data['processed_content'].apply(count_symbols)

# Report the count row by row
for index, symbol_count in data['symbol_count'].items():
    print(f"Row {index} has {symbol_count} symbols in processed_content.")

# `data` now carries the 'symbol_count' column; uncomment to persist it
# data.to_csv('your_dataset_with_symbol_count.csv', index=False)


<strong>FILTER THE ROWS WITH LESS THAN 200 SYMBOLS IN PROCESSED_CONTENT</strong>

In [None]:
# Select the rows whose processed text has fewer than 200 characters
below_threshold = data['symbol_count'] < 200
filtered_data = data[below_threshold]

# List the processed text of the selected rows
print("Processed Content with Less Than 200 Symbols:")
for index, processed in filtered_data['processed_content'].items():
    print(f"Row {index}: {processed}\n")


 ###  <strong>Capitalized letters check after cleaning and preprocessing</strong>

In [None]:
# Alias only: `df` now refers to the same DataFrame object as `data` (no copy is made)
df=data

def find_capitalized_words(text):
    """Return the words in *text* written with an initial capital only.

    A word qualifies when it consists solely of letters, starts with an
    uppercase letter and contains no further uppercase letters (for example
    'Turkse' or the single letter 'A'). Accented letters count as letters, so
    'Één' and 'België' are found; an ASCII-only ``[A-Z][a-z]*`` pattern
    silently skipped them.

    Args:
        text: String to scan.

    Returns:
        List of matching words in order of appearance.
    """
    return [
        word
        for word in re.findall(r'\w+', text)
        if word.isalpha()
        and word[0].isupper()
        and (len(word) == 1 or word[1:].islower())
    ]

# Report the capitalized words found in each row's processed text
for index, processed in df['processed_content'].items():
    capitalized_words = find_capitalized_words(processed)
    print(f"Row {index} capitalized words: {capitalized_words}")

In [None]:
def find_capitalized_and_special_words(text):
    """Classify the letter-only words of *text* by capitalization pattern.

    Letters are recognized with str methods rather than ASCII ranges, so
    accented words such as 'België' or 'ÉÉN' are classified too.

    Args:
        text: String to scan.

    Returns:
        Tuple ``(start_cap_words, camel_case_words, uppercase_words)`` of
        lists, each in order of appearance:
        start_cap_words  – initial capital followed only by lowercase letters;
        camel_case_words – lowercase first letter with a capital further on;
        uppercase_words  – written entirely in capitals.
        A single capital letter appears in both the first and the last list.
    """
    start_cap_words, camel_case_words, uppercase_words = [], [], []
    for word in re.findall(r'\w+', text):
        # Words containing digits or underscores belong to none of the groups.
        if not word.isalpha():
            continue
        if word.isupper():
            uppercase_words.append(word)
        if word[0].isupper():
            if len(word) == 1 or word[1:].islower():
                start_cap_words.append(word)
        elif word[0].islower() and any(char.isupper() for char in word):
            camel_case_words.append(word)
    return start_cap_words, camel_case_words, uppercase_words

# Classify the words of every processed article and store each group in its own column
result_columns = ['start_cap_words', 'camel_case_words', 'uppercase_words']
word_groups = data['processed_content'].apply(
    lambda processed: pd.Series(find_capitalized_and_special_words(processed))
)
data[result_columns] = word_groups

# Show the processed text next to the three word groups
print(data[['processed_content'] + result_columns])


### DEFINE KEY WORDS IN PROCESSED CONTENT

In [None]:
# Define the key words in 'processed_content' and print the filtered rows and their numbers
# Nothing is loaded here: `df` becomes another name for the DataFrame `data` (no copy)
df = data

# Key words to look for in the processed text
words_of_interest = [
    'arbeid',
    'turkse',
    'turk',
    'marokkaanse',
    'marokkaans',
    'vrouw',
    'meisje',
    'dame',
]


def contains_word_of_interest(processed_content):
    """Return True when any entry of words_of_interest occurs in *processed_content*.

    The test is a plain substring check, so a key word also matches inside a
    longer word.
    """
    for word in words_of_interest:
        if word in processed_content:
            return True
    return False

# Keep only the rows whose processed text mentions a word of interest
filtered_df = df[df['processed_content'].apply(contains_word_of_interest)]

# Print the processed text of the matching rows with their row label.
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (index, value) pairs and also exists in older versions.
for index, processed_content in filtered_df['processed_content'].items():
    print(f"Row {index}: {processed_content}\n")


<strong>Preprocess 'Titel' column</strong>

In [None]:
# Preprocess 'Titel' column + save the cleaned_dataset

# Nothing is loaded here: `df` becomes another name for the DataFrame `data` (no copy)
df = data

# Function to clean text
def clean_text(text):
    """Lowercase *text* and keep only its letters, separated by single spaces.

    Args:
        text: Title string to clean.

    Returns:
        The lowercased text in which every run of non-letter characters
        (punctuation, digits, underscores, whitespace) is replaced by one
        space, with leading and trailing spaces removed.
    """
    text = text.lower()
    # [\W\d_] matches everything that is not a letter, so accented Dutch
    # letters (é, ë, ï, ó, ...) are kept. The previous class allowed only a-z
    # plus ä ö ü ß and turned e.g. "één" into "n".
    text = re.sub(r'[\W\d_]+', ' ', text)
    return text.strip()

# Clean every title (cells are coerced to str first)
titles = df['Titel'].astype(str)
df['Titel'] = titles.apply(clean_text)

# Write the frame with the cleaned titles to a new CSV file, without the index column
df.to_csv('cleaned_newspaper_articles.csv', index=False)


<strong>DEFINE KEY WORDS IN 'TITEL' COLUMN</strong>

In [None]:

# Nothing is loaded here: `data` becomes another name for the DataFrame `df` (no copy)
data = df

# Key words to look for in the titles
words_of_interest = [
    'arbeid',
    'turkse',
    'turk',
    'marokkaanse',
    'marokkaans',
    'vrouw',
    'meisje',
    'dame',
]


def contains_word_of_interest(Titel):
    """Return True when any entry of words_of_interest occurs in *Titel*.

    The test is a plain substring check, so a key word also matches inside a
    longer word.
    """
    for word in words_of_interest:
        if word in Titel:
            return True
    return False

# Keep only the rows whose title mentions a word of interest
filtered_df = data[data['Titel'].apply(contains_word_of_interest)]

# Print the title of the matching rows with their row label.
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (index, value) pairs and also exists in older versions.
for index, Titel in filtered_df['Titel'].items():
    print(f"Row {index}: {Titel}\n")

In [None]:
# Load your dataframe
# df = pd.read_csv('your_file.csv')

# Words whose share is compared between the raw and the processed text.
# Spelling follows the words_of_interest lists used for the keyword filters:
# 'marokkaanse' (was misspelled 'marokkanse') and 'turkse' without the
# trailing space, which prevented matches at the end of a text or before
# punctuation.
words = ['turkse', 'marokkaanse', 'vrouw']

# Occurrences of every word, summed over all rows, per text column.
# Series.str.count is case-sensitive, so capitalized forms in the raw
# 'content' column are not counted.
word_counts = {word: {'content': 0, 'processed_content': 0} for word in words}
for word in words:
    for column in ('content', 'processed_content'):
        word_counts[word][column] = df[column].str.count(word).sum()

# Total number of whitespace-separated words per column. Series.str.len()
# gives NaN for missing cells (ignored by sum) where len() would raise.
total_words_content = df['content'].str.split().str.len().sum()
total_words_processed_content = df['processed_content'].str.split().str.len().sum()


def _share(count, total):
    """Return *count* as a percentage of *total*, or 0.0 when total is zero."""
    return (count / total) * 100 if total else 0.0


# Percentage of all words in each column that each word accounts for
word_percentages = {word: {'content': _share(count['content'], total_words_content),
                           'processed_content': _share(count['processed_content'],
                                                       total_words_processed_content)}
                    for word, count in word_counts.items()}

# Print the results
for word, percentages in word_percentages.items():
    print(f"Word: {word}")
    print(f"Percentage in 'content': {percentages['content']:.2f}%")
    print(f"Percentage in 'processed_content': {percentages['processed_content']:.2f}%\n")

## Concordance

In [None]:
import nltk
from nltk.text import Text

# Load your dataframe
# df = pd.read_csv('your_file.csv')

# Target words for the concordance
words_of_interest = ['arbeid', 'man', 'vrouw']

# Split every processed article on whitespace, keeping both the per-row token
# lists and one flat list of all tokens in corpus order
tokenized_contents = []
all_tokens = []
for content in df['processed_content']:
    row_tokens = content.split()
    tokenized_contents.append(row_tokens)
    all_tokens.extend(row_tokens)

# Wrap the flat token list in an NLTK Text object
nltk_text = Text(all_tokens)

# Desired amount of context, in words, on each side of the target word
context_window = 10  # Adjust as needed

# Text.concordance() takes `width` as the line width in characters, not words
# (passing context_window*2 gave 20-character lines with almost no context).
# The word window is therefore converted with a per-word character budget.
# NOTE(review): 8 characters per word (including the space) is an estimate —
# tune it for your corpus.
_CHARS_PER_WORD = 8


def print_concordance(word):
    """Print up to 15 concordance lines for *word* from nltk_text.

    Args:
        word: Target word to look up.
    """
    print(f"Concordance for '{word}':")
    line_width = 2 * context_window * _CHARS_PER_WORD + len(word)
    # `lines` caps how many matches are shown per word
    nltk_text.concordance(word, width=line_width, lines=15)
    print("\n")


# Print the concordance of every word of interest
for word in words_of_interest:
    print_concordance(word)

### Option 2 Using tokenizer from BERTje

In [None]:
# Cleaning using tokenizer from BERTje
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from transformers import AutoTokenizer

# Fetch the NLTK resources used below (tokenizer models and stop-word lists)
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Project-specific stop words added on top of NLTK's Dutch list.
# NOTE(review): 'kommen' may be intended as 'komen' — confirm.
additional_stop_words = ['één', 'kommen', 'zeggen','zullen','moeten','gaan', 'wij']  # Add your stop words here
dutch_stop_words = set(stopwords.words('dutch')) | set(additional_stop_words)

# spaCy Dutch pipeline, used for lemmatization in preprocess_text
nlp = spacy.load('nl_core_news_lg')

# BERTje tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

# Articles to process
df = pd.read_csv('C:/Users/xx/Downloads/200_random_artikelen.csv')  # Replace with your file path

# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize, lemmatize and stop-word-filter one article.

    Args:
        text: Raw cell value. Anything that is not None/NaN is converted
            with str(); None and NaN yield an empty string.

    Returns:
        The lemmas that are not in dutch_stop_words, joined by single spaces.
    """
    # A missing cell would otherwise be processed as the literal word
    # "none"/"nan" and end up in the cleaned text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).replace("\r\n", " ").strip().lower()

    # Punctuation becomes a space so neighbouring words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove every digit. The former date / stand-alone-number / '00' passes
    # ran after this step and could never match, so they are gone.
    text = re.sub(r'[0-9]', '', text)
    # Lookarounds also catch adjacent single letters ("a b c") and letters at
    # the start or end, which the whitespace-consuming pattern missed.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # Collapse the gaps so spaCy does not emit whitespace-only tokens.
    text = re.sub(r'\s+', ' ', text).strip()

    # Lemmatize with spaCy, then drop stop words (compared on the lemma).
    lemmas = [token.lemma_ for token in nlp(text) if not token.is_space]
    return ' '.join(lemma for lemma in lemmas if lemma not in dutch_stop_words)

# Clean every article ('content' is the raw-text column; rename if yours differs)
raw_column = df['content']
df['cleaned_text'] = raw_column.apply(preprocess_text)

# Encode each cleaned article with the BERTje tokenizer, special tokens included;
# the result is stored per row in 'tokenized_text'
df['tokenized_text'] = df['cleaned_text'].map(
    lambda cleaned: tokenizer.encode(cleaned, add_special_tokens=True)
)

 ### Drop the 'Unnamed: 0' column

In [None]:
# Discard 'Unnamed: 0' when present (likely the old index written to the CSV)
has_old_index = 'Unnamed: 0' in df.columns
if has_old_index:
    df.drop(columns=['Unnamed: 0'], inplace=True)

# When there is a 'content' column, keep only rows whose text (as str) has at least 200 characters
if 'content' in df.columns:
    long_enough = df['content'].map(lambda value: len(str(value)) >= 200)
    df = df[long_enough]

# Renumber the remaining rows from 0
df = df.reset_index(drop=True)

# Write the filtered, renumbered frame to disk
cleaned_dataset_path = "C:/Users/xx/Downloads/200_random_artikelen_filtered.csv"  # Replace with your desired output path
df.to_csv(cleaned_dataset_path, index=False)

# Optional: show the first rows to verify the changes
print(df.head())

### Count the number of symbols 

In [None]:
# Number of characters in each row of 'cleaned_text' (values are coerced to str first)
df['Symbol_Count'] = df['cleaned_text'].map(lambda cleaned: len(str(cleaned)))

# Show the cleaned text next to its length
print(df[['cleaned_text', 'Symbol_Count']])

### Option 3 Using retokenization and calculating the percentages

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy
import re
import string

# Download the NLTK resources used by clean_text
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# spaCy Dutch pipeline used by clean_text
nlp = spacy.load("nl_core_news_lg")

# Function to clean text using NLTK and spaCy for Dutch
def clean_text(text):
    """Clean one Dutch article and report how many tokens spaCy and NLTK each kept.

    Args:
        text: Raw article text. Non-string values (for example NaN from an
            empty cell) yield ``("", 0, 0)``.

    Returns:
        Tuple ``(cleaned_text, spacy_token_count, nltk_token_count)``:
        the space-joined cleaned tokens, the number of tokens kept by the
        spaCy filter and the number kept by the NLTK filter.
    """
    if not isinstance(text, str):
        return "", 0, 0

    # Normalize: lowercase and delete punctuation. re.escape keeps characters
    # such as ] \ ^ - literal inside the character class.
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # spaCy view: drop its stop words, punctuation and whitespace-only tokens
    # (the latter are one character long and would be glued together by
    # retokenize as if they were pieces of a broken word).
    doc = nlp(text)
    spacy_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    # NLTK view: Dutch tokenizer, as used elsewhere in this notebook; the
    # stop-word list is turned into a set for O(1) membership tests.
    dutch_stopwords = set(stopwords.words('dutch'))
    nltk_tokens = [
        word
        for word in nltk.word_tokenize(text, language='dutch')
        if word not in dutch_stopwords and word.isalnum()
    ]

    # Previously both token lists were concatenated, so every word appeared
    # twice, and the result was joined with '' so no word boundaries survived.
    # The output is now built from the spaCy tokens that also pass the NLTK
    # stop-word list, joined by spaces.
    # NOTE(review): this is one reading of "combine, deduplicate" — confirm.
    kept_tokens = [token for token in spacy_tokens if token not in dutch_stopwords]
    cleaned_text = ' '.join(retokenize(kept_tokens))

    return cleaned_text, len(spacy_tokens), len(nltk_tokens)

# Function to recompose the words, remove the spaces, and put tokens in lower letters
def retokenize(tokenized_text):
    """Lowercase the tokens and glue runs of one-character tokens into one word.

    Consecutive tokens of length 1 are treated as the pieces of a broken word
    and concatenated; every longer token is emitted on its own.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        New list of lowercased tokens with the single-character runs merged.
    """
    merged = []
    pending = []  # lowercased single-character tokens of the current run
    for raw in tokenized_text:
        lowered = raw.lower()
        if len(raw) == 1:
            pending.append(lowered)
            continue
        # A longer token ends the current run, if any
        if pending:
            merged.append("".join(pending))
            pending = []
        merged.append(lowered)
    # Flush a run that reaches the end of the input
    if pending:
        merged.append("".join(pending))
    return merged

# Load the articles
df = pd.read_csv('C:/Users/xx/Downloads/Artikelen_Sanders.csv')

# Clean every article; each result is (cleaned text, spaCy count, NLTK count)
results = df['content'].apply(clean_text)
df['processed_content'] = [cleaned for cleaned, _, _ in results]
df['spacy_word_count'] = [spacy_count for _, spacy_count, _ in results]
df['nltk_word_count'] = [nltk_count for _, _, nltk_count in results]

# Share of each tokenizer in the combined token count, in percent
token_total = df['spacy_word_count'] + df['nltk_word_count']
df['spacy_percentage'] = df['spacy_word_count'] / token_total * 100
df['nltk_percentage'] = df['nltk_word_count'] / token_total * 100

# Write the result to CSV without the index column
df.to_csv('W_processed.csv', index=False)

### Check the processed content

In [None]:

# Optionally reload the processed dataset first
#df = pd.read_csv('your_dataset_processed.csv')

# Fetch the processed text of the fifth row: positions are 0-based, so it sits
# at position 4 (change the number to inspect another row)
row_content = df['processed_content'].iat[4]

print(row_content)

### Further steps 

In [None]:
import pandas as pd
import spacy

# Load spaCy's large Dutch pipeline once at module level; tokenize_text() below uses it
nlp = spacy.load("nl_core_news_lg")

# Function to tokenize text using spaCy
def tokenize_text(text):
    """Split *text* into spaCy tokens.

    Args:
        text: Text to tokenize. Non-string values yield an empty list; this
            covers cells that are missing after the CSV round trip
            (presumably empty processed texts read back as NaN — verify).

    Returns:
        List with the text of every token produced by the nlp pipeline.
    """
    if not isinstance(text, str):
        return []
    return [token.text for token in nlp(text)]

# Read the processed articles
df = pd.read_csv('C:/Users/xx/W_processed.csv')

# Tokenize 'processed_content'; every cell of 'tokenized_content' holds a list of tokens
processed_column = df['processed_content']
df['tokenized_content'] = processed_column.apply(tokenize_text)

# Add a space-joined string version of each token list, which is easier to store in a CSV
df['tokenized_content_str'] = df['tokenized_content'].apply(' '.join)

# Write the extended frame to a new CSV file, without the index column
df.to_csv('W1_tokenized.csv', index=False)
