In [1]:
import pandas as pd
import spacy
from spacy.lang.nl.stop_words import STOP_WORDS
import re

# Download NL Core News LG model for Dutch language
try:
    nlp = spacy.load('nl_core_news_lg')
except:
    !python -m spacy download nl_core_news_lg


In [2]:

def clean_text(text):
    # Remove special characters, digits, stand alone letters, and non-word symbols
    cleaned_text = re.sub(r'[^a-zA-Z\s]|(?<!\w)\w(?!\w)', '', text)
    
    # Keep expressions like '24-jarige'
    cleaned_text = re.sub(r'\b(\d+-[a-zA-Z]+)\b', r'\1', cleaned_text)
    
    assert clean_text('24-jarige') == '24-jarige'
    assert clean_text('1-1-1990').strip() == ''
    
    return cleaned_text

In [3]:
# Cell 3: Function for lowercasing text

def lowercase_text(text):
    return text.lower()



In [4]:
# Cell 4 Test the lowercase_text function
assert lowercase_text('Hello World') == 'hello world'


In [5]:
# Cell 5 Function for tokenization
def tokenize_text(text):
    return [token.text for token in nlp(text)]


In [6]:
# Cell 6 Test the tokenize_text function with Dutch words
assert tokenize_text('Dit is een testzin.') == ['Dit', 'is', 'een', 'testzin', '.']

In [7]:
# Cell 7 Function for removing stop words
def remove_stopwords(tokens):
    return [token for token in tokens if token not in STOP_WORDS]


In [8]:
# Cell 8 Test the remove_stopwords function with Dutch words
assert remove_stopwords(['Dit', 'is', 'een', 'testzin', '.']) == ['Dit', 'testzin', '.']


In [9]:
# Cell 9  Function for lemmatization
def lemmatize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

In [10]:
# Cell 10 Test the lemmatize_text function
assert lemmatize_text('lopen') == ['lopen']

In [11]:
# Cell 11 Load input from file and remove useless rows
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path, index_col=0)
    df = df[df['content'].apply(len) >= 200]
    df = df[~df['Datum'].str.contains('1990')]
    return df

# Test the load_and_clean_data function with actual file path
df = load_and_clean_data('C:/Users/xx/Downloads/Artikelen_Sanders.csv')

# No assertion as it's a data preprocessing step


In [12]:
# Cell 12 Preprocess content and title
def preprocess_data(df):
    # Clean text in content and title columns
    df['clean_content'] = df['content'].apply(clean_text)
    df['clean_title'] = df['Titel'].apply(clean_text)
    
    # Lowercase text
    df['clean_content'] = df['clean_content'].apply(lambda x: x.lower())
    df['clean_title'] = df['clean_title'].apply(lambda x: x.lower())
    
    # Tokenization
    df['tokenized_content'] = df['clean_content'].apply(tokenize_text)
    df['tokenized_title'] = df['clean_title'].apply(tokenize_text)
    
    # Remove stop words
    df['tokenized_content'] = df['tokenized_content'].apply(remove_stopwords)
    df['tokenized_title'] = df['tokenized_title'].apply(remove_stopwords)
    
    # Lemmatization
    df['lemmatized_content'] = df['clean_content'].apply(lemmatize_text)
    df['lemmatized_title'] = df['clean_title'].apply(lemmatize_text)
    
    return df


In [13]:
# Cell 13 Save preprocessed data to file
def save_preprocessed_data(df, output_file):
    df.to_csv(output_file, index=False, encoding='utf-8'))


In [19]:
# Cell 14 Test the preprocessing pipeline
file_path = 'C:/Users/xx/Downloads/Artikelen_Sanders (1).csv'
output_file_path = 'C:/Users/xx/Desktop/PrS_Artikelen_Sanders.csv'



In [None]:
df = load_and_clean_data(file_path)
preprocessed_df = preprocess_data(df)
save_preprocessed_data(preprocessed_df, output_file_path)


In [None]:
preprocessed_df

In [28]:
# Cell 1: Function to check for stand-alone numbers in a DataFrame column

def check_stand_alone_numbers(preprocessed_df, column_name):
    stand_alone_numbers = []
    for index, text in enumerate(preprocessed_df[column_name]):
        numbers = re.findall(r'\b\d+\b', text)
        if numbers:
            stand_alone_numbers.append((index, numbers))
    return stand_alone_numbers

# Test the check_stand_alone_numbers function on 'clean_content' column
stand_alone_numbers_content = check_stand_alone_numbers(preprocessed_df, 'clean_content')
print("Stand-alone numbers in 'clean_content' column:", stand_alone_numbers_content)


Stand-alone numbers in 'clean_content' column: []


In [37]:
# Cell 1: Function to check for stand-alone numbers in a DataFrame column

def check_stand_alone_numbers(preprocessed_df, column_name):
    stand_alone_numbers = []
    for index, text in enumerate(preprocessed_df[column_name]):
        numbers = re.findall(r'\b\d+\b', text)
        if numbers:
            stand_alone_numbers.append((index, numbers))
    return stand_alone_numbers

# Test the check_stand_alone_numbers function on 'clean_content' column
stand_alone_numbers_content = check_stand_alone_numbers(preprocessed_df, 'clean_title')
print("Stand-alone numbers in 'clean_title' column:", stand_alone_numbers_content)


Stand-alone numbers in 'clean_title' column: []


In [36]:
# Cell 2: Function to check for stand-alone letters in a DataFrame column

def check_stand_alone_letters(preprocessed_df, column_name):
    stand_alone_letters = []
    for index, text in enumerate(preprocessed_df[column_name]):
        letters = re.findall(r'\b[A-Za-z]\b', text)
        if letters:
            stand_alone_letters.append((index, letters))
    return stand_alone_letters

# Test the check_stand_alone_letters function on 'clean_title' column
stand_alone_letters_title = check_stand_alone_letters(preprocessed_df, 'clean_title')
print("Stand-alone letters in 'clean_title' column:", stand_alone_letters_title)


Stand-alone letters in 'clean_title' column: [(266, ['n']), (437, ['n'])]


In [35]:
# Cell 2: Function to check for stand-alone letters in a DataFrame column

def check_stand_alone_letters(preprocessed_df, column_name):
    stand_alone_letters = []
    for index, text in enumerate(preprocessed_df[column_name]):
        letters = re.findall(r'\b[A-Za-z]\b', text)
        if letters:
            stand_alone_letters.append((index, letters))
    return stand_alone_letters

# Test the check_stand_alone_letters function on 'clean_title' column
stand_alone_letters_title = check_stand_alone_letters(preprocessed_df, 'clean_content')
print("Stand-alone letters in 'clean_content' column:", stand_alone_letters_title)


Stand-alone letters in 'clean_content' column: [(0, ['n', 'z', 'n', 'n', 'n', 'n', 'n', 'n', 'p', 'n', 'n']), (6, ['k']), (7, ['e']), (8, ['n', 'v']), (9, ['c', 'n', 'k', 'g', 's', 'n']), (14, ['o', 'o']), (15, ['o']), (24, ['n']), (25, ['n']), (29, ['n', 'n']), (35, ['s', 'e', 'n']), (42, ['a', 'e']), (45, ['l']), (47, ['l']), (48, ['g']), (55, ['n']), (58, ['r']), (63, ['n']), (72, ['n']), (74, ['n', 'n']), (75, ['y']), (79, ['n']), (80, ['e']), (83, ['l', 'r']), (87, ['e', 'v']), (88, ['n']), (89, ['s', 'i']), (90, ['e']), (94, ['n']), (99, ['e']), (102, ['n']), (107, ['e']), (109, ['n', 'n', 'e', 'e', 'f', 'e', 'f']), (111, ['e']), (123, ['n']), (125, ['e', 'e', 'i']), (135, ['e']), (139, ['n']), (146, ['e', 'd', 'n']), (147, ['n', 'n', 'n']), (151, ['n']), (153, ['n', 'n', 'c', 'z']), (154, ['n']), (159, ['t', 'e']), (160, ['p']), (161, ['n', 'n']), (170, ['n']), (174, ['e', 'd']), (178, ['n']), (182, ['n', 'n', 'n', 'm']), (183, ['n']), (187, ['n']), (188, ['k']), (190, ['p', 'n'

In [32]:
# Cell 3: Function to check for special characters in a DataFrame column

def check_special_characters(preprocessed_df, column_name):
    special_characters = []
    for index, text in enumerate(preprocessed_df[column_name]):
        characters = re.findall(r'[^\w\s]', text)
        if characters:
            special_characters.append((index, characters))
    return special_characters

# Test the check_special_characters function on 'clean_content' column
special_characters_content = check_special_characters(preprocessed_df, 'clean_content')
print("Special characters in 'clean_content' column:", special_characters_content)


Special characters in 'clean_content' column: []


In [34]:
# Cell 3: Function to check for special characters in a DataFrame column

def check_special_characters(df, column_name):
    special_characters = []
    for index, text in enumerate(df[column_name]):
        characters = re.findall(r'[^\w\s]', text)
        if characters:
            special_characters.append((index, characters))
    return special_characters

# Test the check_special_characters function on 'clean_content' column
special_characters_content = check_special_characters(preprocessed_df, 'clean_title')
print("Special characters in 'clean_title' column:", special_characters_content)


Special characters in 'clean_title' column: []


In [42]:
print("Exact row for 'clean_title' column:")
print(preprocessed_df.loc[266, 'clean_title'])

Exact row for 'clean_title' column:
blonde wervelwind


In [43]:
print("Exact row for 'Titel' column:")
print(preprocessed_df.loc[266, 'Titel'])

Exact row for 'Titel' column:
DE BLONDE WERVELWIND
