Combine Postimees Articles into a Single File

In [None]:
import pandas as pd
import os

folder = "postimees_sections"
output_file = "postimees_combined.csv"

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the row order of the combined file reproducible across runs and machines.
csv_files = sorted(name for name in os.listdir(folder) if name.endswith(".csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no CSV files.
if not csv_files:
    raise ValueError(f"No .csv files found in {folder!r}")

# Load only the url, hrefs and content columns from every file.
dfs = [
    pd.read_csv(os.path.join(folder, name), usecols=["url", "hrefs", "content"])
    for name in csv_files
]

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv(output_file, index=False)

print(f"Combined {len(dfs)} files into {output_file}")


Combine Delfi Articles into a Single File

In [None]:
from pathlib import Path

def combine_and_identify_unique_content(directories_with_files, output_file):
    """Merge per-section CSV files, de-duplicate on ``id`` and report conflicts.

    For every ``directory -> [file stems]`` entry, ``<directory>/<stem>.csv`` is
    read if it exists. All rows are concatenated, the first row of each ``id``
    is written to ``output_file``, and rows that share an ``id`` but differ in
    at least one column are printed.

    Args:
        directories_with_files: Mapping of directory path to a list of CSV file
            names without the ``.csv`` extension.
        output_file: Path of the de-duplicated CSV to write.

    Returns:
        dict with keys ``output_file``, ``total_rows`` (rows before
        de-duplication), ``unique_rows`` (rows after de-duplication) and
        ``duplicates_with_unique_content_count`` (rows whose ``id`` is repeated
        with differing content).
    """
    frames = []
    for directory, files in directories_with_files.items():
        for file_name in files:
            file_path = Path(directory) / f"{file_name}.csv"
            if file_path.exists():
                frames.append(pd.read_csv(file_path))
            else:
                # Still best-effort, but make the gap visible to the reader.
                print(f"Skipping missing file: {file_path}")

    if frames:
        # Concatenate once; growing a frame inside the loop copies every
        # previously loaded row on each iteration.
        combined_df = pd.concat(frames, ignore_index=True)
    else:
        # No input found: keep an 'id' column so the steps below still work
        # instead of failing with KeyError: 'id'.
        combined_df = pd.DataFrame(columns=['id'])

    duplicates = combined_df[combined_df.duplicated(subset='id', keep=False)]
    if duplicates.empty:
        unique_content_duplicates = duplicates
    else:
        # Keep id groups in which at least one column holds more than one
        # distinct value. NOTE(review): nunique() ignores NaN, so a NaN-vs-value
        # difference is not counted as differing content - confirm intended.
        unique_content_duplicates = duplicates.groupby('id').filter(
            lambda group: not (group.nunique() == 1).all()
        )

    deduplicated_df = combined_df.drop_duplicates(subset='id', keep='first')
    deduplicated_df.to_csv(output_file, index=False)

    print("Duplicates with unique content:")
    print(unique_content_duplicates)

    return {
        "output_file": output_file,
        "total_rows": len(combined_df),
        "unique_rows": len(deduplicated_df),
        "duplicates_with_unique_content_count": len(unique_content_duplicates)
    }

# CSV file stems (without the .csv extension) keyed by the directory to read them from.
directories_with_files = dict(
    delfi_sections=["arvamus", "eesti", "jalgpall", "kultuur", "maailm", "korvpall", "digi", "teadus"],
    delfi_topics=["koroona"],
)

# Merge everything into one de-duplicated file and show the summary dict.
result = combine_and_identify_unique_content(
    directories_with_files=directories_with_files,
    output_file="delfi_combined.csv",
)
result

Combine ERR, Postimees, and Delfi Article Files

In [None]:
output_file = 'all_combined.csv'

# Rebinds dfs to an empty list; it is not used again in this cell.
dfs = []

# Read the same three columns from each source file, in this order.
postimees_df, delfi_df, err_df = (
    pd.read_csv(source_path, usecols=["url", "hrefs", "content"])
    for source_path in ('postimees_combined.csv', 'delfi_combined.csv', 'err_100_000.csv')
)

# Stack the three frames under a fresh 0..n-1 index and write them out.
all_combined_df = pd.concat([postimees_df, delfi_df, err_df], ignore_index=True)
all_combined_df.to_csv(output_file, index=False)

In [None]:
# Reload the combined file, set originalArticle = 1 on every row, and overwrite the file
df = pd.read_csv("all_combined.csv").assign(originalArticle=1)
df.to_csv("all_combined.csv", index=False)

In [None]:
import pandas as pd
import re
from urlextract import URLExtract

def filter_hrefs(file_path, output_file):
    """Extract ERR / Postimees / Delfi article links from the ``hrefs`` column.

    Reads ``file_path`` as CSV, drops rows whose ``hrefs`` value is missing or
    the literal string ``'[]'``, extracts URLs from the remaining values with
    ``URLExtract`` and keeps only URLs matching one of the patterns below.
    Rows left without any matching URL are dropped.

    Args:
        file_path: CSV file containing at least ``url`` and ``hrefs`` columns.
        output_file: Path of the CSV to write the result to.

    Returns:
        DataFrame with the columns ``url`` and ``extracted_hrefs`` (a list of
        matching URLs per row); the same data is written to ``output_file``.
    """
    # Create an instance of URLExtract
    extractor = URLExtract()

    # Define regex patterns for valid URLs
    patterns = [
        re.compile(r'err\.ee/\d+'),  # Matches 'err.ee' followed by a numeric ID
        re.compile(r'postimees\.ee/\d+'),  # Matches 'postimees.ee' followed by a numeric ID
        re.compile(r'delfi\.ee/artikkel/\d+')  # Matches 'delfi.ee/artikkel/' followed by a numeric ID
    ]

    # Load the CSV file
    df = pd.read_csv(file_path, dtype={'hrefs': str})

    # Ensure only non-null and non-empty hrefs are processed.
    # .copy() makes the filtered frame independent of the loaded one, so the
    # column assignment below does not trigger SettingWithCopyWarning.
    df = df[df['hrefs'].notna() & (df['hrefs'] != '[]')].copy()

    def extract_and_filter_urls(text):
        """Return the URLs found in ``text`` that match any article pattern."""
        return [
            url
            for url in extractor.find_urls(text)
            if any(pattern.search(url) for pattern in patterns)
        ]

    df['extracted_hrefs'] = df['hrefs'].apply(extract_and_filter_urls)

    # Keep rows with at least one extracted URL, and only the two output columns
    df = df.loc[df['extracted_hrefs'].map(len) > 0, ['url', 'extracted_hrefs']]

    # Save the processed DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    return df

# Example usage: filter the combined file, write all_filtered.csv, and print the result
filtered_df = filter_hrefs(file_path="all_combined.csv", output_file="all_filtered.csv")
print(filtered_df)


Correct Delfi URLs

In [None]:
import pandas as pd

# Function to modify Delfi URLs
def modify_delfi_url(url):
    """Rewrite a www.delfi.ee URL to the ``/artikkel/<last segment>`` form.

    A URL is rewritten only if it contains ``https://www.delfi.ee/`` and does
    not already contain ``/artikkel/``. The last non-empty path segment is
    reused as the article part of the new URL.

    Args:
        url: The URL to normalize. Non-string values (e.g. NaN from a CSV
            column) are returned unchanged.

    Returns:
        The rewritten URL, or ``url`` itself when no rewrite applies.
    """
    # Missing values read from CSV arrive as float NaN; `in` would raise on them.
    if not isinstance(url, str):
        return url

    prefix = "https://www.delfi.ee/"
    if prefix in url and "/artikkel/" not in url:
        # Look only at what follows the host and ignore a trailing slash, so
        # '.../eesti/123/' yields '123' rather than an empty segment.
        path = url.partition(prefix)[2].rstrip("/")
        last_segment = path.rsplit("/", 1)[-1]
        if not last_segment:
            # Nothing to build an article URL from (e.g. the bare home page).
            return url
        return f"https://www.delfi.ee/artikkel/{last_segment}"
    return url

# Read the filtered link table
df = pd.read_csv('all_filtered.csv')

# Replace the 'url' column with the normalized Delfi URLs
df = df.assign(url=df['url'].map(modify_delfi_url))

# Write the result to a separate file, leaving all_filtered.csv untouched
df.to_csv('all_filtered_modified_delfi.csv', index=False)

