In [1]:
#!pip install pandas spacy swifter
import pandas as pd
import spacy
import swifter
import ast
import json
import numpy as np

In [2]:
#python -m spacy download da_core_news_sm
nlp = spacy.load("da_core_news_sm")

def split_paragraph(paragraph: str):
    doc = nlp.pipe([paragraph], batch_size=1, n_process=1)
    return [sent.text.strip() for sent in list(doc)[0].sents]


def danish_sentences_extraction(data, text_column):

    sentences = {}

    for para in data.index:
        text_sentences = data.loc[para][text_column]
        
        blame_sentence_dict = {}
        for indx in range(len(text_sentences)):
            blame_sentence = text_sentences[indx]
            blame_sentence_dict[indx] = blame_sentence
        
        
        sentences[para] = blame_sentence_dict

    return sentences


#------------------------------------------#
def convert_to_json_and_write(file_name, sentences):
    with open(f'/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json', 'w') as file:
        file.write(json.dumps(sentences, indent=4))



    #preprocess data for label studio
    # Load your data
    with open(f'/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json', "r", encoding="utf-8") as f:
        data = json.load(f)

    flattened = []

    for paragraph, sentences in data.items():
        for sentence_nr, text in sentences.items():
            flattened.append({
                "paragraph": paragraph,
                "sentence_nr": sentence_nr,
                "text": text
            })

    # Save in a format Label Studio can import
    with open(f'/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json', "w", encoding="utf-8") as f:
        json.dump(flattened, f, ensure_ascii=False, indent=2)

    return


#final data is csv file as output of PolDebate model

def json_append_meta_data(file_name, data):
    meta_data = data[['Unnamed: 0','speaker','party']]
    meta_data = meta_data.replace({np.nan: None})
    #meta_data.head()


    #connect label studio data with meta data


    # Load the flattened sentence JSON
    with open(f"/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json", "r", encoding="utf-8") as f:
        sentences = json.load(f)

    # Load metadata
    meta = meta_data

    # Convert metadata to dict for fast lookup
    meta_dict = meta.set_index("Unnamed: 0").to_dict(orient="index")

    # Merge
    for item in sentences:
        paragraph = int(item["paragraph"])
        if paragraph in meta_dict:

            item.update(meta_dict[paragraph])

    # Save merged dataset
    with open(f'/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json', "w", encoding="utf-8") as f:
        json.dump(sentences, f, ensure_ascii=False, indent=2)


    return




#find parties in government by dataset
def find_government(gov_data, date):
    match = gov_data[(gov_data["Start Date"] <= date) & (gov_data["End Date"] >= date)]
    if not match.empty:

        parties = match['Party Letter']
        parties = ast.literal_eval(parties.iloc[0])
        
        return parties
    else:
        print('empty')
        return None


#append context and government related data to the json file
def json_government_and_context(file_name, data, government_data):

    # Load the flattened sentence JSON
    with open(f"/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json", "r", encoding="utf-8") as f:
        json_data = json.load(f)

    for entry in json_data:
        paragraph_nr = int(entry['paragraph'])
        sentence_nr = int(entry['sentence_nr'])

        #initialize information on current paragraph
        temp_data = data.loc[paragraph_nr]

        #Find out if speaker is member of party in Government
        party = entry['party']
        date = temp_data['date']
        parties_gov = find_government(government_data, date)
        if party in parties_gov:
            in_gov = True
        else:
            in_gov = False
        
        #make into dict

        context_dict = {'current_speaker_in_government': in_gov,
                        'parties_in_government': parties_gov,
                        'date': str(date)
                        }

        #update
        entry.update(context_dict)

    #save data
    with open(f"/work/MarkusLundsfrydJensen#1865/inferece_data/{file_name}.json", "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    return


import json

def preprocess_json(input_path, output_path=None):
    """
    Preprocesses a JSON file by filtering out entries based on the 'text' key.
    
    Criteria for deletion:
      - 'text' is missing or empty
      - 'text' length is <= 3
      - 'text' contains '(' or ')'
    
    Parameters:
        input_path (str): Path to the input JSON file.
        output_path (str, optional): If provided, saves the filtered JSON here.
    
    Returns:
        list: The filtered list of JSON entries.
    """
    # Load JSON file
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter entries
    filtered_data = [
        entry for entry in data
        if 'text' in entry
        and entry['text']
        and len(entry['text']) > 3
        and '(' not in entry['text']
        and ')' not in entry['text']
    ]

    # Optionally save to a new file
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, ensure_ascii=False, indent=4)

    return filtered_data

    


In [3]:
#Fist create inference dataset

df = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/Corp_Folketing_V2.csv")

# Convert the column to datetime (specify format if necessary)
df["date"] = pd.to_datetime(df["date"], errors='coerce', dayfirst=True)

# Filter rows after January 1, 2000
filtered_df = df[df["date"] > "2000-01-01"]

#filtered_df.pop("Unnamed: 0")

filtered_df.head()

Unnamed: 0.1,Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
36314,36314,2000-11-01,1) Spørgsmål til ministrene.,2,Henning Grove,,,True,181,Til at besvare spørgsmål i spørgetimen i dag h...,DK-Folketing,DNK
36315,36315,2000-11-01,1) Spørgsmål til ministrene.,3,Frank Dahlgaard,UP,,False,187,Ministeren er jo ikke bare minister for fødeva...,DK-Folketing,DNK
36316,36316,2000-11-01,1) Spørgsmål til ministrene.,4,Henrik Dam Kristensen,S,379.0,False,175,Først vil jeg gerne sige til hr. Frank Dahlgaa...,DK-Folketing,DNK
36317,36317,2000-11-01,1) Spørgsmål til ministrene.,5,Frank Dahlgaard,UP,,False,173,"Ja, vi har en stor eksport, men ministeren er ...",DK-Folketing,DNK
36318,36318,2000-11-01,1) Spørgsmål til ministrene.,6,Henrik Dam Kristensen,S,379.0,False,138,"Hr. Frank Dahlgaard har jo fuldstændig ret i, ...",DK-Folketing,DNK


In [None]:
import ipywidgets
filtered_df['segmented_text'] = filtered_df['text'].swifter.apply(split_paragraph)



Pandas Apply:   0%|          | 0/293615 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['segmented_text'] = filtered_df['text'].swifter.apply(split_paragraph)


In [None]:
#extract dish sentences and make into json 
# format using modified function and inspiration from prelim blame detection

In [None]:
all_sentences = danish_sentences_extraction(filtered_df, text_column = 'segmented_text')

#all_sentences

20
6
6
6
4
3


TypeError: object of type 'float' has no len()

In [None]:

convert_to_json_and_write(file_name = 'inference_data',sentences = all_sentences)

In [None]:

json_append_meta_data(file_name = 'inference_data', data = filtered_df)

In [4]:
#get government data
regerings_data = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/danish_govs.csv")

#make date related columns into datetime objects
regerings_data["Start Date"] = pd.to_datetime(regerings_data["Start Date"], format="%Y-%m-%d")
regerings_data["End Date"]   = pd.to_datetime(regerings_data["End Date"], format="%Y-%m-%d")

In [5]:
json_government_and_context(file_name = 'inference_data', data = filtered_df, government_data = regerings_data)


In [6]:
#DO some cleanup

preprocess_json(input_path = "/work/MarkusLundsfrydJensen#1865/inferece_data/inference_data.json", output_path="/work/MarkusLundsfrydJensen#1865/inferece_data/preprocessed_inference_data.json")



[{'paragraph': '36314',
  'sentence_nr': '0',
  'text': 'Til at besvare spørgsmål i spørgetimen i dag har  statsministeren udpeget fødevareministeren.',
  'speaker': 'Henning Grove',
  'party': None,
  'current_speaker_in_government': False,
  'parties_in_government': ['S', 'RV'],
  'date': '2000-11-01 00:00:00'},
 {'paragraph': '36314',
  'sentence_nr': '1',
  'text': 'Der er indtil dette øjeblik tilmeldt fem hovedspørgere  til fødevareministeren til denne spørgetime.',
  'speaker': 'Henning Grove',
  'party': None,
  'current_speaker_in_government': False,
  'parties_in_government': ['S', 'RV'],
  'date': '2000-11-01 00:00:00'},
 {'paragraph': '36314',
  'sentence_nr': '2',
  'text': 'Er der flere,  der ønsker at tilmelde sig som hovedspørgere?',
  'speaker': 'Henning Grove',
  'party': None,
  'current_speaker_in_government': False,
  'parties_in_government': ['S', 'RV'],
  'date': '2000-11-01 00:00:00'},
 {'paragraph': '36314',
  'sentence_nr': '4',
  'text': 'Det  er ikke tilfælde