In [4]:
from datasets import load_dataset

def display_row_with_columns(dataset, row_index):
    """
    Print one record of a dataset as a header line followed by ``column: value`` lines.

    The function only needs ``len(dataset)``, ``dataset[row_index]`` and an
    ``items()`` method on the returned record, which is how a Hugging Face
    ``Dataset`` row (a dict) is consumed here.

    Parameters:
        dataset (Dataset): The Hugging Face dataset to read the record from.
        row_index (int): Zero-based position of the record to print. Negative
            values and values >= len(dataset) are rejected with a message.

    Returns:
        None. Everything is reported through ``print``; any exception raised
        while validating or reading the record is caught and printed as well.
    """
    try:
        if row_index < 0 or row_index >= len(dataset):
            # Outside [0, len(dataset) - 1]: explain the valid range and do nothing else.
            print(f"Error: Row index {row_index} is out of range. Valid range is 0 to {len(dataset) - 1}.")
        else:
            # Indexing with an integer yields a single record mapping column -> value.
            record = dataset[row_index]

            # Header first, then one line per column in the record's own order.
            print(f"Row {row_index}:")
            for column, value in record.items():
                print(f"{column}: {value}")

    except Exception as e:
        # Best-effort display helper: report the failure instead of raising.
        print(f"An error occurred: {e}")

# Load "Jinyan1/COLING_2025_MGT_multingual" via datasets.load_dataset and keep
# only its 'train' split; the result is bound to the module-level name `dataset`.
dataset = load_dataset("Jinyan1/COLING_2025_MGT_multingual")['train']

# Print the record at the chosen zero-based position, one "column: value" per line.
row_index = 23  # Change this to display a different row
display_row_with_columns(dataset, row_index)


Row 23:
id: 4af33192-6742-4694-b53a-bc2452697b81
source: mage
sub_source: hswag
lang: en
model: opt_350m
label: 1
text: [step] Don't just sign up just to be on the team. It doesn't matter about your skill, but your effort and willingness to contribute is more important. "Anyone who signed up for the draft was supposed to play a role in our success." If you're willing to give up your time, pay attention to what you do need help with at the end of each week instead of trying to win games over the weekend. 4 players to watch during Week 5 against the Rams: Jalen Ramsey (6-foot-1) vs. Matt Ryan (6-3), Bruce Davis (5-11), Derrick Henry (7-0), Tim Tebow (10). The Cowboys will have a lot of receivers battle it out between now and Sunday's showdown versus the Chargers - and they can ask several questions about their depth chart if they want to know where the team went wrong Wednesday night or Thursday morning.


In [49]:
import pandas as pd

def display_row_with_columns(csv_file, row_index):
    """
    Print a single row of a CSV file as a header line plus ``column: value`` lines.

    Note: this definition reuses the name of the dataset-based
    ``display_row_with_columns`` defined earlier in the notebook and replaces
    it once this cell runs; the first argument here is a file path, not a
    dataset object.

    Parameters:
        csv_file (str): The path to the CSV file. The file is parsed in full
            on every call.
        row_index (int): Zero-based position of the row to print. Negative
            values and values >= the number of rows are rejected with a message.

    Returns:
        None. Output goes to ``print``; any exception (for example an
        unreadable file) is caught and printed instead of being raised.
    """
    try:
        # Parse the entire CSV into a DataFrame.
        df = pd.read_csv(csv_file)

        if row_index < 0 or row_index >= len(df):
            # Outside [0, len(df) - 1]: explain the valid range and do nothing else.
            print(f"Error: Row index {row_index} is out of range. Valid range is 0 to {len(df) - 1}.")
        else:
            # Positional lookup; the result is a Series indexed by column name.
            selected = df.iloc[row_index]

            # Header first, then one line per column in file order.
            print(f"Row {row_index}:")
            for column, value in selected.items():
                print(f"{column}: {value}")

    except Exception as e:
        # Best-effort display helper: report the failure instead of raising.
        print(f"An error occurred: {e}")

# Path to the CSV file. It is relative, so it is resolved against the
# kernel's current working directory.
csv_file = "sampled_dataset.csv"  # Change this to the path of your CSV file

# Print the row at this zero-based position. This rebinds the module-level
# name `row_index`.
row_index = 11030
display_row_with_columns(csv_file, row_index)


Row 11030:
id: 11622a65-65ba-48d0-ae1b-ce9f8328c7bc
source: m4gt
sub_source: News/Wikipedia
lang: de
model: gpt-3.5-turbo
label: 1
text: Die New Yorker Aktienbörsen haben ihre Verlustserie der vergangenen Tage fortgesetzt und die Sitzung am Freitag erneut tief im Minus beendet. Weiterhin lasteten vor allem die Sorgen um eine drohende Abkühlung der Weltwirtschaft auf den Notierungen. Diese Befürchtungen wurden zum Wochenschluss von schwachen Stimmungsdaten aus der chinesischen Industrie zusätzlich angeheizt. Der Dow Jones brach um satte 530,94 Einheiten oder 3,12 Prozent auf 16.459,75 Zähler ein. Der Leitindex schloss damit den vierten Tag in Folge im roten Bereich. Auf Wochensicht beläuft sich das Minus auf mehr als fünf Prozent, damit verbuchte der Dow seinen größten Wochenverlust seit Herbst 2011. Der 500 ausgewählte US-Unternehmen fassende S&P-500 Index rutschte um 64,84 Punkte (minus 3,19 Prozent) auf 1.970,89 Zähler ab. Das Börsenbarometer ist erstmals seit Februar wieder unter di

In [50]:
import pandas as pd

def save_to_separate_files(csv_file, label_column="label",
                           label_1_file="label_1_data.csv",
                           label_0_file="label_0_data.csv"):
    """
    Split a CSV file into two CSV files according to a binary label column.

    Rows whose label equals 1 are written to ``label_1_file`` and rows whose
    label equals 0 are written to ``label_0_file``. Rows holding any other
    label value (including missing values) are written to neither file; their
    count is printed as a warning so the loss is not silent.

    Parameters:
        csv_file (str): The path to the CSV file.
        label_column (str): The column name that contains the label (default is "label").
        label_1_file (str): Output path for rows with label 1
            (default is "label_1_data.csv").
        label_0_file (str): Output path for rows with label 0
            (default is "label_0_data.csv").

    Returns:
        None. Problems (unreadable input, missing label column, write
        failure) are reported with a printed message rather than raised.
    """
    try:
        # Load the CSV file using pandas
        df = pd.read_csv(csv_file)

        # Fail with a readable message instead of the bare KeyError text
        # (which would only show the column name in quotes).
        if label_column not in df.columns:
            print(f"Error: Column '{label_column}' not found in '{csv_file}'. "
                  f"Available columns: {list(df.columns)}.")
            return

        # Filter rows where the label is 1 and 0
        df_label_1 = df[df[label_column] == 1]
        df_label_0 = df[df[label_column] == 0]

        # Save these filtered DataFrames to separate CSV files; index=False
        # keeps the DataFrame index out of the output.
        df_label_1.to_csv(label_1_file, index=False)
        df_label_0.to_csv(label_0_file, index=False)

        print(f"Data has been saved to '{label_1_file}' and '{label_0_file}'.")

        # Anything that matched neither filter was dropped; make that visible.
        skipped = len(df) - len(df_label_1) - len(df_label_0)
        if skipped:
            print(f"Warning: {skipped} row(s) had a label other than 0 or 1 and were not saved.")

    except Exception as e:
        # Keep the original best-effort contract: report, never raise.
        print(f"An error occurred: {e}")

# Path to the CSV file. It is relative, so it is resolved against the
# kernel's current working directory.
csv_file = "sampled_dataset.csv"  # Change this to the path of your CSV file

# Split the file by label. With the default arguments the two outputs are
# written next to the notebook as 'label_1_data.csv' and 'label_0_data.csv'.
save_to_separate_files(csv_file)


Data has been saved to 'label_1_data.csv' and 'label_0_data.csv'.


In [51]:
import pandas as pd

In [52]:
# Read back the label == 1 file written by save_to_separate_files and preview
# its first rows (the bare last expression is what the cell displays).
df1 = pd.read_csv("label_1_data.csv")
df1.head()

Unnamed: 0,id,source,sub_source,lang,model,label,text
0,374c60d2-dd43-4b61-9ea1-d1a8fa309624,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,أشعل اغتيال قائد فيلق القدس في الحرس الثوري ال...
1,a4e44d93-be72-41ca-84b9-b3e83ecf46ed,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,في محاولة لكسب دعم الناخبين الإنجيليين في الان...
2,ff3bfdb9-5f8c-4315-93c6-28e200fa5892,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,جبال الأبالاش (بالإنجليزية: Appalachian Mounta...
3,ac907968-8c7b-4493-bb0e-feed48581588,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,أكدت مصادر أمنية مصرية أن الجهات الأمنية التاب...
4,2bcbb1c1-c5d9-42e8-9fe5-281d8d0edf0a,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,أكدت الكويت رفضها التام لاستخدام أراضيها في أي...


In [57]:
# (number of rows, number of columns) of the label == 1 frame.
df1.shape

(29495, 7)

In [53]:
# Read back the label == 0 file written by save_to_separate_files and preview
# its first rows (the bare last expression is what the cell displays).
df2 = pd.read_csv("label_0_data.csv")
df2.head()

Unnamed: 0,id,source,sub_source,lang,model,label,text
0,04461542-633c-415a-854c-f5dcfd8240bd,m4gt,News/Wikipedia,ar,human,0,الفايكنغ أو الوِيكنجار (بالنوردية القديمة: vík...
1,d84ab860-d667-49a7-ab01-6b5155e62106,m4gt,News/Wikipedia,ar,human,0,الروبوتية أو الإنساليات أو علم الروبوتات (بالإ...
2,f2d8893b-e625-45ff-b918-5a2bae5bed6c,m4gt,News/Wikipedia,ar,human,0,غازبروم (Открытое Акционерное Общество «Газпро...
3,990ffe5d-1c1a-49f9-928d-c75668b6cd53,m4gt,News/Wikipedia,ar,human,0,أطلقت الهند الحاسوب الدفتري المحمول «ساكاشات» ...
4,4ad3adde-b7e0-4fce-98ef-db8ccac780c9,m4gt,News/Wikipedia,ar,human,0,إيديل (باللاتينية: Aedilis، وهي مشتقَّة من aed...


In [56]:
# (number of rows, number of columns) of the label == 0 frame.
df2.shape

(18628, 7)

In [55]:
import pandas as pd

def display_row_with_columns(csv_file, row_index):
    """
    Print a single row of a CSV file as a header line plus ``column: value`` lines.

    Note: an identical CSV-based ``display_row_with_columns`` is already
    defined in an earlier cell of this notebook; this cell re-defines it with
    the same behavior.

    Parameters:
        csv_file (str): The path to the CSV file. The file is parsed in full
            on every call.
        row_index (int): Zero-based position of the row to print. Negative
            values and values >= the number of rows are rejected with a message.

    Returns:
        None. Output goes to ``print``; any exception (for example an
        unreadable file) is caught and printed instead of being raised.
    """
    try:
        # Parse the entire CSV into a DataFrame.
        df = pd.read_csv(csv_file)

        if row_index < 0 or row_index >= len(df):
            # Outside [0, len(df) - 1]: explain the valid range and do nothing else.
            print(f"Error: Row index {row_index} is out of range. Valid range is 0 to {len(df) - 1}.")
        else:
            # Positional lookup; the result is a Series indexed by column name.
            selected = df.iloc[row_index]

            # Header first, then one line per column in file order.
            print(f"Row {row_index}:")
            for column, value in selected.items():
                print(f"{column}: {value}")

    except Exception as e:
        # Best-effort display helper: report the failure instead of raising.
        print(f"An error occurred: {e}")

# Path to the CSV file: here the label == 0 output of save_to_separate_files.
# It is relative, so it is resolved against the kernel's working directory.
csv_file = "label_0_data.csv"  # Change this to the path of your CSV file

# Print the row at this zero-based position. This rebinds the module-level
# name `row_index`.
row_index = 500
display_row_with_columns(csv_file, row_index)


Row 500:
id: ad7cde86-ef38-46f2-824c-10f6c3bc068b
source: m4gt
sub_source: True & Fake News
lang: bg
model: human
label: 0
text: Снимка: РБ София. Няма оставки в ДБГ, а в Реформаторския блок, защото на тези избори се яви Блокът, а не партиите поотделно. Това каза пред журналисти заместник-председателят на ДБГ Найден Зеленогорски преди заседанието Национален координационен съвет на Реформаторския блок, предаде репортер на Агенция „Фокус“. Попитан дали лидерът на ДБГ Меглена Кунева трябва да последва примера на Радан Кънев, Зеленогорски посочи, че тя първа е дала пример, защото не е участвала в изборите. „Най-ясното е, че 300 хил. десни избиратели не са представени“, каза той. По думите му ще се научи след анализите какви конкретни грешки има всяка част от разстроеното дясно пространство. Според него най-логично е да бъде приета оставката на политическия съвет на РБ. „Като цяло моето лично мнение е, че Блокът и формацията, в този вид, в който се яви на изборите би трябвало да бъде съхран