In [2]:
import pandas as pd
import os

def mergeCsv(output_file, *input_files):
    """
    Merges multiple CSV files into a single CSV file.

    Parameters:
        output_file (str): The name of the output CSV file.
        *input_files (str): Paths to the input CSV files to be merged.
    """
    # List to store DataFrames
    dataframes = []

    # Read each CSV file and append to the list
    for file in input_files:
        if os.path.exists(file):
            df = pd.read_csv(file)
            dataframes.append(df)
        else:
            print(f"File not found: {file}")

    # Concatenate all DataFrames
    if dataframes:
        merged_df = pd.concat(dataframes, ignore_index=True)
        # Save the merged DataFrame to the output file
        merged_df.to_csv(output_file, index=False)
        print(f"Merged CSV saved as: {output_file}")
    else:
        print("No valid files to merge.")


def countRow(input_file):
    """
    Counts the number of rows in a CSV file.

    Parameters:
        input_file (str): Path to the input CSV file.

    Returns:
        int: Number of rows in the CSV file.
    """
    if os.path.exists(input_file):
        df = pd.read_csv(input_file)
        row_count = len(df)
        print(f"Number of rows in {input_file}: {row_count}")
    else:
        print(f"File not found: {input_file}")

def checkDuplicate(file_path):
    """
    Checks for duplicate rows in a CSV file based on the 'Domain' column.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

        # Check for duplicates based on the 'Domain' column
        duplicates = df[df.duplicated(subset='Domain', keep=False)]

        # Print the duplicates if any
        if not duplicates.empty:
            print(f"Found {len(duplicates)} duplicate rows based on the 'Domain' column:")
        else:
            print("No duplicates found based on the 'Domain' column.")
    else:
        print(f"File not found: {file_path}")

def removeDuplicate(file_path, output_file):
    """
    Removes duplicate rows in a CSV file based on the 'Domain' column, keeping the first occurrence.

    Parameters:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the deduplicated CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)

        # Remove duplicates based on the 'Domain' column, keeping the first occurrence
        deduplicated_df = df.drop_duplicates(subset='Domain', keep='first')

        # Save the deduplicated DataFrame to a new file
        deduplicated_df.to_csv(output_file, index=False)
        print(f"Duplicates removed. Deduplicated file saved as: {output_file}")
    else:
        print(f"File not found: {file_path}")




In [3]:
mergeCsv(
    "NOSFT_netpro_raw_7k_train_label.csv",
    "NOSFT_netpro_raw_7k_train_label0-1043.csv",
    "NOSFT_netpro_raw_7k_train_label1044-7244.csv",
)

Merged CSV saved as: NOSFT_netpro_raw_7k_train_label.csv


In [2]:
mergeCsv(
    "./labelling_result/2604/processed_2604/merged_fixed_2604.csv",
    "./labelling_result/2604/label_2604/Fixed_-1_0-1095.csv",
    "./labelling_result/2604/label_2604/Fixed_-2_0-339.csv",
    "./labelling_result/2604/label_2604/Fixed_-2_340-347.csv",
    "./labelling_result/2604/label_2604/Fixed_-3_0.csv",
    "./labelling_result/2604/label_2604/Fixed_-4_0-431.csv",
    "./labelling_result/2604/label_2604/Fixed_-5_0-78.csv",
    "./labelling_result/2604/label_2604/Fixed_-6_0-135.csv"
)
countRow("./labelling_result/2604/processed_2604/merged_fixed_2604.csv")

Merged CSV saved as: ./labelling_result/2604/processed_2604/merged_fixed_2604.csv
Number of rows in ./labelling_result/2604/processed_2604/merged_fixed_2604.csv: 2634


In [4]:
mergeCsv(
    "./labelling_result/2604/processed_2604/merged_labeled_2604.csv",
    "./labelling_result/2604/label_2604/Labeled_-1_0-1095.csv",
    "./labelling_result/2604/label_2604/Labeled_-2_0-339.csv",
    "./labelling_result/2604/label_2604/Labeled_-2_340-347.csv",
    "./labelling_result/2604/label_2604/Labeled_-3_0.csv",
    "./labelling_result/2604/label_2604/Labeled_-4_0-431.csv",
    "./labelling_result/2604/label_2604/Labeled_-5_0-78.csv",
    "./labelling_result/2604/label_2604/Labeled_-6_0-135.csv"
)
countRow("./labelling_result/2604/processed_2604/merged_labeled_2604.csv")

Merged CSV saved as: ./labelling_result/2604/processed_2604/merged_labeled_2604.csv
Number of rows in ./labelling_result/2604/processed_2604/merged_labeled_2604.csv: 2634


In [5]:
# Process `cs_54k_filtered_fixed` files
checkDuplicate("./labelling_result/2604/processed_2604/merged_labeled_2604.csv")
removeDuplicate("./labelling_result/2604/processed_2604/merged_labeled_2604.csv", "./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv")
countRow("./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv")
checkDuplicate("./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv")

No duplicates found based on the 'Domain' column.
Duplicates removed. Deduplicated file saved as: ./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv
Number of rows in ./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv: 2634
No duplicates found based on the 'Domain' column.


In [7]:
# Process `cs_54k_filtered_fixed` files
checkDuplicate("./labelling_result/2604/processed_2604/merged_fixed_2604.csv")
removeDuplicate("./labelling_result/2604/processed_2604/merged_fixed_2604.csv", "./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv")
countRow("./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv")
checkDuplicate("./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv")

No duplicates found based on the 'Domain' column.
Duplicates removed. Deduplicated file saved as: ./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv
Number of rows in ./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv: 2634
No duplicates found based on the 'Domain' column.


In [6]:
# Process `cs_54k_filtered_fixed` files
checkDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930.csv")
removeDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930.csv", "./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv")
countRow("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv")
checkDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv")

# Process `cs_54k_filtered_label` files
checkDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930.csv")
removeDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930.csv", "./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv")
countRow("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv")
checkDuplicate("./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv")

# Process `cc_260k_fixed` files
checkDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619.csv")
removeDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619.csv", "./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv")
countRow("./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv")
checkDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv")

# Process `cc_260k_labelled` files
checkDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619.csv")
removeDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619.csv", "./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619_dedup.csv")
countRow("./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619_dedup.csv")
checkDuplicate("./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619_dedup.csv")

Found 6 duplicate rows based on the 'Domain' column:
Duplicates removed. Deduplicated file saved as: ./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv
Number of rows in ./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv: 1522
No duplicates found based on the 'Domain' column.
Found 6 duplicate rows based on the 'Domain' column:
Duplicates removed. Deduplicated file saved as: ./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv
Number of rows in ./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv: 1522
No duplicates found based on the 'Domain' column.
No duplicates found based on the 'Domain' column.
Duplicates removed. Deduplicated file saved as: ./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv
Number of rows in ./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv: 17620
No duplicates found based on the 'Domain' column.
No dupl

In [13]:
mergeCsv(
    "./labelling_result/2204/processed_2204/merged_label_2204.csv",
    "./labelling_result/2204/cc_260k_2204/cc_260k_labelled_100000-117619_dedup.csv",
    "./labelling_result/2204/cs_54k_2204/cs_54k_filtered_label_39406-40930_dedup.csv"
)

mergeCsv(
    "./labelling_result/2204/processed_2204/merged_fixed_2204.csv",
    "./labelling_result/2204/cc_260k_2204/cc_260k_fixed_100000-117619_dedup.csv",
    "./labelling_result/2204/cs_54k_2204/cs_54k_filtered_fixed_39406-40930_dedup.csv"
)

Merged CSV saved as: ./labelling_result/2204/processed_2204/merged_label_2204.csv
Merged CSV saved as: ./labelling_result/2204/processed_2204/merged_fixed_2204.csv


In [14]:
countRow("./labelling_result/2204/processed_2204/merged_label_2204.csv")
checkDuplicate("./labelling_result/2204/processed_2204/merged_label_2204.csv")
countRow("./labelling_result/2204/processed_2204/merged_fixed_2204.csv")
checkDuplicate("./labelling_result/2204/processed_2204/merged_fixed_2204.csv")

Number of rows in ./labelling_result/2204/processed_2204/merged_label_2204.csv: 19142
No duplicates found based on the 'Domain' column.
Number of rows in ./labelling_result/2204/processed_2204/merged_fixed_2204.csv: 19142
No duplicates found based on the 'Domain' column.


In [2]:

def load_and_print_all_columns(file_path):
    """
    Load a CSV file and print all columns.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        print(df.columns.tolist())
    else:
        print(f"File not found: {file_path}")
load_and_print_all_columns("./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv")
load_and_print_all_columns("./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv")

['Domain', 'Content', 'Label', 'Confidence']
['Domain', 'Answer', 'Classification', 'Reason', 'Confidence', 'Thought']


In [17]:

# Load the CSV file
file_path = "./labelling_result/2204/processed_2204/merged_label_2204.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)

    # Rename the 'Answer' column to 'Label'
    if 'Answer' in df.columns:
        df.rename(columns={'Answer': 'Label'}, inplace=True)

        # Save the updated DataFrame back to the file
        df.to_csv(file_path, index=False)
        print(f"Column 'Answer' has been renamed to 'Label' and saved to {file_path}")
    else:
        print("The 'Answer' column is not present in the file.")
else:
    print(f"File not found: {file_path}")

Column 'Answer' has been renamed to 'Label' and saved to ./labelling_result/2204/processed_2204/merged_label_2204.csv


In [11]:
import pandas as pd
import os

label_file = "./labelling_result/2604/processed_2604/merged_fixed_2604_dedup.csv"
fixed_file = "./labelling_result/2604/processed_2604/merged_labeled_2604_dedup.csv"
output_file = "./labelling_result/2604/processed_2604/merged_combined_2604.csv"

if os.path.exists(fixed_file) and os.path.exists(label_file):
    # Load both CSV files into DataFrames
    df_fixed = pd.read_csv(fixed_file)
    df_label = pd.read_csv(label_file)

    # Strip paragraph spacing (\n) from the 'Thought' column in df_label
    if 'Thought' in df_label.columns:
        df_label['Thought'] = df_label['Thought'].str.replace('\n', ' ', regex=False).str.strip()

    # Merge the DataFrames on 'Domain', 'Label', and 'Confidence'
    merged_df = pd.merge(df_fixed, df_label, on=['Domain'])
    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged CSV saved as: {output_file}")
else:
    print("One or both input files are missing.")

Merged CSV saved as: ./labelling_result/2604/processed_2604/merged_combined_2604.csv


In [12]:
countRow("./labelling_result/2604/processed_2604/merged_combined_2604.csv")
checkDuplicate("./labelling_result/2604/processed_2604/merged_combined_2604.csv")
load_and_print_all_columns("./labelling_result/2604/processed_2604/merged_combined_2604.csv")

Number of rows in ./labelling_result/2604/processed_2604/merged_combined_2604.csv: 2634
No duplicates found based on the 'Domain' column.
['Domain', 'Answer', 'Classification', 'Reason', 'Confidence_x', 'Thought', 'Content', 'Label', 'Confidence_y']


In [14]:
file_path = "./labelling_result/2604/processed_2604/merged_combined_2604.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    # Filter rows where Confidence_x does not equal Confidence_y
    mismatched_confidence = df[df['Confidence_x'] != df['Confidence_y']]
    
    # Print the filtered rows
    print("Rows where Confidence_x does not equal Confidence_y:")
    print(mismatched_confidence)
else:
    print(f"File not found: {file_path}")


if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    # Drop the Confidence_y column
    if 'Confidence_y' in df.columns:
        df.drop(columns=['Confidence_y'], inplace=True)
    
    # Rename Confidence_x to Confidence
    if 'Confidence_x' in df.columns:
        df.rename(columns={'Confidence_x': 'Confidence'}, inplace=True)
    
    # Save the updated DataFrame back to the file
    df.to_csv(file_path, index=False)
    print(f"Updated file saved successfully: {file_path}")
else:
    print(f"File not found: {file_path}")

Rows where Confidence_x does not equal Confidence_y:
Empty DataFrame
Columns: [Domain, Answer, Classification, Reason, Confidence_x, Thought, Content, Label, Confidence_y]
Index: []
Updated file saved successfully: ./labelling_result/2604/processed_2604/merged_combined_2604.csv


In [17]:
load_and_print_all_columns("./labelling_result/2604/processed_2604/merged_combined_2604.csv")

['Domain', 'Answer', 'Classification', 'Reason', 'Confidence', 'Thought', 'Content', 'Label']


In [18]:
file_path = "./labelling_result/2604/processed_2604/merged_combined_2604.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    # Drop the 'Answer' column
    if 'Answer' in df.columns:
        df.drop(columns=['Answer'], inplace=True)
    
    # Rearrange the columns
    column_order = ['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
    df = df[column_order]
    
    # Save the updated DataFrame back to the file
    df.to_csv(file_path, index=False)
    print(f"Updated file saved successfully: {file_path}")
else:
    print(f"File not found: {file_path}")

Updated file saved successfully: ./labelling_result/2604/processed_2604/merged_combined_2604.csv


In [22]:
df = pd.read_csv("./labelling_result/2204/processed_2204/merged_combined_2204.csv")

# Drop Label_x and Confidence_x
df.drop(columns=['Label_x', 'Confidence_x'], inplace=True)

# Rename Label_y and Confidence_y to Label and Confidence
df.rename(columns={'Label_y': 'Label', 'Confidence_y': 'Confidence'}, inplace=True)

# Save the updated DataFrame back to the file
df.to_csv("./labelling_result/2204/processed_2204/merged_combined_2204.csv", index=False)
print(f"Updated ./labelling_result/2204/processed_2204/merged_combined_2204.csv saved successfully.")

Updated ./labelling_result/2204/processed_2204/merged_combined_2204.csv saved successfully.


In [15]:

def analyzeContentLength(file_path):
    """
    Analyzes the length of the 'Content' column in a CSV file and prints statistics.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if os.path.exists(file_path):
        # Load the CSV file
        df = pd.read_csv(file_path)

        # Check if 'Content' column exists
        if 'Content' in df.columns:
            # Calculate the length of the 'Content' column
            df['Content_Length'] = df['Content'].apply(len)

            # Print the content lengths
            print("Content lengths:")
            print(df['Content_Length'])

            # Calculate statistics
            average_length = df['Content_Length'].mean()
            max_length = df['Content_Length'].max()

            # Print statistics
            print(f"\nStatistics:")
            print(f"Average Content Length: {average_length}")
            print(f"Highest Content Length: {max_length}")
        else:
            print("The 'Content' column is not present in the file.")
    else:
        print(f"File not found: {file_path}")

# Example usage
analyzeContentLength("./labelling_result/2604/processed_2604/merged_combined_2604.csv")

Content lengths:
0       10008
1       10008
2        3569
3        3927
4       10008
        ...  
2629    10008
2630     1889
2631     2164
2632     2482
2633    10008
Name: Content_Length, Length: 2634, dtype: int64

Statistics:
Average Content Length: 5625.039104024298
Highest Content Length: 10008


In [24]:
def limit_content_length(content: str, max_content_length: int = 10000) -> str:
    """
    Limit the length of the 'Content' to a maximum length while preserving context.

    Parameters:
        content (str): The original content string.
        max_content_length (int): The maximum allowed length for the content.

    Returns:
        str: The content trimmed to fit within the maximum length.
    """
    if len(content) <= max_content_length:
        return content

    # Calculate balanced chunks
    start_len = int(max_content_length * 0.2)  # 20% for the start
    end_len = int(max_content_length * 0.2)    # 20% for the end
    mid_len = max_content_length - (start_len + end_len)  # Remaining for the middle

    start = content[:start_len]
    mid_point = len(content) // 2
    mid = content[mid_point - mid_len // 2:mid_point + mid_len // 2]
    end = content[-end_len:]

    return f"{start}... {mid}... {end}"

def limit_content_in_csv(file_path, output_file, max_content_length=10000):
    """
    Load a CSV file, limit the length of the 'Content' column to a maximum length,
    and save the updated CSV to a new file.

    Parameters:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the updated CSV file.
        max_content_length (int): Maximum allowed length for the 'Content' column.

    Returns:
        None
    """
    if os.path.exists(file_path):
        # Load the CSV file
        df = pd.read_csv(file_path)

        # Check if 'Content' column exists
        if 'Content' in df.columns:
            # Apply the limit_content_length function to the 'Content' column
            df['Content'] = df['Content'].apply(lambda x: limit_content_length(x, max_content_length))

            # Save the updated DataFrame to a new file
            df.to_csv(output_file, index=False)
            print(f"Updated CSV with limited content saved as: {output_file}")
        else:
            print("The 'Content' column is not present in the file.")
    else:
        print(f"File not found: {file_path}")

# Example usage
limit_content_in_csv("./labelling_result/2204/processed_2204/merged_combined_2204.csv", "./labelling_result/2204/processed_2204/merged_combined_limited_2204.csv")

Updated CSV with limited content saved as: ./labelling_result/2204/processed_2204/merged_combined_limited_2204.csv


In [26]:
analyzeContentLength("./labelling_result/2204/processed_2204/merged_combined_limited_2204.csv")

Content lengths:
0          596
1         1177
2         2299
3         7481
4         5437
         ...  
19137    10008
19138      903
19139      341
19140     8787
19141     1736
Name: Content_Length, Length: 19142, dtype: int64

Statistics:
Average Content Length: 4316.510761675896
Highest Content Length: 10008


In [27]:
df = pd.read_csv("./labelling_result/2204/processed_2204/merged_combined_limited_2204.csv")
df.head()

Unnamed: 0,Domain,Content,Label,Classification,Reason,Confidence,Thought
0,http://www.gdql.org.cn,时代潮人 广东省归国华侨联合会 微信 | 手机版 | 无障碍 首页 省侨联概况 侨联章程 侨...,0,Benign,The domain 'gdql.org.cn' and content discuss t...,70,"Okay, I need to classify the given website int..."
1,http://www.gealh.com,"亚洲AV无码成人网站国产动漫,亚洲AV乱码一区二区三四五六,综合偷自拍亚洲乱中文字幕,亚洲女...",2,Pornography,Content includes explicit sexual terms like '成...,40,"Okay, I need to classify the website http://ww..."
2,http://www.geekiest.net,"Security Geekiest.Net Technology, Smartphones,...",0,Benign,Domain 'geekiest.net' and content focus on tec...,75,"Okay, I need to classify the website http://ww..."
3,http://www.georgegnall.com,"9713 Inaugural Way, Montgomery Village, MD 208...",0,Benign,Domain 'georgegnall.com' suggests a real estat...,55,"Okay, I need to classify the website http://ww..."
4,http://www.gisa.ru,"Геоинформационный портал Gisa.ru - ЗАО ""Навико...",0,Benign,Domain 'gisa.ru' and content discuss GIS updat...,40,"Okay, I need to classify the website http://ww..."


In [28]:
df.describe()

Unnamed: 0,Label,Confidence
count,19142.0,19142.0
mean,0.096698,81.478947
std,0.455077,21.228752
min,-1.0,0.0
25%,0.0,70.0
50%,0.0,90.0
75%,0.0,100.0
max,3.0,100.0


In [29]:
label_stats = df['Label'].value_counts().sort_index()

# Print the statistics
print("Occurrences of each value in the 'Label' column:")
print(label_stats)

Occurrences of each value in the 'Label' column:
Label
-1      177
 0    17778
 1      474
 2      585
 3      128
Name: count, dtype: int64


In [30]:
print(df[df['Confidence'] == 110])

Empty DataFrame
Columns: [Domain, Content, Label, Classification, Reason, Confidence, Thought]
Index: []


In [49]:
label3df = df[df['Label'] == 3]
label3df.to_csv("merged_fixed_limited_label3.csv", index=False)

In [52]:
filtered_data = df[(df['Content'].str.contains("togel", case=False, na=False)) & (df['Label'].isin([0]))]

# Print the filtered rows
print(filtered_data)

                                Domain  \
1011                4frontierscorp.com   
4682                        001xxx.com   
5751                 bloggerjateng.com   
7152                 prediksitogel.net   
10310       jilbaberketat.blogspot.com   
...                                ...   
42822  https://waynes-color-centre.com   
53494           http://powergamer.info   
55436       https://www.deksomboon.com   
57305         https://iki-ichifuji.com   
70850           https://search.arch.be   

                                                 Content  Label  Confidence  
1011   4 Frontiers Corp HOME Educators Company Media ...      0          80  
4682   Skip to content My Blog Sample Page Homeformat...      0          90  
5751   Skip to content BLOGGER JATENG tempat kumpul b...      0          80  
7152   Excellent 4.6 out of 5 Trustpilot The domain n...      0          65  
10310  TitleBlog Kumpulan Video Jilboobs EXCLUSIVE 20...      0          60  
...                        

In [53]:
filtered_data.to_csv("merged_fixed_limited_togel.csv", index=False)

In [6]:
countRow("./labelling_result/2604/processed_2604/merge_combined_93k.csv")
checkDuplicate("./labelling_result/2604/processed_2604/merge_combined_93k.csv")
load_and_print_all_columns("./labelling_result/2604/processed_2604/merge_combined_93k.csv")

Number of rows in ./labelling_result/2604/processed_2604/merge_combined_93k.csv: 93096
Found 2946 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [7]:
import pandas as pd
import os

# Load the CSV file
file_path = "./labelling_result/2604/processed_2604/merge_combined_93k.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    # Find duplicate rows based on the 'Domain' column
    duplicates = df[df.duplicated(subset='Domain', keep=False)]
    
    # Print the duplicate rows
    if not duplicates.empty:
        print("Duplicate rows found based on the 'Domain' column:")
        print(duplicates)
    else:
        print("No duplicate rows found based on the 'Domain' column.")
else:
    print(f"File not found: {file_path}")

Duplicate rows found based on the 'Domain' column:
                                                  Domain  \
364                                       xhamster7.desi   
483               www.redhat.com/advice/ask_pbrown1.html   
582                                         bucetas.blog   
903                                 apartamentoclick.com   
1614   www.wired.com/techbiz/it/magazine/16-10/mf_chrome   
...                                                  ...   
92705                       www.patronsoft.com/firstspot   
92706              angelsan.free.fr/blog.php?page_id=542   
92707                             pinardslandscaping.com   
92709                          kcs365online.blogspot.com   
93029                                     mainskor88.lol   

                                                 Content  Label  \
364    Gay • SG Živý Sex Prémiová Videa Pornohvězdy K...      2   
483    user account menu log in new developments at o...      0   
582    Bucetas.blog OS MELH

In [12]:
import pandas as pd
import os

# Load the CSV file
file_path = "./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    
    # Find duplicate rows based on the 'Domain' column
    duplicates = df[df.duplicated(subset='Domain', keep=False)]
    
    # Check if there are duplicates
    if not duplicates.empty:
        print("Duplicate rows found based on the 'Domain' column:")
        
        # Sort duplicates by 'Domain' to ensure proper grouping
        duplicates = duplicates.sort_values(by='Domain')
        
        # Group by 'Domain' and filter where the 'Label' values are not the same
        mismatched_labels = duplicates.groupby('Domain').filter(
            lambda group: group['Label'].nunique() > 1
        )
        
        # Print the mismatched domains
        if not mismatched_labels.empty:
            print("Domains with mismatched labels:")
            print(mismatched_labels)
        else:
            print("No domains found where the labels differ between occurrences.")
    else:
        print("No duplicate rows found based on the 'Domain' column.")
else:
    print(f"File not found: {file_path}")

Duplicate rows found based on the 'Domain' column:
Domains with mismatched labels:
                               Domain  \
94026                      1stsex.com   
58269                      1stsex.com   
83565                    303cash.link   
58272                    303cash.link   
93039              abc-du-gratuit.net   
...                               ...   
58328  www.i-modernist.com/emulaxian/   
57604       www.simdesign.nl/fft.html   
93472       www.simdesign.nl/fft.html   
21810  www.smittyware.com/palm/upirc/   
58249  www.smittyware.com/palm/upirc/   

                                                 Content  Label  \
94026  Pusat Layanan Klien Bantuan Masuk Lanjut Mulai...      2   
58269  Pusat Layanan Klien Bantuan Masuk Lanjut Mulai...      0   
83565  303CASH SITUS JUDI ONLINE UANG ASLI TERPERCAYA...      1   
58272  303CASH SITUS JUDI ONLINE UANG ASLI TERPERCAYA...      3   
93039  L'ABC du Gratuit...Pour trouver les meilleurs ...      0   
...                   

In [14]:
import pandas as pd
import os

# File paths
input_file = "./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv"
output_file = "./labelling_result/2604/processed_2604/duplicated3.csv"

# Check if the input file exists
if os.path.exists(input_file):
    # Load the CSV file
    df = pd.read_csv(input_file)
    
    # Drop duplicates based on the 'Domain' column, keeping the first occurrence
    deduplicated_df = df.drop_duplicates(subset='Domain', keep='first')
    
    # Save the deduplicated DataFrame to a new file
    deduplicated_df.to_csv(output_file, index=False)
    print(f"First occurrences saved to: {output_file}")
else:
    print(f"File not found: {input_file}")

First occurrences saved to: ./labelling_result/2604/processed_2604/duplicated3.csv


In [None]:
countRow("./labelling_result/2604/processed_2604/duplicated3.csv")
checkDuplicate("./labelling_result/2604/processed_2604/duplicated3.csv")
load_and_print_all_columns("./labelling_result/2604/processed_2604/duplicated3.csv")

Number of rows in ./labelling_result/2604/processed_2604/duplicated3.csv: 93953
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [16]:
countRow("./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv")
checkDuplicate("./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv")
load_and_print_all_columns("./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv")

Number of rows in ./labelling_result/2604/processed_2604/merged_combined_dedup_final.csv: 96926
Found 4491 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
