## Main function

In [1]:
import pandas as pd
import re
from collections import Counter
from typing import List, Optional

# Splits text at every run of whitespace that directly follows '.', '!' or '?'.
# NOTE(review): the Persian question mark is not part of the character class,
# so it never ends a sentence here - confirm that this is intended.
sentence_splitter = re.compile(r'(?<=[.!?])\s+')

# The word "پردازش" followed by optional whitespace, one or more digits and
# any trailing whitespace. A preceding "در" is NOT part of the match, so it
# stays in the text.
REMOVE_PROCESS = re.compile(r'\bپردازش\s*\d+\s*')

# A period, optional whitespace, "خطای شماره" and a number, optionally followed
# by "مرحله" and a second number. The period itself is part of the match.
REMOVE_ERROR_TAIL = re.compile(r'\.\s*خطای شماره\s*\d+(?:\s*مرحله\s*\d+)?')

# A run of digits (starting at a word boundary), optional whitespace, then "(".
# preprocess_text substitutes a bare "(" for the whole match.
REMOVE_LEADING_NUM_PAREN = re.compile(r'\b\d+\s*\(')

# A single "X" or "x" with a word boundary on both sides. Word boundaries also
# occur next to punctuation, not only next to whitespace or the string ends.
REPLACE_x_WITH_EKES = re.compile(r'\b[Xx]\b')


def preprocess_text(text: str) -> str:
    """Normalise one cell of text with the module-level substitution patterns.

    The substitutions run in a fixed order: REMOVE_PROCESS and
    REMOVE_ERROR_TAIL matches are deleted, a REMOVE_LEADING_NUM_PAREN match is
    reduced to "(", and a standalone X/x is spelled out as "ایکس".

    Args:
        text: Raw text of a single cell.

    Returns:
        The text after all substitutions, with leading and trailing
        whitespace removed.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (REMOVE_PROCESS, ''),
        (REMOVE_ERROR_TAIL, ''),
        (REMOVE_LEADING_NUM_PAREN, '('),
        (REPLACE_x_WITH_EKES, "ایکس"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

def _split_sentences(text: str) -> List[str]:
    """Return the stripped, non-empty pieces that ``sentence_splitter`` yields for ``text``."""
    pieces = (piece.strip() for piece in sentence_splitter.split(text))
    return [piece for piece in pieces if piece]


def delete_sentences_by_repetition(
    input_path: str,
    output_path: str,
    drop_columns: Optional[List[str]] = None,
    repetition_threshold: int = 50
) -> None:
    """Remove sentences that repeat too often across an Excel sheet.

    Every cell is read as text, normalised with ``preprocess_text`` and split
    into sentences with ``sentence_splitter``. Sentences are counted over the
    whole sheet, and every sentence whose count is greater than or equal to
    ``repetition_threshold`` is removed from every cell.

    Removal works on whole sentences, the same unit that is counted. A
    repeated sentence is therefore never cut out of the middle of a longer,
    non-repeated sentence, and a short repeated sentence can never leave the
    tail of a longer one behind.

    Args:
        input_path: Excel workbook to read.
        output_path: Excel workbook to write. It receives a 'Cleaned' sheet
            with the filtered cells and a 'Sentence Counts' sheet with every
            sentence and its count, most frequent first.
        drop_columns: Columns to discard before counting; names that do not
            exist in the sheet are ignored.
        repetition_threshold: Minimum number of occurrences (inclusive) at
            which a sentence is removed.

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        line is printed.
    """
    drop_columns = drop_columns or []

    # 1) Load every cell as text and discard the unwanted columns.
    df = pd.read_excel(input_path, dtype=str).fillna('')
    if drop_columns:
        df = df.drop(columns=drop_columns, errors='ignore')

    # DataFrame.map was added in pandas 2.1; older releases only offer
    # applymap. The check is made on the class so that a column that happens
    # to be called "map" cannot be mistaken for the method.
    cellwise = "map" if hasattr(pd.DataFrame, "map") else "applymap"

    # 2) Normalise each cell exactly once; counting and filtering both reuse it.
    preprocessed_df = getattr(df, cellwise)(lambda cell: preprocess_text(str(cell)))

    # 3) Count sentences over all cells (row by row, left to right).
    sentence_counts = Counter()
    for cell_text in preprocessed_df.to_numpy().ravel():
        sentence_counts.update(_split_sentences(cell_text))

    # 4) Sentences that reached the threshold; a set gives O(1) membership tests.
    repeated = {
        sentence
        for sentence, count in sentence_counts.items()
        if count >= repetition_threshold
    }

    def filter_cell(cell_text: str) -> str:
        """Drop the repeated sentences from one preprocessed cell."""
        kept = [s for s in _split_sentences(cell_text) if s not in repeated]
        # Collapse whitespace runs that remain inside the kept sentences.
        return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

    cleaned_df = getattr(preprocessed_df, cellwise)(filter_cell)

    # 5) Write the cleaned sheet and the frequency table.
    counts_df = (
        pd.DataFrame(list(sentence_counts.items()), columns=['sentence', 'count'])
          .sort_values('count', ascending=False)
          .reset_index(drop=True)
    )
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        cleaned_df.to_excel(writer, sheet_name='Cleaned', index=False)
        counts_df.to_excel(writer, sheet_name='Sentence Counts', index=False)

    print(f"Done! Saved to '{output_path}'")


# Driver for the de-duplication step.
# NOTE(review): both paths are absolute and specific to one machine; they must
# be adapted before this cell can run anywhere else.
if __name__ == "__main__":
    delete_sentences_by_repetition(
        input_path="/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_khowledgebase_editable.xlsx",
        output_path="/home/mahdi/word_embedding_Narenjestan/dataset/V_4_deduplicated_sentences.xlsx",
        # These two columns are dropped before any sentence is counted.
        drop_columns=['سطح سوم' , 'تعداد'],
        # Overrides the function default of 50.
        repetition_threshold=20
    )


Done! Saved to '/home/mahdi/word_embedding_Narenjestan/dataset/V_4_deduplicated_sentences.xlsx'


In [2]:
import pandas as pd
from preprocessing_main import preprocess
# Boilerplate phrases handed to preprocess() for removal from every cell.
# The third entry differs from the second only in the spelling of one word -
# presumably a misspelling that occurs in the data; confirm before "fixing" it.
to_remove = [
    "مورد جهت بررسی به گروه فنی ارجاع گردد",
    "در صورت عدم رفع مشکل مورد جهت بررسی به گروه فنی ارجاع گردد",
    "در صورت عدم رفع مشکل مورد جهت بررسی به گروه فنی رجاع گردد",
    "ارجاع تماس به دیگر کارشناسان",
]

# Input: the workbook written by delete_sentences_by_repetition in the
# previous cell (an earlier run read MEC-Narenjestan_cleaned-V0.2.xlsx instead).
input_path = "/home/mahdi/word_embedding_Narenjestan/dataset/V_4_deduplicated_sentences.xlsx"
df_edited = pd.read_excel(input_path, dtype=str).fillna('')


def _clean_cell(cell):
    """Run the project-level preprocess() on one cell with this notebook's options."""
    return preprocess(
        str(cell),
        drop_short_phrases=2,
        replace_multiple_spaces=True,
        handle_prefix=True,
        remove_punctuation_exception_keep=['/'],
        remove_specific_phrases=to_remove,
    )


# Clean every cell of the sheet, then save the result as a new workbook.
df_clean = df_edited.map(_clean_cell)

df_clean.to_excel(
    "/home/mahdi/word_embedding_Narenjestan/dataset/MEC-Narenjestan_cleaned-V0.5.xlsx",
    index=False,
)




def merge_two_columns_to_txt(
    input_path: str,
    output_txt: str,
    col1: str,
    col2: str,
    separator: str = " "
):
    """Join two columns of an Excel sheet row by row and save them as plain text.

    For every row the value of ``col1``, ``separator`` and the value of
    ``col2`` are concatenated and written to ``output_txt`` followed by a
    line break. The text is written verbatim: unlike a CSV writer, nothing is
    quoted or escaped, so a row containing double quotes stays unchanged and
    an empty row becomes an empty line. A value that itself contains a line
    break will consequently span more than one line of the output file.

    Args:
        input_path: Excel workbook to read; every cell is read as text and
            missing cells become empty strings.
        output_txt: Path of the UTF-8 text file to write.
        col1: Name of the column placed first.
        col2: Name of the column placed second.
        separator: Text inserted between the two values.

    Raises:
        KeyError: If ``col1`` or ``col2`` is not a column of the sheet.
    """
    df = pd.read_excel(input_path, dtype=str).fillna('')

    # Fail early with a message that names the missing column(s) and the file.
    missing = [col for col in (col1, col2) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) {missing} not found in '{input_path}'")

    # Merge the two columns with the chosen separator.
    merged = df[col1].astype(str) + separator + df[col2].astype(str)

    # Write one merged row per write call, without any CSV quoting.
    with open(output_txt, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{row}\n" for row in merged)

    print(f"Merged {col1} + {col2} into '{output_txt}' ({len(merged)} lines).")


# Driver for the merge step.
# NOTE(review): both paths are absolute and specific to one machine; they must
# be adapted before this cell can run anywhere else.
if __name__ == "__main__":
    merge_two_columns_to_txt(
        # Same path that the cell-cleaning code earlier in this cell writes with to_excel.
        input_path="/home/mahdi/word_embedding_Narenjestan/dataset/MEC-Narenjestan_cleaned-V0.5.xlsx",   
        output_txt="/home/mahdi/word_embedding_Narenjestan/dataset/MEC-Narenjestan_cleaned-merged_output-V0.2.txt",  
        # Column placed first in each merged line.
        col1="سوال",                  
        # Column placed second in each merged line.
        col2="جواب",                  
        # A single space between the two values (also the function default).
        separator=" "                   
    )




Merged سوال + جواب into '/home/mahdi/word_embedding_Narenjestan/dataset/MEC-Narenjestan_cleaned-merged_output-V0.2.txt' (18874 lines).
