In [None]:
!pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasketch import MinHash
import re
from Levenshtein import distance as levenshtein_distance
import hashlib

# Deduplicate sentences using MinHash Jaccard similarity

In [None]:
# Function to read the file and break it into sentences
def read_punjabi_text_file(file_path):
    """Read a UTF-8 text file and split it into trimmed, non-empty sentences.

    A sentence ends at a danda (U+0964), a double danda (U+0965), ``?`` or
    ``!``. A run of consecutive terminators is treated as one boundary.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        list[str]: Sentences in file order, stripped of surrounding
        whitespace, with empty fragments removed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # The earlier pattern `[।?!]|।।` could never reach its second alternative
    # (the character class always matched first) and did not recognise the
    # double danda "॥", so verse lines ending in "॥" were never separated.
    fragments = re.split(r'[\u0964\u0965?!]+', text)
    return [fragment.strip() for fragment in fragments if fragment.strip()]

# Create DataFrame from sentences
def create_punjabi_df(sentences):
    """Wrap sentences in a DataFrame alongside a placeholder ``hash`` column.

    Args:
        sentences: List of sentence strings; each becomes one row.

    Returns:
        pandas.DataFrame: Column ``sentence`` holds the input values and
        column ``hash`` holds the string "NA" in every row.
    """
    return pd.DataFrame(sentences, columns=["sentence"]).assign(hash="NA")

# MinHash calculation and deduplication
def calculate_minhash_and_deduplicate(df, threshold=0.2):
    """Drop near-duplicate sentences using MinHash-estimated Jaccard similarity.

    Each sentence (the first column of ``df``) is split on whitespace and its
    tokens are fed into a ``MinHash`` signature. Rows are then scanned in
    order: a row is kept only if its estimated Jaccard similarity to every
    previously kept row is below ``threshold``, so the first member of each
    near-duplicate group survives.

    The caller's DataFrame is not modified, and its index may be arbitrary.

    Args:
        df: DataFrame whose first column holds the sentence strings.
        threshold: Estimated similarity at or above which a later row is
            treated as a duplicate of an earlier kept row.

    Returns:
        pandas.DataFrame: The kept rows with a fresh 0..n-1 index and a
        ``hash`` column containing each row's ``MinHash`` signature.
    """
    print(f"Length of DataFrame before deduplication: {len(df)}")

    # Work on a private copy with a positional index so that neither the
    # caller's frame nor a non-default index can affect the result.
    working = df.copy().reset_index(drop=True)
    if "hash" not in working.columns:
        working["hash"] = None

    # Build every signature once and keep them in a plain list; comparisons
    # below then avoid repeated DataFrame lookups.
    signatures = []
    for row, sentence in enumerate(working.iloc[:, 0]):
        signature = MinHash()
        for token in sentence.split():
            signature.update(token.encode('utf-8'))
        working.at[row, "hash"] = signature
        signatures.append(signature)

    # Greedy keep-first pass: a row survives only when no earlier survivor
    # is similar enough to it.
    kept_rows = []
    for row, signature in enumerate(signatures):
        is_duplicate = any(
            signatures[kept].jaccard(signature) >= threshold for kept in kept_rows
        )
        if not is_duplicate:
            kept_rows.append(row)

    deduplicated = working.iloc[kept_rows].reset_index(drop=True)

    print(f"Length of DataFrame after deduplication: {len(deduplicated)}")
    return deduplicated


# Source corpus; later cells reuse `file_path` and `punjabi_sentences`.
file_path = "Timepass.txt"
punjabi_sentences = read_punjabi_text_file(file_path)

df = create_punjabi_df(punjabi_sentences)
df = calculate_minhash_and_deduplicate(df)

print(f"Number of unique sentences: {len(df)}")

output_file_path = file_path.replace(".txt", "_de_duplicated.txt")
# Save only the sentence text: the "hash" column holds MinHash objects, whose
# repr would otherwise be written into the output file next to each sentence.
df[["sentence"]].to_csv(output_file_path, index=False, header=False)

print(f"Deduplicated file saved to {output_file_path}")


4


# Deduplicate sentences using cosine similarity with TF-IDF

In [None]:

def cosine_similarity_deduplication(sentences, threshold=0.8, token_pattern=r"(?u)\S+"):
    """Deduplicate sentences using cosine similarity with TF-IDF.

    Sentences are vectorised with TF-IDF and scanned in order. Every later
    sentence whose cosine similarity to a kept sentence is at least
    ``threshold`` is dropped, so the first member of each group survives.

    Args:
        sentences: List of sentence strings.
        threshold: Cosine similarity at or above which a later sentence is
            treated as a duplicate.
        token_pattern: Regular expression passed to ``TfidfVectorizer`` to
            define a token. The default splits on whitespace; scikit-learn's
            own default relies on ``\\w``, which does not match Gurmukhi
            combining vowel signs and therefore breaks words apart.

    Returns:
        pandas.DataFrame: A single ``sentence`` column with the kept
        sentences and a fresh 0..n-1 index.
    """
    df = pd.DataFrame(sentences, columns=["sentence"])
    if df.empty:
        # Nothing to compare, and the vectorizer cannot build a vocabulary
        # from zero documents.
        return df

    vectorizer = TfidfVectorizer(token_pattern=token_pattern)
    tfidf_matrix = vectorizer.fit_transform(df["sentence"])

    to_remove = set()

    for i in range(tfidf_matrix.shape[0]):
        if i in to_remove:
            # Already identified as a duplicate of an earlier sentence.
            continue
        similarities = cosine_similarity(tfidf_matrix[i], tfidf_matrix).flatten()
        for j in range(i + 1, len(similarities)):
            if similarities[j] >= threshold:
                to_remove.add(j)

    # Pass an ordered list of labels rather than a set to `drop`.
    return df.drop(index=sorted(to_remove)).reset_index(drop=True)

# Run the TF-IDF cosine deduplication on `punjabi_sentences` and save the result.
# To try it on a small inline sample instead, uncomment the next line:
# punjabi_sentences = ["ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਦੂਜਾ ਵਾਕ ਹੈ।"]
# NOTE(review): `df` is not created in this cell; this prints whatever the
# notebook last bound to that name.
print(df)
df_cosine = cosine_similarity_deduplication(punjabi_sentences)
print("Cosine Similarity Deduplication:")
print(f"Number of unique sentences: {len(df_cosine)}")

# NOTE(review): the other deduplication cells build the same output path from
# `file_path`, so this write replaces any file they produced.
output_file_path = file_path.replace(".txt", "_de_duplicated.txt")
df_cosine.to_csv(output_file_path, index=False, header=False)

print(f"Deduplicated file saved to {output_file_path}")


                                            sentence hash
0          ਸੀ ਦਸਮ ਗ੍ਰੰਥ ਸਾਹਿਬ ਜੀ ਦੇ ਪੰਨਾ ਨੰਬਰ ੧੫੫ ਤਕ   NA
1          ਸੀ ਦਸਮ ਗ੍ਰੰਥ ਸਾਹਿਬ ਜੀ ਦੇ ਪੰਨਾ ਨੰਬਰ ੧੫੫ ਤਕ   NA
2          ਸੀ ਦਸਮ ਗ੍ਰੰਥ ਸਾਹਿਬ ਜੀ ਦੇ ਪੰਨਾ ਨੰਬਰ ੧੫੫ ਤਕ   NA
3  ਕੂ ਚਿਹਨ ਅਰੁ ਬਰਨ ਜਾਤਿ ਅਰੁ ਪਾਤਿ ਨਹਿਨ ਜਿਹ ॥ \nਰੰਗ...   NA
Cosine Similarity Deduplication:
                                            sentence
0          ਸੀ ਦਸਮ ਗ੍ਰੰਥ ਸਾਹਿਬ ਜੀ ਦੇ ਪੰਨਾ ਨੰਬਰ ੧੫੫ ਤਕ
1  ਕੂ ਚਿਹਨ ਅਰੁ ਬਰਨ ਜਾਤਿ ਅਰੁ ਪਾਤਿ ਨਹਿਨ ਜਿਹ ॥ \nਰੰਗ...


# Deduplicate sentences using Levenshtein distance

In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein)
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-Levenshtein)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: r

In [None]:


def levenshtein_deduplication(sentences, max_distance=2):
    """Deduplicate sentences using Levenshtein distance.

    Sentences are scanned in order. Every later sentence whose edit distance
    to a kept sentence is at most ``max_distance`` is dropped, so the first
    member of each group survives.

    Args:
        sentences: List of sentence strings.
        max_distance: Largest edit distance at which two sentences are still
            treated as duplicates.

    Returns:
        pandas.DataFrame: A single ``sentence`` column with the kept
        sentences and a fresh 0..n-1 index.
    """
    df = pd.DataFrame(sentences, columns=["sentence"])
    # Pull the column into a plain list once; indexing the DataFrame inside
    # the pairwise loop is needlessly slow.
    texts = df["sentence"].tolist()
    to_remove = set()

    for i, reference in enumerate(texts):
        if i in to_remove:
            continue
        for j in range(i + 1, len(texts)):
            if j in to_remove:
                # Already scheduled for removal; no need to compare again.
                continue
            candidate = texts[j]
            # The edit distance can never be smaller than the difference in
            # length, so such pairs are rejected without computing it.
            if abs(len(reference) - len(candidate)) > max_distance:
                continue
            if levenshtein_distance(reference, candidate) <= max_distance:
                to_remove.add(j)

    # Pass an ordered list of labels rather than a set to `drop`.
    return df.drop(index=sorted(to_remove)).reset_index(drop=True)

# Run the Levenshtein deduplication on `punjabi_sentences` and save the result.
# To try it on a small inline sample instead, uncomment the next line:
# punjabi_sentences = ["ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਇਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਦੂਜਾ ਵਾਕ ਹੈ।"]
# This assignment was previously commented out, which left `df_levenshtein`
# undefined on a fresh kernel and made the lines below raise NameError.
df_levenshtein = levenshtein_deduplication(punjabi_sentences)
print("Levenshtein Distance Deduplication:")

print(f"Number of unique sentences: {len(df_levenshtein)}")

output_file_path = file_path.replace(".txt", "_de_duplicated.txt")
df_levenshtein.to_csv(output_file_path, index=False, header=False)

print(f"Deduplicated file saved to {output_file_path}")


Levenshtein Distance Deduplication:
             sentence
0  ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।
1     ਇਹ ਦੂਜਾ ਵਾਕ ਹੈ।


# Deduplicate sentences using fingerprinting (hashing)

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.1


In [None]:
def fingerprint_deduplication(sentences):
    """Remove exact duplicate sentences by comparing MD5 fingerprints.

    Args:
        sentences: List of sentence strings.

    Returns:
        pandas.DataFrame: A single ``sentence`` column holding the first
        occurrence of each distinct fingerprint, with a fresh 0..n-1 index.
    """
    frame = pd.DataFrame(sentences, columns=["sentence"])
    # Fingerprint = hex MD5 digest of the UTF-8 encoded sentence.
    digests = frame["sentence"].apply(
        lambda text: hashlib.md5(text.encode('utf-8')).hexdigest()
    )
    # Keep a row only the first time its fingerprint is seen.
    first_occurrence = ~digests.duplicated()
    return frame.loc[first_occurrence].reset_index(drop=True)

# Run the fingerprint (MD5) deduplication on `punjabi_sentences` and save the result.
# To try it on a small inline sample instead, uncomment the next line:
# punjabi_sentences = ["ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਇੱਕ ਸਜਗ ਵਾਕ ਹੈ।", "ਇਹ ਦੂਜਾ ਵਾਕ ਹੈ।"]
df_fingerprint = fingerprint_deduplication(punjabi_sentences)
print("Fingerprinting Deduplication:")
print(f"Number of unique sentences: {len(df_fingerprint)}")

# NOTE(review): the other deduplication cells build the same output path from
# `file_path`, so this write replaces any file they produced.
output_file_path = file_path.replace(".txt", "_de_duplicated.txt")
df_fingerprint.to_csv(output_file_path, index=False, header=False)

print(f"Deduplicated file saved to {output_file_path}")


Fingerprinting Deduplication:
                                            sentence
0          ਸੀ ਦਸਮ ਗ੍ਰੰਥ ਸਾਹਿਬ ਜੀ ਦੇ ਪੰਨਾ ਨੰਬਰ ੧੫੫ ਤਕ
1  ਕੂ ਚਿਹਨ ਅਰੁ ਬਰਨ ਜਾਤਿ ਅਰੁ ਪਾਤਿ ਨਹਿਨ ਜਿਹ ॥ \nਰੰਗ...
