In [117]:
import pandas as pd         #python package for data analysis
import neattext as nt       #python package for text cleaning                                                   
import nltk                 #python package for text analysis
nltk.download('punkt')      #nltk dependency for sentence tokenization
import PyPDF2               #python package for extracting data from pdf 
import os                   #importing os to deal with system directories
import re                   #re-> to handle regular expressions
import tabula               #python package for tabular data extraction from pdf
from sklearn.feature_extraction.text import TfidfVectorizer        #sklearn function for TF-IDF
from sklearn.metrics.pairwise import cosine_similarity             #to find similarity among the chunks

[nltk_data] Downloading package punkt to C:\Users\Zee
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [118]:
partitions_similarity=[]           #creating storage for partitions similarity

In [113]:

def pre_processing(text):
    text=nt.fix_contractions(text)                   #I'm --> I am
    text=nt.remove_special_characters(text)
    text=nt.remove_stopwords(text)
    text=nt.remove_shortwords(text,3)
    text=nt.remove_numbers(text)
    text=nt.remove_urls(text)
    text=nt.remove_non_ascii(text)
    return text


def extract_tables(file_name):
    df = tabula.read_pdf(file_name,pages='all')
    return df


def extract_text(pdf_path):
    text=''
    with open(pdf_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfFileReader(f)
        # Get the number of pages in the PDF file
        num_pages = pdf_reader.getNumPages()
        for page_num in range(num_pages):
            # Get the page object
            page_obj = pdf_reader.getPage(page_num)
            # Extract the text from the page
            text += page_obj.extract_text()
    return text


    def split_into_chunks(pdf_text):
        # Split the input string into sentences
        sentences = nltk.sent_tokenize(pdf_text)
        # Split each sentence into chunks of at most 1600 characters
        chunks = []
        current_chunk = ''
        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 <= 1600:
                current_chunk += ' ' + sentence
            else:
                if len(current_chunk) >= 100:
                    chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    current_chunk += ' ' + sentence
        # Add the last chunk
        if len(current_chunk) >= 100:
            chunks.append(current_chunk.strip())
        return chunks
    
def merge_chunks(chunks, similarity_matrix, most_similar):
    merged_chunks = []
    merged_indices = set()
    
    for i in range(len(chunks)):
        if i not in merged_indices:
            current_chunk = chunks[i]
            merged_indices.add(i)
            j = most_similar[i]
            while j not in merged_indices and similarity_matrix[i][j] > 0.5:
                current_chunk += ' ' + chunks[j]
                merged_indices.add(j)
                j = most_similar[j]
            merged_chunks.append(current_chunk)
    
    return merged_chunks    
    
def get_paritions_similarity(partitions):
    # Use TfidfVectorizer to compute the TF-IDF matrix for the chunks
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(partitions)
    # Compute the cosine similarity matrix for the chunks
    similarity_matrix= cosine_similarity(tfidf_matrix)
    most_similar = similarity_matrix.argsort()[:, -2]
    merged_partitions = merge_chunks(partitions, similarity_matrix, most_similar)


    for i in range(len(partitions)):
        for j in range(len(partitions)):
            if i != j:
                similarity = similarity_matrix[i][j]
                partitions_similarity.append(f"Similarity between chunk {i+1} and chunk {j+1}: {similarity:.2f}")
    
    return partitions_similarity,merged_partitions
    
    
def merge_chunks(chunks, similarity_matrix, most_similar):
    merged_chunks = []
    merged_indices = set()

    for i in range(len(chunks)):
        if i not in merged_indices:
            current_chunk = chunks[i]
            merged_indices.add(i)
            j = most_similar[i]
            while j not in merged_indices and similarity_matrix[i][j] > 0.5:
                current_chunk += ' ' + chunks[j]
                merged_indices.add(j)
                j = most_similar[j]
            merged_chunks.append(current_chunk)

    return merged_chunks   
    
def save_partitions(partitions, file_path):
    file_dir, file_name = os.path.split(file_path)
    file_root, file_ext = os.path.splitext(file_name)
    for i, partition in enumerate(partitions):
        partition_file_name = f"{file_root}_part_{i+1}{file_ext}"
        partition_file_path = os.path.join(file_dir, partition_file_name)
        with open(partition_file_path, 'w', encoding='utf-8') as f:
            f.write(partition)   

    
        
##########################################################################################################
pdf_path ='3404555.3404559.pdf'
text = extract_text(pdf_path)
#######################################################################################################
#tables=extract_tables(pdf_path)
#for i,j in enumerate(tables):
#    j.to_csv(f'tabl{i}.csv')
#########################################################################################################
partitions = partition_text(text)
#########################################################################################################
#un-comment this code if you want to perform preprocessing on each partition
#for index,part in enumerate(partitions):
 #   partitions[index]=pre_processing(part)
#########################################################################################################
partitions_sim,merged_partitions=get_paritions_similarity(partitions)
#########################################################################################################
save_partitions(partitions,pdf_path)
####################################################################################################

In [114]:
partitions_sim

['Similarity between chunk 1 and chunk 10: 0.02',
 'Similarity between chunk 1 and chunk 10: 0.26',
 'Similarity between chunk 1 and chunk 10: 0.26',
 'Similarity between chunk 1 and chunk 11: 0.01',
 'Similarity between chunk 1 and chunk 11: 0.11',
 'Similarity between chunk 1 and chunk 11: 0.11',
 'Similarity between chunk 1 and chunk 12: 0.02',
 'Similarity between chunk 1 and chunk 12: 0.11',
 'Similarity between chunk 1 and chunk 12: 0.11',
 'Similarity between chunk 1 and chunk 13: 0.01',
 'Similarity between chunk 1 and chunk 14: 0.02',
 'Similarity between chunk 1 and chunk 15: 0.02',
 'Similarity between chunk 1 and chunk 16: 0.00',
 'Similarity between chunk 1 and chunk 17: 0.02',
 'Similarity between chunk 1 and chunk 18: 0.01',
 'Similarity between chunk 1 and chunk 19: 0.00',
 'Similarity between chunk 1 and chunk 20: 0.00',
 'Similarity between chunk 1 and chunk 21: 0.02',
 'Similarity between chunk 1 and chunk 22: 0.02',
 'Similarity between chunk 1 and chunk 23: 0.01',


In [115]:
partitions

["Is\nYour Marriage R eliable? D ivorce A nalysiswith Machine \nLearningA lgorithm\ns  \nJue Kong\n \nChang'anuniversity \nShangYuanRoad \nXi'an,China, \n710021\n \n+8 618994535561 \n2017901295@chd.edu.cn \n: \nThese aut hors cont ribut ed equal ly to this work. *: cor responding aut hor \nTianruiChai\n*\n \nBeihangUniversity \nXueYuanRoadNo.37 \nBeijing,China,100191 \n+8618914739576 trchai@buaa.edu.cn",
 "ABSTRA CT\n \nIn \nrece nt y ears, g\nlobal divorc e ra te is still  high\n. What kind of \ncoupl e will divorc e  and what factors lea d to divorc e are  \nim\nporta nt \nproble ms tha\nt \nworth study ing . In this paper , \nwe\n apply  three \nmac hine  lea rning al gorithm\ns \n(Support Vec tor Mac hine  (\nSVM\n)\n, \nRandom fore st (\nRF\n) and Natural Gradient Boosting  (N\nGB\noost )) on \na \ndivorce pre diction data set. The dataset consists of 1 7\n0 \nsample s, eac h of which contains 54 questi ons about the coupl e's \nemotiona l stat us. We\n rega rd the score s of 54 q

In [116]:
merged_partitions

["Is\nYour Marriage R eliable? D ivorce A nalysiswith Machine \nLearningA lgorithm\ns  \nJue Kong\n \nChang'anuniversity \nShangYuanRoad \nXi'an,China, \n710021\n \n+8 618994535561 \n2017901295@chd.edu.cn \n: \nThese aut hors cont ribut ed equal ly to this work. *: cor responding aut hor \nTianruiChai\n*\n \nBeihangUniversity \nXueYuanRoadNo.37 \nBeijing,China,100191 \n+8618914739576 trchai@buaa.edu.cn",
 "ABSTRA CT\n \nIn \nrece nt y ears, g\nlobal divorc e ra te is still  high\n. What kind of \ncoupl e will divorc e  and what factors lea d to divorc e are  \nim\nporta nt \nproble ms tha\nt \nworth study ing . In this paper , \nwe\n apply  three \nmac hine  lea rning al gorithm\ns \n(Support Vec tor Mac hine  (\nSVM\n)\n, \nRandom fore st (\nRF\n) and Natural Gradient Boosting  (N\nGB\noost )) on \na \ndivorce pre diction data set. The dataset consists of 1 7\n0 \nsample s, eac h of which contains 54 questi ons about the coupl e's \nemotiona l stat us. We\n rega rd the score s of 54 q

In [34]:
pdf_path ='3404555.3404559.pdf'

In [None]:
tables = camelot.read_pdf(pdf_path)

In [62]:
def split_into_chunks(input_str):
    # Split the input string into sentences
    sentences = nltk.sent_tokenize(input_str)
    
    # Split each sentence into chunks of at most 1600 characters
    chunks = []
    current_chunk = ''
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= 1600:
            current_chunk += ' ' + sentence
        else:
            if len(current_chunk) >= 100:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk += ' ' + sentence

    # Add the last chunk
    if len(current_chunk) >= 100:
        chunks.append(current_chunk.strip())
    
    return chunks


In [63]:
chunks=split_into_chunks(text)

In [64]:
len(chunks[0])

1367

In [65]:
for i in chunks:
    print(len(i))

1367
1551
1506
1571
1520
1508
1568
1544
1548
1389
1590
764


In [68]:
def get_similarity():
    for i in partitions:
        

IndentationError: expected an indented block (<ipython-input-68-d8f36e87ba3d>, line 2)

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
def compute_similarity_matrix(chunks):
    # Use TfidfVectorizer to compute the TF-IDF matrix for the chunks
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)
    # Compute the cosine similarity matrix for the chunks
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

In [74]:
compute_similarity_matrix(partitions)

array([[1.        , 0.01167023, 0.03180888, 0.01730063, 0.02363675,
        0.04050599, 0.02797332, 0.0235127 , 0.02942829, 0.02479206,
        0.00680429, 0.02452421, 0.01271764, 0.01867512, 0.02100826,
        0.        , 0.01740823, 0.01022291, 0.        , 0.        ,
        0.02384696, 0.02008367, 0.00624024],
       [0.01167023, 1.        , 0.25043874, 0.20297419, 0.32358406,
        0.10951833, 0.26281601, 0.18185891, 0.1339797 , 0.09313846,
        0.13057086, 0.14768684, 0.18364704, 0.1603797 , 0.26330232,
        0.0796284 , 0.21198651, 0.18584334, 0.12826052, 0.04132894,
        0.10508626, 0.12184493, 0.07695128],
       [0.03180888, 0.25043874, 1.        , 0.20863631, 0.22250012,
        0.12264005, 0.13575169, 0.20442386, 0.12674145, 0.14926954,
        0.12925914, 0.17413693, 0.18026368, 0.15443224, 0.24604258,
        0.11905894, 0.18375341, 0.20553221, 0.10402972, 0.05521893,
        0.06050145, 0.09639244, 0.08706499],
       [0.01730063, 0.20297419, 0.20863631, 1.   

In [76]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
def plot_similarity_matrix(chunks, similarity_matrix):
    # Create a DataFrame from the similarity matrix
    df = pd.DataFrame(similarity_matrix, columns=chunks)
    
    # Create a heatmap from the DataFrame
    sns.heatmap(df, cmap='coolwarm', annot=True, fmt='.2f', vmin=0, vmax=1)
    plt.title('Chunk Similarity')
    plt.xlabel('Chunk')
    plt.ylabel('Chunk')
    plt.show()


similarity_matrix = compute_similarity_matrix(partitions)
plot_similarity_matrix(chunks, similarity_matrix)

ValueError: Shape of passed values is (23, 23), indices imply (23, 12)

In [79]:
for i in range(len(partitions)):
    for j in range(len(partitions)):
        if i != j:
            similarity = similarity_matrix[i][j]
            print(f"Similarity between chunk {i+1} and chunk {j+1}: {similarity:.2f}")

Similarity between chunk 1 and chunk 2: 0.01
Similarity between chunk 1 and chunk 3: 0.03
Similarity between chunk 1 and chunk 4: 0.02
Similarity between chunk 1 and chunk 5: 0.02
Similarity between chunk 1 and chunk 6: 0.04
Similarity between chunk 1 and chunk 7: 0.03
Similarity between chunk 1 and chunk 8: 0.02
Similarity between chunk 1 and chunk 9: 0.03
Similarity between chunk 1 and chunk 10: 0.02
Similarity between chunk 1 and chunk 11: 0.01
Similarity between chunk 1 and chunk 12: 0.02
Similarity between chunk 1 and chunk 13: 0.01
Similarity between chunk 1 and chunk 14: 0.02
Similarity between chunk 1 and chunk 15: 0.02
Similarity between chunk 1 and chunk 16: 0.00
Similarity between chunk 1 and chunk 17: 0.02
Similarity between chunk 1 and chunk 18: 0.01
Similarity between chunk 1 and chunk 19: 0.00
Similarity between chunk 1 and chunk 20: 0.00
Similarity between chunk 1 and chunk 21: 0.02
Similarity between chunk 1 and chunk 22: 0.02
Similarity between chunk 1 and chunk 23: 0