#### In this notebook, we put together all the features we have built so far and scale them up into a single pipeline
#### (May the Force be with Us)

In [245]:
import re
import nltk
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.lm import MLE, WittenBellInterpolated
from scipy.ndimage import gaussian_filter
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import spacy
from zipfile import ZipFile
from deep_translator import GoogleTranslator
from PyPDF2 import PdfReader
# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# from pdfminer.converter import TextConverter
# from pdfminer.layout import LAParams
# from pdfminer.pdfpage import PDFPage
import io
import os
from io import StringIO

-> preProcess(text): returns text

- Remove punctuation, new lines, tabs and extra spaces.
- Remove the "Machine Translated by Google" tag
- Remove the Bibliography and References Section


In [246]:
def preProcess(text):
    """Normalise raw extracted text before it is tokenised.

    Steps, in order:
      1. Drop every line that starts with an http:// or https:// URL.
      2. Strip punctuation (anything that is neither a word character nor whitespace).
      3. Replace newlines and tabs with spaces.
      4. Remove the "Machine Translated by Google" tag.
      5. Collapse runs of whitespace into single spaces.
      6. Cut the text at the first occurrence of 'Bibliography', then of 'References'.

    Parameters
    ----------
    text : str
        Raw text, e.g. as returned by extractText().

    Returns
    -------
    str
        The cleaned text. When a 'Bibliography'/'References' cut happens, the
        space that preceded the heading is kept at the end of the result.
    """
    # URL lines must go first: once punctuation is stripped the "://" is gone
    # and the pattern can no longer match, and once newlines are flattened the
    # MULTILINE '^' anchor has no line starts left to match against.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]','',text)
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("Machine Translated by Google",' ')
    text = " ".join(text.split())

    # Everything from the first section heading onwards is discarded.
    for heading in ('Bibliography', 'References'):
        i = text.find(heading)
        if i != -1:
            text = text[:i]

    return text

-> extractText(fileName,start): 
- Returns extracted text from 'fileName' PDF from 'start' page

In [72]:
def extractText(fileName,start):
    """Return the text of a PDF, concatenated from page index `start` to the last page.

    Parameters
    ----------
    fileName : str
        Path of the PDF file, passed straight to PdfReader.
    start : int
        Index of the first page to read (0 reads the whole document).

    Returns
    -------
    str
        The page texts joined without a separator; an empty string when
        `start` is past the last page.
    """
    reader = PdfReader(fileName)
    pageCount = len(reader.pages)
    # extract_text() is the supported method name; the camelCase extractText()
    # alias is deprecated in PyPDF2 and raises in 3.x releases.
    chunks = [str(reader.pages[i].extract_text()) for i in range(start, pageCount)]
    # Join once instead of growing a string page by page.
    return "".join(chunks)


-> extractFromZIP(zipFileName):
- Extracts the Corpus PDFs from the specified ZIP File

In [247]:
def extractFromZIP(zipFileName):
    """Unpack every member of the ZIP archive into the current working directory.

    `zipFileName` is opened as given, so a bare file name is looked up in the
    working directory (normally the folder holding the notebook).
    Returns None.
    """
    archive = ZipFile(zipFileName, 'r')
    try:
        archive.extractall()
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

In [249]:
# Load the language library
# (spaCy pipeline 'en_core_web_lg'; loaded once and reused by every later
# `nlp(...)` call in this notebook: createVector, createTestData and the
# trainingVector cell).
nlp = spacy.load('en_core_web_lg')

-> createCorpus():
- Create and Return List of Source Doc Names

In [279]:
def createCorpus():
    """Return the names of all PDF files in the current working directory.

    The extension check is case-insensitive ('.pdf', '.PDF', '.Pdf', ...), and
    the names are returned sorted: os.listdir() gives no ordering guarantee,
    and the position of a name in this list is what createReferences() uses as
    the document index, so a stable order keeps those indexes reproducible
    between runs.

    Returns
    -------
    list of str
        File names (not full paths), in sorted order.
    """
    return sorted(name for name in os.listdir() if name.lower().endswith(".pdf"))

-> createListOfCorpus(corpus):
- Create and return list of Individual contents of the Corpus Docs

In [250]:
def createListOfCorpus(corpus):
    """Return the cleaned text of every corpus document, in corpus order.

    Each file name in `corpus` is read with extractText() starting at page
    index 1 (the first page is skipped) and then passed through preProcess().
    """
    return [preProcess(extractText(docName, 1)) for docName in corpus]

-> createReferences(corpus):
- Create and return dictionary of the Corpus Doc Names for mapping back Plagiarised Portions

In [292]:
def createReferences(corpus):
    """Map each document's position in `corpus` to its file name.

    Returns a dict {0: corpus[0], 1: corpus[1], ...}, used to translate the
    document indexes stored in plag_dict back into source names.
    """
    return dict(enumerate(corpus))

-> createVector(listOfCorpus):
- Create and return list of vectors of the Individual contents of the Corpus Docs

In [251]:
def createVector(listOfCorpus):
    """Run every corpus text through the `nlp` pipeline.

    Returns a list with one `nlp(text)` result per entry of `listOfCorpus`,
    in the same order.
    """
    return [nlp(docText) for docText in listOfCorpus]

-> trainModel(trainingVector, n):
- Trains model based on Witten Bell Interpolation over trainingVector (of entire Corpus combined) and n-gram value 'n'
- Returns model

In [252]:
def trainModel(trainingVector, n):
    """Fit a Witten-Bell interpolated language model of order `n`.

    Parameters
    ----------
    trainingVector : iterable of tokens
        Tokens of the combined corpus; only each token's `.text` is used.
    n : int
        Order of the model; also the `n` passed to pad_sequence and the
        `max_len` passed to everygrams.

    Returns
    -------
    WittenBellInterpolated
        The fitted model.
    """
    tokens = [token.text for token in trainingVector]
    # Left-pad with "<s>" so the first real words also have a context.
    padded = list(pad_sequence(tokens, n,
                               pad_left=True,
                               left_pad_symbol="<s>"))
    # The whole padded corpus is handed to fit() as one "sentence" of grams.
    grams = list(everygrams(padded, max_len=n))
    languageModel = WittenBellInterpolated(n)
    languageModel.fit([grams], vocabulary_text=padded)
    return languageModel

-> createTestData(fileName):
- Create the test data from the Suspicious Doc and return its vector

In [253]:
def createTestData(fileName, ngramOrder=None):
    """Build the test data for the suspicious document.

    The PDF is read from page index 0, cleaned with preProcess(), tokenised
    with `nlp`, and the token texts are left-padded with "<s>" for an n-gram
    model.

    Parameters
    ----------
    fileName : str
        Path of the suspicious PDF.
    ngramOrder : int, optional
        The `n` passed to pad_sequence. When omitted, the notebook-global `n`
        is used, so existing one-argument calls behave as before.

    Returns
    -------
    (list of str, Doc)
        The padded word list and the `nlp` result it was built from.
    """
    # Prefer the explicit argument; fall back to the global only when needed.
    order = n if ngramOrder is None else ngramOrder
    suspiciousText = preProcess(extractText(fileName,0))
    testVector = nlp(suspiciousText)
    words = [w.text for w in testVector]
    test_data = list(pad_sequence(words, order,
                                pad_left=True,
                                left_pad_symbol="<s>"))
    return test_data,testVector


-> generateScores(model, test_data):
- Get the plagiarism-probability score of each individual word, based on the context of the 'n-1' words preceding it
- Return numpy array of those scores

In [254]:
def generateScores(model, test_data, n=None):
    """Score every word of the test data given its preceding n-1 words.

    Parameters
    ----------
    model : fitted language model
        Must provide `score(word, context)`; its `order` attribute is read
        when `n` is not given.
    test_data : list of str
        Word list left-padded with n-1 pad symbols (see createTestData).
    n : int, optional
        n-gram order. Defaults to `model.order`, so the function no longer
        depends on a notebook-global `n` being defined.

    Returns
    -------
    numpy.ndarray
        One score per entry of test_data[n-1:], i.e. per unpadded word.
    """
    if n is None:
        n = model.order
    contextLen = n - 1
    # The word at padded position i+contextLen is scored against the
    # contextLen words that precede it.
    scores = [
        model.score(word, test_data[i:i + contextLen])
        for i, word in enumerate(test_data[contextLen:])
    ]
    return np.array(scores)

-> generateData(width,score_np):
- Creating another numpy array of matrix dimensions to fit in a heatmap
- Creating source labels to label individual Plagiarised portions
- Returns 'diff', the number of padding cells added (the size difference between the flat padded array 'a' and score_np)


In [255]:
def generateData(width,score_np):
    """Lay the word scores out as a smoothed grid with `width` columns.

    The scores are copied into a zero-filled flat array long enough to fill
    complete rows, smoothed with a Gaussian filter (sigma=1.0) and reshaped.

    Returns
    -------
    (numpy.ndarray, list of str, int)
        The smoothed (rows, width) grid, a flat list holding "Unplagiarised"
        for every grid cell, and the number of zero-padding cells appended.
    """
    count = len(score_np)
    rows = np.ceil(count/width).astype("int32")

    # Zero-padded flat copy of the scores.
    padded = np.zeros(width*rows)
    padded[:count] = score_np
    padding = len(padded) - count

    # Default label for every cell; overwritten later for matched portions.
    sourceLabel = ["Unplagiarised"]*(width*rows)

    # Smooth on the flat array, then fold into rows.
    grid = gaussian_filter(padded, sigma=1.0).reshape(-1,width)
    return grid, sourceLabel, padding

-> getPlagPercentage(a,diff):
- Get back the Percentage of Plagiarism
- Get back the indexes of the Plagiarised words

In [256]:
def getPlagPercentage(a,diff,highThreshold=0.62,neighbourThreshold=0.60):
    """Flag plagiarised word positions and compute the plagiarism percentage.

    Position i is flagged when its own score exceeds `highThreshold`, or when
    either of the next two scores (i+1, i+2) exceeds `neighbourThreshold`.
    Near the end of the array only the neighbours that exist are checked, so
    the last two positions are evaluated as well.

    Parameters
    ----------
    a : numpy.ndarray
        Score grid (any shape; it is flattened).
    diff : int
        Number of padding cells in `a`; excluded from the denominator.
    highThreshold, neighbourThreshold : float, optional
        Score limits for a position itself and for its two successors.

    Returns
    -------
    (float, list of int)
        Percentage of flagged positions relative to the real (unpadded) word
        count, and the flat indexes of the flagged positions. The percentage
        is 0.0 when there are no real words.
    """
    flat = a.reshape(-1)
    total = flat.shape[0]
    plID = []
    for i in range(total):
        # Up to two successors; the slice is shorter (or empty) at the tail.
        lookahead = flat[i + 1:i + 3]
        if flat[i] > highThreshold or (lookahead > neighbourThreshold).any():
            plID.append(i)

    scoredWords = total - diff
    # Guard the empty-document case instead of dividing by zero.
    plagPercentage = len(plID) * 100 / scoredWords if scoredWords > 0 else 0.0

    return plagPercentage,plID

-> check_plagiarised(j,vecO,vecP,id,fin,plag_dict):
- Mapping the Plagiarised portions to their Sources
- Returns the dictionary mapping each source doc index to the index ranges of the Plagiarised portions and of their corresponding source portions

In [257]:
def check_plagiarised(j,vecO,vecP,id,fin,plag_dict,threshold=0.997,window=20):
    """Search one source document for the suspicious passage vecP[id:fin].

    Three spans of the suspicious document are compared, via `.similarity()`,
    with windows of the source document that start at every offset k:
      * the first `window` tokens from `id`   vs. source[k:k+window]
      * the `window` tokens ending at `fin`   vs. source[k:k+window]
      * the whole passage [id:fin]            vs. source[k:k+fin-id+1]
    The first offset where any similarity exceeds `threshold` is recorded and
    the search stops.

    Parameters
    ----------
    j : hashable
        Key of the source document in `plag_dict` (its corpus index).
    vecO, vecP : sliceable sequences whose slices provide `.similarity()`
        Source document and suspicious document.
    id, fin : int
        Start and end index of the suspicious passage. (`id` shadows the
        builtin; the name is kept because callers may pass it by keyword.)
    plag_dict : dict
        Updated in place: plag_dict[j] gets [[id, fin], [k, k+fin-id+1]]
        appended on a match.
    threshold : float, optional
        Similarity above which a window counts as a match.
    window : int, optional
        Number of tokens in the head/tail comparison spans.

    Returns
    -------
    dict
        The same `plag_dict` object.
    """
    # These spans depend only on the suspicious passage, so build them once
    # instead of on every source offset.
    passageHead = vecP[id:id+window]
    # Clamp at 0: for fin < window a negative start would be counted from the
    # end of the document instead of meaning "from the beginning".
    passageTail = vecP[max(fin-window, 0):fin]
    passageFull = vecP[id:fin]
    sourceLen = fin-id+1

    for k in range(len(vecO)-3):
        sourceHead = vecO[k:k+window]
        sourceWindow = vecO[k:k+sourceLen]

        if (passageHead.similarity(sourceHead) > threshold
                or passageTail.similarity(sourceHead) > threshold
                or passageFull.similarity(sourceWindow) > threshold):
            plag_dict.setdefault(j, []).append([[id,fin],[k,k+sourceLen]])
            break

    return plag_dict

-> getJumpStates(plID):
- Returns the specific Indexes of the Plagiarised portions from the Suspicious Doc

In [258]:
def getJumpStates(plID, maxGap=11):
    """Collapse flagged word indexes into start/end pairs of plagiarised runs.

    Consecutive indexes that are at most `maxGap` apart belong to the same
    run; a larger gap closes the current run and opens a new one.

    Parameters
    ----------
    plID : list of int
        Flagged indexes in ascending order.
    maxGap : int, optional
        Largest index difference still treated as the same run.

    Returns
    -------
    list of int
        Flat list [start0, end0, start1, end1, ...]; empty when `plID` is empty.
    """
    if len(plID) == 0:
        # Nothing was flagged: there are no runs to report.
        return []

    jumpStates = [plID[0]]
    # Compare each index with its real predecessor (starting at the second
    # element, so the first is never compared against plID[-1]).
    for previous, current in zip(plID, plID[1:]):
        if current - previous > maxGap:
            jumpStates.append(previous)
            jumpStates.append(current)

    jumpStates.append(plID[-1])
    return jumpStates

-> getPlagDict(jumpStates,vector,testVector):
- For each pair in the jumpStates, it adds its source to the plag_dict and returns the plag_dict after completion

In [259]:
def getPlagDict(jumpStates,vector,testVector):
    """Look up the source of every flagged portion of the suspicious document.

    `jumpStates` is read two entries at a time as (start, end) pairs. For each
    pair, check_plagiarised() is run against every document in `vector`, and
    the dictionary it returns is carried on to the next call.

    Returns the resulting dictionary, keyed by the document's index in `vector`.
    """
    found = {}
    for pairStart in range(0, len(jumpStates), 2):
        id = jumpStates[pairStart]
        fin = jumpStates[pairStart + 1]

        # Try every corpus document for this portion.
        for docIndex, docVector in enumerate(vector):
            found = check_plagiarised(docIndex, docVector, testVector, id, fin, found)

    return found

-> populateSourceLabel(plag_dict,sourceLabel,references,width):
- Fills the SourceLabels with the original names of the sources from which plagiarised portions have been taken

In [260]:
def populateSourceLabel(plag_dict,sourceLabel,references,width):
    """Label every matched portion with the name of its source document.

    Parameters
    ----------
    plag_dict : dict
        {docIndex: [[[start, end], [srcStart, srcEnd]], ...]} as built by
        check_plagiarised(); only the [start, end] pair is used here.
    sourceLabel : list of str or numpy.ndarray
        Current per-cell labels, flat or already reshaped. It is not
        modified, so the calling cell can be re-run safely.
    references : dict
        {docIndex: document name}.
    width : int
        Number of columns of the returned grid.

    Returns
    -------
    numpy.ndarray
        Labels reshaped to (-1, width).
    """
    # Work on a flat copy so the caller's object is left untouched; this also
    # accepts the 2-D array returned by an earlier call.
    labels = np.asarray(sourceLabel).reshape(-1).tolist()

    for docIndex, matches in plag_dict.items():
        sourceName = references[docIndex]
        for match in matches:
            st, en = match[0]
            # Slice assignment with an equally long list keeps the overall
            # length unchanged even when `en` runs past the end.
            span = len(labels[st:en+1])
            labels[st:en+1] = [sourceName] * span

    return np.array(labels).reshape(-1,width)

-> createLabels(testVector,width,diff):
- Creates the content for writing in the Heatmaps
- Basically the entire suspicious doc

In [261]:
def createLabels(testVector,width,diff):
    """Build the text shown in the heatmap: one row per `width` tokens.

    Parameters
    ----------
    testVector : tokenised suspicious document
        Must support len(), slicing into spans with a `.text` attribute, and
        iteration over tokens that have a `.text` attribute.
    width : int
        Tokens per heatmap row.
    diff : int
        Number of empty cells appended to the last row so that it has the
        same length as the others.

    Returns
    -------
    (list of str, list of list of str)
        `labels`: each row's text, whitespace-normalised and padded/truncated
        to exactly 60 characters. `labels_individual`: the per-cell token
        texts of each row.
    """
    rows = [testVector[i:i+width] for i in range(0, len(testVector), width)]

    labels = []
    for row in rows:
        rowText = " ".join(row.text.split())
        labels.append(f"{rowText:60.60}")

    # One entry per token rather than per whitespace-separated word: tokens
    # are not always separated by whitespace, and the scores are computed per
    # token, so splitting the text could leave rows shorter than `width` and
    # shift words against their scores.
    labels_individual = [[token.text for token in row] for row in rows]
    if labels_individual:
        # An empty document has no last row to pad.
        labels_individual[-1] += [""]*diff

    return labels,labels_individual

-> generateHeatMap(a,sourceLabel,width,height,labels_individual):
- Generates heatmap to help visualise the plagiarised portions
- Returns the created figure

In [262]:
def generateHeatMap(a,sourceLabel,width,height,labels_individual):
    """Build the Plotly heatmap of the plagiarism scores.

    Parameters
    ----------
    a : numpy.ndarray
        Scores; reshaped to `width` columns before plotting.
    sourceLabel : array-like
        Per-cell source names, shown in the hover text.
    width : int
        Number of columns.
    height : int
        Number of rows; the figure is given 25 pixels of height per row.
    labels_individual : list of list of str
        Per-cell text drawn inside the heatmap.

    Returns
    -------
    plotly.graph_objects.Figure
        Heatmap on a fixed 0..1 colour range with a reversed y axis.
    """
    scoreGrid = a.reshape(-1,width)

    heatmap = go.Heatmap(
        z=scoreGrid, x0=0, dx=1,
        zmin=0, zmax=1,
        customdata=sourceLabel,
        hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
        text=labels_individual,
        texttemplate='%{text}',
        textfont={"size":7},
        colorscale='reds',
    )
    figure = go.Figure(data=heatmap)

    layoutOptions = {"height":height*25, "width":1000, "font":{"family":"sans-serif"}}
    figure.update_layout(layoutOptions)
    # Reverse the y axis so the first row is drawn at the top.
    figure['layout']['yaxis']['autorange'] = "reversed"
    return figure

In [280]:
# Call all the functions
# Unpack the corpus PDFs into the working directory.
extractFromZIP('Corpus.zip')
# createCorpus() lists every PDF in the working directory, and
# "Suspicious Doc.pdf" is later read from that same directory. Keep it out of
# the corpus so the suspicious document is never trained on or matched
# against itself. (No effect when the file is not in this directory.)
corpus = [name for name in createCorpus() if name != "Suspicious Doc.pdf"]
references = createReferences(corpus)
listOfCorpus = createListOfCorpus(corpus)
vector = createVector(listOfCorpus)

In [281]:
# Concatenate all pre-processed corpus documents (newline-separated) into one
# text and run it through `nlp`; trainModel() reads its tokens from the result.
megaCorpus = '\n'.join(listOfCorpus)
trainingVector = nlp(megaCorpus)

In [282]:
# Value of n for n-grams
# Passed to trainModel() below; it is also used as the n-gram order when the
# test data is padded, so this cell must run before createTestData() is called.
n = 5

In [283]:
# Call function to train the model
# (Witten-Bell interpolated n-gram model fitted on the combined corpus tokens).
model = trainModel(trainingVector,n)


In [284]:
# Extract (from page index 0), pre-process and tokenise the suspicious PDF.
# test_data: left-padded word list; testVector: the `nlp` result it came from.
test_data,testVector = createTestData("Suspicious Doc.pdf")

In [285]:
# Score every (unpadded) word of the suspicious text under the trained model;
# the result is a NumPy array with one entry per word.
score_np = generateScores(model,test_data)

In [286]:
# Keeping width = 22 for readability
# a: smoothed scores as a 22-column grid; sourceLabel: flat list of default
# "Unplagiarised" labels; diff: number of zero-padding cells added to `a`.
a,sourceLabel,diff = generateData(22,score_np)

In [287]:
# Percentage of words flagged as plagiarised, and the flat indexes (plID) of
# the flagged words.
plagPercentage,plID = getPlagPercentage(a,diff)

In [288]:
# Collapse the flagged indexes into start/end pairs, one pair per run of
# flagged words (runs are split where neighbouring indexes are more than 11 apart).
jumpStates = getJumpStates(plID)

In [289]:
# For every flagged run, search each corpus document for a matching passage.
plag_dict = getPlagDict(jumpStates,vector,testVector)

In [295]:
# Replace the default labels with source document names for matched runs.
# NOTE: this rebinds `sourceLabel` from the flat list returned by
# generateData() to a 2-D array with 22 columns.
sourceLabel = populateSourceLabel(plag_dict,sourceLabel,references,22)

In [296]:
# Text of the suspicious document laid out 22 tokens per row:
# labels holds one string per row, labels_individual the per-cell words.
labels,labels_individual = createLabels(testVector,22,diff)

In [297]:
# Number of heatmap rows for 22 columns (the same ceil(len/width) computation
# that generateData() performs), then build the figure.
height = np.ceil(len(score_np)/22).astype("int32")
fig = generateHeatMap(a,sourceLabel,22,height,labels_individual)

In [298]:
# Render the Plotly figure.
fig.show()