##### In this Jupyter file, I am going to analyse plagiarism between two texts.
##### I will try to use the N-Gram Model for this Purpose.

In [1]:
import re
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.tokenize import word_tokenize
from nltk.lm import MLE, WittenBellInterpolated

In [2]:
import numpy as np
from scipy.ndimage import gaussian_filter
import plotly.graph_objects as go

In [3]:
import spacy
from numpy.linalg import norm

In [None]:
!python -m spacy download en_core_web_lg

In [5]:
# Load the language library
nlp = spacy.load('en_core_web_lg')

In [498]:
originalText="""It is challenging to understand the effects of pollution. 
It can affect nature, human beings, animals and play a vital role in adverse climate change. 
Depletion of natural resources and destroying habitats are significant effects of pollution. 
Pollution can harm humans by breathing in smoke or consuming contaminated food. 
Moreover, it can cause health problems such as asthma, cancer, and heart diseases. 
It also affects nature by destroying plants, animals, and natural habitats. 
Once kids have understood the meaning and effects of pollution, you can ask them to write an essay on pollution.
Pollution is omnipresent and has affected the environment in all aspects. 
Be it water, soil, or air, every element of the environment is polluted. 
Water is polluted due to the release of industrial waste into rivers and water bodies. 
When the water is contaminated, the supply of drinking water decreases. 
The soil is polluted by dumping non-biodegradable elements, resulting in toxicity. 
The emission of harmful gases pollutes the air.
A high amount of carbon dioxide emission has led us to a dangerous point and is harmful to every living thing on Earth. 
Excessive usage of pesticides for crops has harmed the microorganisms living in the soil.
When we follow incorrect ways of waste management, we contribute to pollution."""

originalText = originalText.replace('\n', ' ')
originalText = originalText.replace('\t', '')
# Remove punctuation
originalText = re.sub(r'[^\w\s]', '', originalText)

In [499]:
# Text Copied from OriginalText (not fully copied)
plagText="""In the hilly areas, people experience snow during winters.
They have to shovel it out of the way to make way for walking. 
The essence of winters is enhanced by Christmas as well. 
It sets the holiday mood for people and is admired all over the world.
It is challenging to understand the effects of pollution. 
It can affect nature, human beings, animals and play a vital role in adverse climate change. 
Depletion of natural resources and destroying habitats are significant effects of pollution. 
Pollution can harm humans by breathing in smoke or consuming contaminated food.
Winter is one of the most important seasons in India. It is a part of the four seasons that occur in India. 
Winters are the coolest season that starts from December and last till March. 
The peak time when winter is experienced the most in December and January. In India, winters hold great importance.
The soil is polluted by dumping non-biodegradable elements, resulting in toxicity. 
The emission of harmful gases pollutes the air.
A high amount of carbon dioxide emission has led us to a dangerous point and is harmful to every living thing on Earth.
During winters, schools usually take a break and close down. The days are shorter and the nights get longer. 
The chilly mornings give you a different sense altogether. 
Hot drinks like coffee, tea, and hot chocolate are enjoyed more during winters. 
The sun rises quite late and sometimes it does not."""

plagText = plagText.replace('\n', ' ')
plagText = plagText.replace('\t', '')
# Remove punctuation
plagText = re.sub(r'[^\w\s]', '', plagText)

In [500]:
vecOriginalText = nlp(originalText)

In [501]:
vecPlagText = nlp(plagText)

In [502]:
n=4

In [503]:
wordOriginal = [word.text for word in vecOriginalText]

In [504]:
training_data = list(pad_sequence(wordOriginal, n, 
                                  pad_left=True, 
                                  left_pad_symbol="<s>"))

In [505]:
# Generate n-grams from the training data
ngrams = list(everygrams(training_data, max_len=n))
# Build n-gram language model
model = WittenBellInterpolated(n)
model.fit([ngrams],vocabulary_text=training_data)

In [506]:
wordPlag = [word.text for word in vecPlagText]

In [507]:
test_data = list(pad_sequence(wordPlag, n, 
                                  pad_left=True, 
                                  left_pad_symbol="<s>"))

In [508]:
score=[]
# Generate score
for i,ele in enumerate(test_data[n:]):
    s = model.score(ele,test_data[i:i+n])
    score.append(s)

In [509]:
# Convert to numpy array
score_np=np.array(score)


In [510]:
# Set height and width of the graph
width=14
height=np.ceil(len(score_np)/width).astype("int32")

In [511]:
# Copy the score_np to a new array with shape (height,width)
a=np.zeros(width*height)
a[:len(score_np)] = score_np
diff = len(a) - len(score_np)

In [512]:
# Apply gaussian filter to the array
a = gaussian_filter(a, sigma=1.0)
a=a*1.093 # To make the probabilties better (which have been lowered because of the gaussian filter)
# Reshape to fit rectangular shape
a = a.reshape(-1,width)

In [513]:
# format labels
labels = [" ".join(vecPlagText[i:i+width].text.split()) for i in range(0, len(vecPlagText), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*diff
labels = [f"{x:60.60}" for x in labels]

In [514]:
# create heatmap of the Plagiarized Text
fig = go.Figure(data=go.Heatmap(
                z=a, x0=0, dx=1,
                #y=labels, 
                zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                text=labels_individual,
                texttemplate='%{text}',
                textfont={"size":8},
                colorscale='reds'
                ))
fig.update_layout({"height":height*25, "width":1000, "font":{"family":"sans-serif"}})
fig['layout']['yaxis']['autorange'] = "reversed"


In [515]:
# Show the plot
fig.show()

In [516]:
vecPlagText[10:14].text

'have to shovel it'

In [517]:
# Reshape a to linear array
a = a.reshape(-1)

plagScore=np.sum(a[n-1:a.shape[0]-diff])/(a.shape[0]-diff-n) # Avg. Score (excluding the padding and blank text)
plagPercentage=plagScore*100
plagPercentage

33.53983290961752

In [518]:
plID=[] 
# The index of the tokens which are plagiarized (index wrt to plagiarized text)
for i in range(len(a)-2):
    if(a[i-1]>0.55 or a[i]>0.6 or a[i+1]>0.55 or a[i+2]>0.55):
        #pl.append(testing_data[i+n-1])
        plID.append(i)

In [519]:
# To note the fact that plagiarism can be scattered
# Anytime the indexes difference > 4, then it means those two plagiarised words came from different parts of original doc (Flaw exists but we'll improve)
jumpStates=[]
jumpStates.append(plID[0])
for i in range(len(plID)):
    if(plID[i]-plID[i-1]>4):
        jumpStates.append(plID[i-1])
        jumpStates.append(plID[i])

jumpStates.append(plID[-1])

In [520]:

originalScore=[]
# Fill originalScore with zeros
for i in range(len(vecOriginalText)):
    originalScore.append(0)

# Fill originalScore with scores of plagiarized words whenever we detect the phrase in original text
for i in range(0,len(jumpStates),2):
    id=jumpStates[i]
    fin=jumpStates[i+1]

    for j in range(len(vecOriginalText)-3):
        # X is Vector of the plagiarized text present between indexes id and fin
        X=vecPlagText[id:fin+1]
        # Y is Vector of the original text present between indexes j and j+l
        Y=vecOriginalText[j:j+fin-id+1]
        
        #Computing cosine similarity
        sim = X.similarity(Y)

        # If the cosine similarity is greater than 0.999, then we get a match from original text
        if(sim>0.999):
            originalScore[j:j+fin-id+1]=a[id:fin+1]
            break
        

In [521]:
originalScore_np=np.array(originalScore)

In [522]:
# Set height and width of the graph
width=12
height=np.ceil(len(originalScore_np)/width).astype("int32")

In [523]:
# Copy the score_np to a new array with shape (height,width)
b=np.zeros(width*height)
b[:len(originalScore_np)] = originalScore_np
d = len(b) - len(originalScore_np)

In [524]:
# Apply gaussian filter to the array
b = gaussian_filter(b, sigma=1.0)
#b=b*1.093
# Reshape to fit rectangular shape
b = b.reshape(-1,width)

In [525]:
labels = [" ".join(vecOriginalText[i:i+width].text.split()) for i in range(0, len(vecOriginalText), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*d
labels = [f"{x:60.60}" for x in labels]

In [526]:
# create heatmap of the Plagiarized Text
fig = go.Figure(data=go.Heatmap(
                z=b, x0=0, dx=1,
                #y=labels, 
                zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                text=labels_individual,
                texttemplate='%{text}',
                textfont={"size":8},
                colorscale='reds'
                ))
fig.update_layout({"height":height*25, "width":1000, "font":{"family":"sans-serif"}})
fig['layout']['yaxis']['autorange'] = "reversed"

In [527]:
fig.show()