##### In this Jupyter file, I am going to analyse plagiarism between two texts.
##### I will try to use the N-Gram Model for this Purpose.

In [3]:
import re
from nltk.util import ngrams, pad_sequence, everygrams
from nltk.tokenize import word_tokenize
from nltk.lm import MLE, WittenBellInterpolated

In [4]:
import numpy as np
from scipy.ndimage import gaussian_filter
import plotly.graph_objects as go

In [5]:
originalText="""It is challenging to understand the effects of pollution. 
It can affect nature, human beings, animals and play a vital role in adverse climate change. 
Depletion of natural resources and destroying habitats are significant effects of pollution. 
Pollution can harm humans by breathing in smoke or consuming contaminated food. 
Moreover, it can cause health problems such as asthma, cancer, and heart diseases. 
It also affects nature by destroying plants, animals, and natural habitats. 
Once kids have understood the meaning and effects of pollution, you can ask them to write an essay on pollution.
Pollution is omnipresent and has affected the environment in all aspects. 
Be it water, soil, or air, every element of the environment is polluted. 
Water is polluted due to the release of industrial waste into rivers and water bodies. 
When the water is contaminated, the supply of drinking water decreases. 
The soil is polluted by dumping non-biodegradable elements, resulting in toxicity. 
The emission of harmful gases pollutes the air.
A high amount of carbon dioxide emission has led us to a dangerous point and is harmful to every living thing on Earth. 
Excessive usage of pesticides for crops has harmed the microorganisms living in the soil.
When we follow incorrect ways of waste management, we contribute to pollution."""

In [6]:
# Convert to lower case and remove punctuation
originalText = originalText.lower()
originalText = re.sub(r'[^\w\s]', '', originalText)


In [7]:
# Remove new lines
originalText = originalText.replace('\n', ' ')

In [8]:
# remove text inside square and curly brackets
originalText = re.sub(r'\[.*?\]|\{.*\}', '', originalText)

In [9]:
# set N-Gram number
n = 4

In [10]:
# Pad the text and tokenize
training_data = list(pad_sequence(word_tokenize(originalText), n, 
                                  pad_left=True, 
                                  left_pad_symbol="<s>"))

In [11]:
# Generate n-grams
ngrams = list(everygrams(training_data, max_len=n))

In [12]:
# Build n-gram language model
model = WittenBellInterpolated(n)
model.fit([ngrams],vocabulary_text=training_data)


In [123]:
# Text Copied from OriginalText
plagText="""It is challenging to understand the effects of pollution. 
It can affect nature, human beings, animals and play a vital role in adverse climate change. 
Depletion of natural resources and destroying habitats are significant effects of pollution. 
Pollution can harm humans by breathing in smoke or consuming contaminated food.
Winter is one of the most important seasons in India. It is a part of the four seasons that occur in India. 
Winters are the coolest season that starts from December and last till March. 
The peak time when winter is experienced the most in December and January. In India, winters hold great importance.
The soil is polluted by dumping non-biodegradable elements, resulting in toxicity. 
The emission of harmful gases pollutes the air.
A high amount of carbon dioxide emission has led us to a dangerous point and is harmful to every living thing on Earth.
During winters, schools usually take a break and close down. The days are shorter and the nights get longer. 
The chilly mornings give you a different sense altogether. Hot drinks like coffee, tea, and hot chocolate are enjoyed more during winters. 
The sun rises quite late and sometimes it does not."""

In [124]:
# Convert to lower case and remove punctuation
plagText = plagText.lower()
plagText = re.sub(r'[^\w\s]', '', plagText)


In [125]:
# Remove new lines
plagText = plagText.replace('\n', ' ')


In [126]:
testing_data = list(pad_sequence(word_tokenize(plagText), n, 
                                 pad_left=True,
                                 left_pad_symbol="<s>"))

In [127]:
score=[]
# Generate score
for i,ele in enumerate(testing_data[n-1:]):
    s = model.score(ele,testing_data[i:i+n-1])
    score.append(s)



In [128]:
score

[0.6773364485981308,
 0.7814122533748702,
 0.771320093457944,
 0.8785046728971962,
 0.7714174454828661,
 0.8820093457943925,
 0.7629566694987255,
 0.9407126168224299,
 0.8955899532710281,
 0.2753823279524214,
 0.807632398753894,
 0.6568341121495327,
 0.8761682242990654,
 0.8130841121495327,
 0.8755841121495327,
 0.8761682242990654,
 0.9197819314641744,
 0.6412091121495327,
 0.8767523364485981,
 0.7922507788161994,
 0.8755841121495327,
 0.8779205607476636,
 0.7755841121495327,
 0.8755841121495327,
 0.8755841121495327,
 0.8755841121495327,
 0.8814252336448598,
 0.7635514018691589,
 0.8130841121495327,
 0.8796728971962617,
 0.7667932242990654,
 0.8136682242990654,
 0.8130841121495327,
 0.8755841121495327,
 0.8767523364485981,
 0.9407126168224299,
 0.8955899532710281,
 0.2991716227697536,
 0.6498513169073916,
 0.7818341121495327,
 0.8755841121495327,
 0.8767523364485981,
 0.7922507788161994,
 0.8779205607476636,
 0.7755841121495327,
 0.8761682242990654,
 0.8130841121495327,
 0.876168224299

In [129]:
# Convert to numpy array
score_np=np.array(score)
score_np

array([0.67733645, 0.78141225, 0.77132009, 0.87850467, 0.77141745,
       0.88200935, 0.76295667, 0.94071262, 0.89558995, 0.27538233,
       0.8076324 , 0.65683411, 0.87616822, 0.81308411, 0.87558411,
       0.87616822, 0.91978193, 0.64120911, 0.87675234, 0.79225078,
       0.87558411, 0.87792056, 0.77558411, 0.87558411, 0.87558411,
       0.87558411, 0.88142523, 0.7635514 , 0.81308411, 0.8796729 ,
       0.76679322, 0.81366822, 0.81308411, 0.87558411, 0.87675234,
       0.94071262, 0.89558995, 0.29917162, 0.64985132, 0.78183411,
       0.87558411, 0.87675234, 0.79225078, 0.87792056, 0.77558411,
       0.87616822, 0.81308411, 0.87616822, 0.81308411, 0.        ,
       0.03271028, 0.        , 0.05140187, 0.07523364, 0.        ,
       0.        , 0.        , 0.02336449, 0.        , 0.02336449,
       0.12564901, 0.00292056, 0.        , 0.05140187, 0.07523364,
       0.        , 0.        , 0.        , 0.        , 0.02336449,
       0.        , 0.        , 0.0046729 , 0.02803738, 0.     

In [143]:
# Set height and width of the graph
width=8
height=np.ceil(len(score_np)/width).astype("int32")

In [144]:
print(height,width)

25 8


In [145]:
# Copy the score_np to a new array with shape (height,width)
a=np.zeros(width*height)
a[:len(score_np)] = score_np
diff = len(a) - len(score_np)

In [146]:
# Apply gaussian filter to the array
a = gaussian_filter(a, sigma=1.0)
a=a*1.093 # To make the probabilties better (which have been lowered because of the gaussian filter)
# Reshape to fit rectangular shape
a = a.reshape(-1,width)

In [147]:
# format labels
labels = [" ".join(testing_data[i:i+width]) for i in range(n-1, len(testing_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*diff
labels = [f"{x:60.60}" for x in labels]

In [150]:
# create heatmap of the Plagiarized Text
fig = go.Figure(data=go.Heatmap(
                z=a, x0=0, dx=1,
                y=labels, zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                colorscale='blues'))
fig.update_layout({"height":height*25, "width":1000, "font":{"family":"Courier New"}})
fig['layout']['yaxis']['autorange'] = "reversed"

In [151]:
fig.show()

In [152]:
# Reshape a to linear array
a = a.reshape(-1)

plagScore=np.sum(a[n-1:a.shape[0]-diff])/(a.shape[0]-diff-n) # Avg. Score (excluding the padding and blank text)
plagPercentage=plagScore*100

In [153]:
plagPercentage

40.453775994373075

In [163]:
plID=[] 
# The index of the tokens which are plagiarized (index wrt to plagiarized text)
for i in range(len(score)-1):
    if(score[i-1]>0.65 or score[i]>0.7 or score[i+1]>0.65):
        #pl.append(testing_data[i+n-1])
        plID.append(i)


In [164]:
# To note the fact that plagiarism can be scattered
# Anytime the indexes difference > 4, then it means those two plagiarised words came from different parts of original doc (Flaw exists but we'll improve)

jumpStates=[]
jumpStates.append(plID[0])
for i in range(len(plID)):
    if(plID[i]-plID[i-1]>4):
        jumpStates.append(plID[i-1])
        jumpStates.append(plID[i])

jumpStates.append(plID[-1])

In [183]:

originalScore=[]
# Fill originalScore with zeros
for i in range(len(training_data)):
    originalScore.append(0)

# Fill originalScore with scores of plagiarized words whenever we detect the phrase in original text
for i in range(0,len(jumpStates),2):
    id=jumpStates[i]
    fin=jumpStates[i+1]

    for j in range(len(training_data)-3):
        if((testing_data[id]==training_data[j] and testing_data[id+1]==training_data[j+1] and testing_data[id+2]==training_data[j+2])
            or
            (testing_data[id+1]==training_data[j] and testing_data[id+2]==training_data[j+1] and testing_data[id+3]==training_data[j+2])): #and pl[id+3]==training_data[j+3]):
            originalScore[j:j+fin-id+1]=a[id:fin+1]
            break
    


In [184]:
testing_data[105:150]


['importance',
 'the',
 'soil',
 'is',
 'polluted',
 'by',
 'dumping',
 'nonbiodegradable',
 'elements',
 'resulting',
 'in',
 'toxicity',
 'the',
 'emission',
 'of',
 'harmful',
 'gases',
 'pollutes',
 'the',
 'air',
 'a',
 'high',
 'amount',
 'of',
 'carbon',
 'dioxide',
 'emission',
 'has',
 'led',
 'us',
 'to',
 'a',
 'dangerous',
 'point',
 'and',
 'is',
 'harmful',
 'to',
 'every',
 'living',
 'thing',
 'on',
 'earth',
 'during',
 'winters']

In [185]:
originalScore_np=np.array(originalScore)

In [186]:
# Set height and width of the graph
width=8
height=np.ceil(len(originalScore_np)/width).astype("int32")

In [187]:
print(height,width)

27 8


In [188]:
# Copy the score_np to a new array with shape (height,width)
b=np.zeros(width*height)
b[:len(originalScore_np)] = originalScore_np
d = len(b) - len(originalScore_np)

In [189]:
# Apply gaussian filter to the array
b = gaussian_filter(b, sigma=1.0)
# Reshape to fit rectangular shape
b = b.reshape(-1,width)

In [190]:
labels = [" ".join(training_data[i:i+width]) for i in range(n-1, len(training_data), width)]
labels_individual = [x.split() for x in labels]
labels_individual[-1] += [""]*d
labels = [f"{x:60.60}" for x in labels]

In [191]:
# create heatmap for original text
fig = go.Figure(data=go.Heatmap(
                z=b, x0=0, dx=1,
                y=labels, zmin=0, zmax=1,
                customdata=labels_individual,
                hovertemplate='%{customdata} <br><b>Score:%{z:.3f}<extra></extra>',
                colorscale='blues'))
fig.update_layout({"height":height*25, "width":1000, "font":{"family":"Courier New"}})
fig['layout']['yaxis']['autorange'] = "reversed"

In [192]:
fig.show()