In [2]:
# Fetch the NLTK resources used by the preprocessing cells below
# (stop-word list, WordNet, tokenizer models, POS tagger).
import nltk
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Import other necessary libraries
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.svm import SVC  #implements svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Load the labelled text pairs. Later cells read the 'Original_Text',
# 'suspicious_Text' and 'Label' columns from this frame.
data = pd.read_csv("dataset.csv")

In [4]:
# Sanity check: (rows, columns) of the loaded frame
data.shape

(402, 3)

In [5]:
# Class balance: number of rows for each value of 'Label'
data['Label'].value_counts()

Label
1    201
0    201
Name: count, dtype: int64

In [6]:
# Translate a POS tag into the WordNet POS constant the lemmatizer expects
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching the first letter of `tag`.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Default to noun
    return wordnet.NOUN

In [7]:
# Define a list of common words that should not be flagged as plagiarism
common_words = ["name", "the", "and", "is", "a", "to", "in", "of", "for", "on", "with"]

# Lazily-filled cache of NLTK's English stop words, so the set is built once
# instead of on every preprocess_text() call.
_stop_words_cache = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    Steps: strip ASCII punctuation, lowercase, tokenize, drop English stop
    words and the entries of `common_words`, then lemmatize each remaining
    token using its POS tag.

    Parameters
    ----------
    text : str or None
        Raw text. `None` and float NaN (how pandas represents an empty cell)
        yield an empty string; any other non-string value is converted with
        `str()` first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ("" if nothing is left).
    """
    # Missing cells would otherwise crash on the string methods below.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()
    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and common words; the union is rebuilt per call so that
    # edits to `common_words` are picked up, while lookups stay O(1).
    excluded = _english_stop_words().union(common_words)
    filtered_words = [word for word in words if word not in excluded]

    # Lemmatize with POS tagging
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tag(filtered_words)
    ]

    # Join the words back into a single string
    return " ".join(lemmatized_words)



In [21]:
# Smoke test for preprocess_text: the symbol-only token is removed and
# "running" is lemmatized, so this prints "run".
output = preprocess_text("""
running @!&#
""")

print(output)


run


In [9]:
# Run both text columns through preprocess_text. The raw text is overwritten
# in place, so re-running this cell feeds already-processed text back in.
for column in ('Original_Text', 'suspicious_Text'):
    data[column] = data[column].apply(preprocess_text)

In [10]:
# Inspect the frame after preprocessing
data

Unnamed: 0,Original_Text,suspicious_Text,Label
0,personality behavioural change another critica...,change attitude personality important factor t...,1
1,2008 financial crisis trend increase income in...,2008 financial crisis trend increase income in...,1
2,quick brown fox jump lazy dog,fast brown fox leap lazy dog,1
3,sun shin brightly,sun shine brightly,1
4,researcher discover new specie butterfly amazo...,scientist find previously unknown butterfly sp...,1
...,...,...,...
397,might think chemistry context lab test food ad...,people may think chemistry useful chemist rese...,0
398,excessive screen time toddler lead negative ou...,screen time reduces amount quality interaction...,0
399,fossil fuel coal oil gas far large contributor...,climate change press global challenge pose ser...,0
400,technological evolution also improve efficienc...,technology improve life many way development a...,0


In [11]:
# Convert text to numerical form: each sample is the original text and the
# suspicious text joined by a space, embedded as a single TF-IDF vector.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split done in a later cell, so held-out rows also shape the vocabulary and
# IDF weights.
tfidf_vectorizer = TfidfVectorizer()
paired_text = data['Original_Text'] + " " + data['suspicious_Text']
x = tfidf_vectorizer.fit_transform(paired_text)

In [12]:
# Target labels; detect() below reports a prediction of 1 as "Plagiarism detected"
y = data['Label']

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): no stratify= is passed, so the class proportions of the two
# splits are not forced to match — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2,random_state=42)


In [14]:
# Train a linear-kernel SVM and report its accuracy on the held-out split
model = SVC(kernel='linear', random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)

# Print each metric under the same label the original cell used
for label, metric in (
    ("accuracy", accuracy_score),
    ("classification", classification_report),
    ("confusion", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

accuracy 0.8765432098765432
classification               precision    recall  f1-score   support

           0       0.86      0.86      0.86        36
           1       0.89      0.89      0.89        45

    accuracy                           0.88        81
   macro avg       0.88      0.88      0.88        81
weighted avg       0.88      0.88      0.88        81

confusion [[31  5]
 [ 5 40]]


In [15]:
import pickle

# Persist the trained classifier and the fitted vectorizer. The context
# managers make sure each file is flushed and closed right after writing,
# rather than leaving the handle open until garbage collection.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [16]:
# Reload the persisted classifier and vectorizer, closing each file once read.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

In [17]:
def detect(input_text):
    """Classify a single piece of text with the trained SVM.

    The text is run through preprocess_text, embedded with the fitted
    `tfidf_vectorizer`, and passed to `model.predict`.

    NOTE(review): the vectorizer and model were fitted on original and
    suspicious text joined together, whereas this function embeds one text
    on its own.

    Returns
    -------
    dict
        "preprocessed_text": the cleaned text that was vectorized.
        "result": "Plagiarism detected" when the predicted label is 1,
        otherwise "No Plagiarism detected".
    """
    cleaned = preprocess_text(input_text)
    features = tfidf_vectorizer.transform([cleaned])
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        verdict = "Plagiarism detected"
    else:
        verdict = "No Plagiarism detected"

    return {"preprocessed_text": cleaned, "result": verdict}


In [18]:
# Example 1: a paragraph that contains a quoted, copied sentence.
# The single-quote characters at the start and end of the paragraph (and the
# lone one on the last line) sit inside the triple-quoted literal, so they are
# part of the text passed to detect().
input_text = """
'Climate change is a pressing global issue that affects every corner of the world. The Earth\'s temperature is rising due to the accumulation of greenhouse gases in the atmosphere, primarily carbon dioxide and methane. This warming trend leads to severe consequences, including rising sea levels, extreme weather events, and disruptions to ecosystems. \"Human activities, such as burning fossil fuels and deforestation, are the main drivers of climate change\" (copied from a source). Addressing climate change requires a global effort to transition to renewable energy sources, reduce emissions, and protect natural carbon sinks like forests and oceans.'
'"""


detect(input_text)

{'preprocessed_text': 'climate change press global issue affect every corner world earths temperature rise due accumulation greenhouse gas atmosphere primarily carbon dioxide methane warm trend lead severe consequence include rise sea level extreme weather event disruption ecosystems human activity burn fossil fuel deforestation main driver climate change copy source address climate change require global effort transition renewable energy source reduce emission protect natural carbon sink like forest ocean',
 'result': 'Plagiarism detected'}

In [19]:
# Example 2: a short sentence; after stop-word and common-word removal only
# one token is left to classify.
input_text = ' My name is Hafeeza '
detect(input_text)

{'preprocessed_text': 'hafeeza', 'result': 'No Plagiarism detected'}