In [1]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
from nltk.corpus import stopwords
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv("dataset.csv")
print(data.head())

   Unnamed: 0                                        source_text  \
0           0  Researchers have discovered a new species of b...   
1           1  The moon orbits the Earth in approximately 27....   
2           2  Water is composed of two hydrogen atoms and on...   
3           3          The history of Rome dates back to 753 BC.   
4           4  Pluto was once considered the ninth planet in ...   

                                    plagiarized_text  label  
0  Scientists have found a previously unknown but...      1  
1  Our natural satellite takes around 27.3 days t...      1  
2  H2O consists of 2 hydrogen atoms and 1 oxygen ...      1  
3  Rome has a long history that can be traced bac...      1  
4  In the past, Pluto was classified as the ninth...      1  


In [3]:
data.shape

(370, 4)

In [4]:
data['label'].value_counts

<bound method IndexOpsMixin.value_counts of 0      1
1      1
2      1
3      1
4      1
      ..
365    0
366    0
367    0
368    0
369    0
Name: label, Length: 370, dtype: int64>

In [5]:
def pro_text(text):
    #text= text.translate(str.maketrans("","", string.punctuation))
    text = ''.join(char for char in text if char not in string.punctuation)
    text= text.lower()
    stop_words= set(stopwords.words('english'))
    text= " ".join((word for word in text.split() if word not in stop_words))

    return text

pro_text("This is #$%^@%?! MY TEXT to use my dummy ")

'text use dummy'

In [6]:
data['source_text']  = data['source_text'].apply(pro_text)
data['plagiarized_text']  = data['plagiarized_text'].apply(pro_text)

In [21]:
data['source_text'][50]

'honey bees communicate series dance movements'

In [8]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])

In [9]:
y =data['label']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("accuracy score", accuracy_score(y_test, y_pred))
print("classification report", classification_report(y_test, y_pred))
print("confusion " ,confusion_matrix(y_test, y_pred))


accuracy score 0.8513513513513513
classification report               precision    recall  f1-score   support

           0       0.78      0.94      0.85        34
           1       0.94      0.78      0.85        40

    accuracy                           0.85        74
   macro avg       0.86      0.86      0.85        74
weighted avg       0.87      0.85      0.85        74

confusion  [[32  2]
 [ 9 31]]


In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators= 100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("accuracy score", accuracy_score(y_test, y_pred))
print("classification report", classification_report(y_test, y_pred))
print("confusion " ,confusion_matrix(y_test, y_pred))

accuracy score 0.7567567567567568
classification report               precision    recall  f1-score   support

           0       0.65      1.00      0.79        34
           1       1.00      0.55      0.71        40

    accuracy                           0.76        74
   macro avg       0.83      0.78      0.75        74
weighted avg       0.84      0.76      0.75        74

confusion  [[34  0]
 [18 22]]


In [13]:
from sklearn.svm import SVC

model=SVC(kernel = 'linear')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("accuracy score", accuracy_score(y_test, y_pred))
print("classification report", classification_report(y_test, y_pred))
print("confusion " ,confusion_matrix(y_test, y_pred))


accuracy score 0.8648648648648649
classification report               precision    recall  f1-score   support

           0       0.82      0.91      0.86        34
           1       0.92      0.82      0.87        40

    accuracy                           0.86        74
   macro avg       0.87      0.87      0.86        74
weighted avg       0.87      0.86      0.87        74

confusion  [[31  3]
 [ 7 33]]


In [14]:
import pickle

pickle.dump(model,open("model.pkl",'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl','wb'))

In [15]:
model = pickle.load(open('model.pkl','rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl','rb'))

In [16]:
def detect(input_text):
    vectorized_text= tfidf_vectorizer.transform([input_text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "No Detection"

In [17]:
input_text="playing musical instruments enhances creativity"
detect(input_text)

'No Detection'