# Plagiarism detection
Natural Language Processing - Universidad Tecnológica Nacional

In [9]:
import pandas as pd
import re
import numpy as np 

## Pre-proccesing data

In [10]:
import os
import shutil
import time


from doc2docx import convert as doc2docx
from pptx import Presentation
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT

path = './data/'
files = os.listdir(path)
author_synonyms = ['nombre','nombres','apellido','apellidos','nombre y apellido','apellido y nombre','nombres y apellidos','apellidos y nombres','alumno','alumnos', 'alumna','alumne','alumnes']

In [3]:
extensions = set([file.split('.')[-1] for file in files])
extensions


{'doc', 'docx', 'pdf', 'pptx'}

There are only 4 file extensions: 'doc', 'docx', 'pdf', 'pptx'. For each extension, we have to create a processing method. 

### Pre-processing data (Transforming to .docx)

In [51]:
from unicodedata import normalize
def clean_special_characters(string):
    """
    Receives a string and returns it without those specials characters which are troublesome
    """
    string = string.replace('\u200b',' ')
    string = string.replace('\xad',' ').replace('\xa0',' ')
    string = string.replace('\t',' ')
    string = re.sub('(\s*\n\s*)+', ' \n ', string)
    
    #Remove accents
    string = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
        normalize( "NFD", string), 0, re.I)
    
    return string

In [212]:
from pdf2docx import Converter


def cleaning_citations(strings):
    strings = list(set(strings))
    
    strings = [string for string in strings if not string.startswith('mailto:')]
    return strings


def clean_special_characters(string):
    """
    Receives a string and returns it without those specials characters which are troublesome
    """
    string = string.replace('\u200b',' ')
    string = string.replace('\xad',' ').replace('\xa0',' ')
    string = string.replace('\t',' ')
    string = re.sub('(\s*\n\s*)+', ' \n ', string)
    
    #Remove accents
    string = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
        normalize( "NFD", string), 0, re.I)
    
    return string

def text_hyperlinks(string):
    """
    Return a list of hyperlinks in a string
    """

    #Regular expression of hyperlinks
    regex = r"(https?://\S+)"

    #Find all hyperlinks in a string
    hyperlinks = re.findall(regex,string)
    
    return hyperlinks

def pdf2docx(pdf_files):
    filenames = [os.path.splitext(doc)[0] for doc in pdf_files]
    path_docx = './data_temp/docxs_files/'
    path_pdf = './data_temp/pdfs_files/'
    for file in filenames:
        cv = Converter(path_pdf + file + '.pdf')
        cv.convert(path_docx + file + '.docx', multi_processing=True)
        cv.close()

def docx_hyperlinks(doc):
    hyperlinks = []
    rels = doc.part.rels
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            hyperlinks.append(rels[rel]._target)
    return hyperlinks     

def names_in_table(tables):

    names = []
    for table in tables:
        #Look if in the headers there is a synonym for author
        for i, cell in enumerate(table.rows[0].cells):
            if cell.text.lower() in author_synonyms:
                #Looking for names in the rows
                for row in table.rows[1:]:
                    names.append(row.cells[i].text)
    return names

def get_headers(doc):
    headers = []
    section = doc.sections[0]
    header = section.header
    for paragraph in header.paragraphs:
        if paragraph.text not in headers:
            headers.append(paragraph.text)
    return headers if headers != [''] else []

def get_footers(doc):
    footers = []
    section = doc.sections[0]
    footer = section.footer
    for paragraph in footer.paragraphs:
        if paragraph.text not in footers:
            footers.append(paragraph.text)
    return footers if footers != [''] else []

def remove_section(string, list_sections):
    """
    Receives a string and a list of strings belonging to a section and returns the string without these section
    """
    for section in list_sections:
        string = string.replace(section,'')
    return string

def read_docx(file):
    doc = Document(file)
    hyperlinks = []
    text = ""

    headers = get_headers(doc)
    footers = get_footers(doc)

    names = names_in_table(doc.tables)
    hyperlinks += (docx_hyperlinks(doc))
    for paragraph in doc.paragraphs:
        paragraph_text = paragraph.text
        paragraph_text = remove_section(paragraph_text, headers)
        paragraph_text = remove_section(paragraph_text, footers)
        hyperlinks += text_hyperlinks(paragraph_text)
        text += paragraph_text + ' \n '
    text = clean_special_characters(text)

    headers = [clean_special_characters(header) for header in headers]

    hyperlinks = cleaning_citations(hyperlinks)
    df = pd.DataFrame({'filename': path_to_filename(file), 'text': [text],'author':[names] if names else np.nan, 'citations': [hyperlinks] if hyperlinks else np.nan, 'headers': [headers] if headers else np.nan})
    return df

def path_to_filename(path):
    """
    Return the file name from a path
    """
    return os.path.splitext(os.path.basename(path))[0]

def read_pptx(path):
    """
    Read a pptx file and return a dataframe with the file name, the text and the citations
    """

    text = ""
    hyperlinks = []

    #Reading the document by slides 
    for slide in Presentation(path).slides:
        #Looking for shapes which have text and extracting text and hyperlinks from them
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        
                        #Looking for hyperlinks in the text
                        hyperlinks = hyperlinks + text_hyperlinks(run.text)
                        
                        #Cleaning special characters
                        text = text +'. ' + run.text
    hyperlinks = list(set(hyperlinks))
    df = pd.DataFrame({'filename': path_to_filename(path), 'text': [text], 'citations': [hyperlinks] if hyperlinks else None})

    return df


def read_pdf_docs(docs, df):
    """
    Receive a list of .doc and .pdf files, convert them to .docx files and read them,
    then concatenate the new dataframes of each new .docx file into a dataframe and return it
    """

    dot_docs = [doc for doc in docs if doc.endswith('.doc')]
    dot_pdfs = [doc for doc in docs if doc.endswith('.pdf')]

    #If there is a temporal folder, remove it
    if os.path.isdir('./data_temp'):

        #Return files to their original folder and remove the temporal folder
        for doc in dot_docs:
            shutil.move('./data_temp/docs_files/' + doc, path)
        for pdf in dot_pdfs:
            shutil.move('./data_temp/pdfs_files/' + pdf, path)
        shutil.rmtree('./data_temp')

    os.mkdir('./data_temp/')    
    os.mkdir('./data_temp/docs_files')
    os.mkdir('./data_temp/pdfs_files')
    os.mkdir('./data_temp/docxs_files')

    #Move files into its temporal folders
    for doc in dot_docs:
        shutil.move(path + doc, './data_temp/docs_files/')
    for doc in dot_pdfs:
        shutil.move(path + doc, './data_temp/pdfs_files/')

    try:

        #Converting .doc files to .docx files and then to .pdf files
        doc2docx('./data_temp/docs_files/', './data_temp/docxs_files/')
        time.sleep(.5)
        pdf2docx(dot_pdfs)
        time.sleep(.5)

        #Read each .pdf file and concatenate the dataframes
        for file in os.listdir('./data_temp/docxs_files/'):
            file_df = read_docx('./data_temp/docxs_files/'+file)
            df = pd.concat([df,file_df], ignore_index=True)

    except:

        #Return files to their original folder and remove the temporal folder
        for doc in dot_docs:
            shutil.move('./data_temp/docs_files/' + doc, path)
        for doc in dot_pdfs:
            shutil.move('./data_temp/pdfs_files/' + doc, path)
        shutil.rmtree('./data_temp')
        raise Exception('ERROR: Could not convert files to .pdf files and read them')

    #Return files to their original folder and remove the temporal folder
    for doc in dot_docs:
        shutil.move('./data_temp/docs_files/' + doc, path)
    for doc in dot_pdfs:
        shutil.move('./data_temp/pdfs_files/' + doc, path)
    shutil.rmtree('./data_temp')

    return df


In [None]:

df = pd.DataFrame(columns=['filename','text','citations','headers'])

#Create a list of .doc files and concatenate its content to the dataframe
docs_and_docxs = [file for file in files if file.endswith('.doc') or file.endswith('.pdf')]
df = read_pdf_docs(docs_and_docxs, df)

#Create a list of files which are not .doc
not_docs_or_docxs = [file for file in files if file not in docs_and_docxs]

#Read each file depending on its extension and concatenate its content to the dataframe
for file in not_docs_or_docxs:

    file_path = path + file
    if file.endswith('.docx'):
        file_df = read_docx(file_path)
    elif file.endswith('.pptx'):
        file_df = read_pptx(file_path)
    else:
        raise Exception('FILE EXTENSION NOT SUPPORTED: Only support .docx, .doc, .pdf and .pptx files')
    
    df = pd.concat([df,file_df], ignore_index=True)

#Export the dataframe to a csv file
df.to_csv('./data.csv', index=False)

## Processing data

### To Do:

- Look for topic

In [11]:
import spacy
import ast
import numpy as np
from bs4 import BeautifulSoup

from googlesearch import search
from nltk import word_tokenize
from nltk import sent_tokenize
from urllib.request import urlopen, Request

df = pd.read_csv('./data.csv')
nlp = spacy.load('es_core_news_lg')


In [12]:
df[df.author.notnull()].head()

Unnamed: 0,filename,text,citations,headers,author
7,Lopez Tomas - TP 6 - Sistemas Emergentes,\n Marketing en Internet y \n Nueva Economia ...,,,"['LÓPEZ, Tomas']"
18,TP 0 Gabriela Gonzalez MKTG y NV Economía,\n TRABAJO PRACTICO 0 \n Integrante – Año 20...,['https://www.fundacionaquae.org/wiki-aquae/in...,,"['Gonzalez, Gabriela']"
45,TP N° 1 – WIKINOMICS - Melanie Blejter,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,"['NOMBRE Y APELLIDO', 'Melanie Blejter ', 'LEG..."
46,TP N° 2 – La larga cola - Melanie Blejter,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,"['NOMBRE Y APELLIDO', 'Melanie Blejter ', 'LEG..."
47,TP N° 3 – The Experience Economy - Melanie Ble...,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,"['NOMBRE Y APELLIDO', 'Melanie Blejter ', 'LEG..."


In [13]:
# Convert the string representation of the list back to a list
df.loc[df.citations.notnull(), 'citations'] = df[df.citations.notnull()]['citations'].apply(ast.literal_eval)
df.loc[df.author.notnull(), 'author'] = df[df.author.notnull()]['author'].apply(ast.literal_eval)
df.loc[df.headers.notnull(), 'headers'] = df[df.headers.notnull()]['headers'].apply(ast.literal_eval)

df.loc[df.headers.notnull(), 'headers']=df[df.headers.notnull()].headers.apply(lambda x: [header for header in x if header])

In [14]:
def delete_not_author(strings):
    messy_author_strings = ['nombre','nombres','apellido','apellidos','nombre y apellido','apellido y nombre','nombres y apellidos','apellidos y nombres','alumno','alumnos', 'alumna','alumne','alumnes','legajo','email','mail','correo electronico','e-mail']
    new_string=[]
    for string in strings:
        if not(any(word in string.lower() for word in messy_author_strings) or any(char.isdigit() for char in string) or re.match('^[\w\.-]+@[\w\.-]+\.\w+$',string)):
            new_string.append(string)
    return new_string if new_string else np.nan

df.loc[df.author.notnull(),'author'] = df[df.author.notnull()]['author'].apply(delete_not_author)

In [15]:
df[df.author.notnull()].head()

Unnamed: 0,filename,text,citations,headers,author
7,Lopez Tomas - TP 6 - Sistemas Emergentes,\n Marketing en Internet y \n Nueva Economia ...,,,"[LÓPEZ, Tomas]"
18,TP 0 Gabriela Gonzalez MKTG y NV Economía,\n TRABAJO PRACTICO 0 \n Integrante – Año 20...,[https://www.fundacionaquae.org/wiki-aquae/inn...,,"[Gonzalez, Gabriela]"
45,TP N° 1 – WIKINOMICS - Melanie Blejter,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,[Melanie Blejter ]
46,TP N° 2 – La larga cola - Melanie Blejter,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,[Melanie Blejter ]
47,TP N° 3 – The Experience Economy - Melanie Ble...,\n MARKETING EN INTERNET Y NUEVA ECONOMIA \n ...,,,[Melanie Blejter ]


In [16]:
def correct_paragraphs(string):
    """
    Corrects the paragraph by validating \n
    """
    string = re.compile('\s*\n').sub('', string, 1)
    string = re.sub('(\w\s)\n\s(\w)', r'\1\2',string)
    return string

df['text'] = df['text'].apply(correct_paragraphs)

In [17]:
df[df.author.notnull()].head()

Unnamed: 0,filename,text,citations,headers,author
7,Lopez Tomas - TP 6 - Sistemas Emergentes,Marketing en Internet y Nueva Economia Trabaj...,,,"[LÓPEZ, Tomas]"
18,TP 0 Gabriela Gonzalez MKTG y NV Economía,TRABAJO PRACTICO 0 Integrante – Año 2019 Pro...,[https://www.fundacionaquae.org/wiki-aquae/inn...,,"[Gonzalez, Gabriela]"
45,TP N° 1 – WIKINOMICS - Melanie Blejter,MARKETING EN INTERNET Y NUEVA ECONOMIA Catedr...,,,[Melanie Blejter ]
46,TP N° 2 – La larga cola - Melanie Blejter,MARKETING EN INTERNET Y NUEVA ECONOMIA Docent...,,,[Melanie Blejter ]
47,TP N° 3 – The Experience Economy - Melanie Ble...,MARKETING EN INTERNET Y NUEVA ECONOMIA Catedr...,,,[Melanie Blejter ]


### Author name

In [18]:
def first_contiguos_persons(listed_doc):
    """
    Return the start and end index of the first contiguos persons in a list of spacy spans
    """
    try:
        doc_ents = [span.ent_type_ for span in listed_doc]

        #Looks for the index of the first PER entity
        index_start = doc_ents.index('PER')

        #Loop through entities until it finds a non PER entity
        for i, slice in enumerate(doc_ents[index_start:]):
            index_end = index_start + i
            if slice != 'PER':
                break
        return (index_start, index_end)
    
    except:

        #If there are not any PER entities, return -1
        index_start = -1
        index_end = -1
        return (index_start, index_end)

def slice_string(string):
    """
    Receives a string a return it sliced where the author is supposed to be
    """
    string = string.split('\n')
    i = 0
    while(i<len(string)):
        slice = string[i]
        slice = slice.lower().strip()
        for synonym in author_synonyms:

            #Joining the author synonym is in another line that the author name and we need some context for entities recognition
            if slice.endswith(synonym) or slice.endswith(synonym + ":"):
                return ' '.join(string[i: i+2])
            
            #If the author is in the same line as the synonym
            if synonym in slice:
                return slice
        i +=1
    return ' '.join(string)

def get_author(string, headers):
    """
    Look for authors in a documents and return a list of them.
    If it has trouble finding the author, it a list of first contiguos persons entities.
    Also if there are not any persons entities, it returns np.nan
    """
    stop_author = ['legajo', 'email', 'mail', 'correo electronico', 'e-mail']
    author_synonyms = ['nombre','nombres','apellido','apellidos','nombre y apellido','apellido y nombre','nombres y apellidos','apellidos y nombres','alumno','alumnos', 'alumna','alumne','alumnes']
    string = slice_string(string)
    author = [] 
    doc = nlp(string)
    author_synonyms = [author for author in author_synonyms if author in doc.text.lower()]

    for i,token in enumerate(doc):
        #If there is any author synonym, look for the first contiguos persons before it
        if token.text.lower() in author_synonyms:
            sliced_doc = doc[i+1:]
            doc_list = [doc for doc in sliced_doc if (doc.text.lower() not in stop_author) and not(doc.pos_ == 'PUNCT' or doc.pos_ == 'SPACE')]
            index = first_contiguos_persons(doc_list)
            author += [doc.text for doc in doc_list[index[0]:index[1]]]

            return author
    
    #If there are not any author synonyms, look into haeders the first contiguos persons
    if author == [] and not (type(headers) == float and pd.isna(headers)):
        
        for header in headers:
            doc = nlp(header)
            for i, token in enumerate(doc):
                if token.text.lower() in author_synonyms:
                    doc = doc[i+1:]
                    doc_list = [token for token in doc if (doc.text.lower() not in stop_author) and not(doc.pos_ == 'PUNCT' or doc.pos_ == 'SPACE')]
                    index = first_contiguos_persons(doc_list)
                    author += [doc.text for doc in doc_list[index[0]:index[1]]]
    elif author == []:

        author = np.nan
    
    return author

df['author'] = df.apply(lambda x : get_author(x['text'], x['headers']),axis=1)

### Topic

In [19]:
def clean_sentences(string):
    """
    Clean a string from special characters and numbers
    """
    string = re.sub('([a-zA-Z])-([a-zA-Z])', r'\1\2',string)
    string = string.replace('\n',' ')
    string = re.sub('●|•|-|”|“|°|,|/|:|\?|¿|!|¡',' ', string)
    string = string.replace('(',' ').replace(')',' ').replace('[',' ').replace(']',' ').replace('{',' ').replace('}',' ')
    string = re.sub('\d', ' ', string)
    string = re.sub('\s+',' ',string)
    string = string.strip()
    return string


In [20]:
from collections import Counter

def get_topic(text):
    """
    Get the most common words in a corpus
    """
    text = clean_sentences(text)
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.pos_ != 'SPACE' and token.pos_ != 'PUNCT' and len(token.text)>1]
    topic = Counter(tokens).most_common(10)
    topic = [token[0] for token in topic]
    return topic

df['topic'] = df['text'].apply(get_topic)

In [21]:
print(df['topic'].iloc[3])

['sistema', 'emergente', 'complejo', 'describa', 'estudio', 'conducta', 'moho', 'fango', 'organismo', 'forma']


#### Looking for topics in Google

In [22]:
def google_topics(topic):
    """
    Look for topics in google and return the first 3 results
    """
    query = ' '.join(topic)
    urls = [url for url in search(query, num_results=3)][0:3]
    return urls

### Processing corpus

In [23]:
def correct_dots(string):
    """
    Only keeping dots for sentences separation
    """
    string = re.sub('([a-zA-Z]\s?)\.(\s?[a-z])', r'\1 \2', string)
    string = re.sub('([a-zA-Z]\s?)\.(\s?\))', r'\1 \2', string)
    string = re.sub('(\s[A-Z]\s?)\.(\s?[a-zA-Z])', r'\1 \2', string)
    string = string.replace('\n', ' ')
    return string

def get_corpus(text):
    """
    Get a list of sentences well separated from a text
    """
    text = correct_dots(text)
    corpus = text.split('.')

    return corpus

def process_indexed_corpus(corpus):
    """
    Receive a list of sentences and return a list of cleaned sentences with each index
    """
    processed_corpus = []
    for i, sentence in enumerate(corpus):
        sentence = clean_sentences(sentence)
        sentence = nlp(sentence)
        sentence = [token.lemma_.lower() for token in sentence if not token.is_stop and token.pos_ != 'SPACE' and token.pos_ != 'PUNCT' and len(token.text)>1]
        if len(sentence) > 3:
            sentence = ' '.join(sentence)
            processed_corpus.append((i, sentence))
    return processed_corpus


In [24]:
df['corpus'] = df['text'].apply(get_corpus)
df['processed_corpus'] = df['corpus'].apply(process_indexed_corpus)

In [25]:
df

Unnamed: 0,filename,text,citations,headers,author,topic,corpus,processed_corpus
0,Economía de experiencia (1),Marketing en internet y nueva economia Economi...,,,"[Pablo, Gabriel]","[cliente, experiencia, personalizacion, produc...",[Marketing en internet y nueva economia Econom...,"[(0, marketing internet economia economia expe..."
1,Economía de experiencia,Marketing en internet y nueva economia Economi...,,,"[Pablo, Gabriel]","[cliente, experiencia, personalizacion, produc...",[Marketing en internet y nueva economia Econom...,"[(0, marketing internet economia economia expe..."
2,K5071 - Matias David Choren - TP N6 Sistemas E...,MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CUA...,,,"[matias, david, choren]","[sistema, emergente, complejo, describa, estud...",[ MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CU...,"[(0, marketing internet economia cuatrimestre ..."
3,K5071 - Matias David Choren - TP N6 Sistemas E...,MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CUA...,,,"[matias, david, choren]","[sistema, emergente, complejo, describa, estud...",[ MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CU...,"[(0, marketing internet economia cuatrimestre ..."
4,K5071 - Matias David Choren - TP N°5 Rifkin (1),MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CUA...,,,"[matias, david, choren]","[rifkin, produccion, actual, economia, revoluc...",[ MARKETING EN INTERNET Y NUEVA ECONOMIA 2° CU...,"[(0, marketing internet economia cuatrimestre ..."
...,...,...,...,...,...,...,...,...
302,Trabajo Práctico 2 - Hernan Dalle Nogare,Trabajo Practico 2 Hernan Dalle Nogare - 146.8...,,,,"[marketing, empresa, cliente, cambio, consumid...","[Trabajo Practico 2 Hernan Dalle Nogare - 146,...","[(0, trabajo practico hernan dalle nogare), (2..."
303,Trabajo Práctico 3 - Hernan Dalle Nogare (1),Trabajo Practico 3 Hernan Dalle Nogare - 146.8...,,,,"[experiencia, cliente, personalizacion, produc...","[Trabajo Practico 3 Hernan Dalle Nogare - 146,...","[(0, trabajo practico hernan dalle nogare), (1..."
304,Trabajo Práctico 3 - Hernan Dalle Nogare (2),Trabajo Practico 3 Hernan Dalle Nogare - 146.8...,,,,"[experiencia, cliente, personalizacion, produc...","[Trabajo Practico 3 Hernan Dalle Nogare - 146,...","[(0, trabajo practico hernan dalle nogare), (1..."
305,Trabajo Práctico 4 - Hernan Dalle Nogare,Trabajo Practico 4 Hernan Dalle Nogare - 146.8...,"[https://datosmacro.expansion.com/pib, https:/...",,,"[online, venta, ecommerce, internet, usuario, ...","[Trabajo Practico 4 Hernan Dalle Nogare - 146,...","[(0, trabajo practico hernan dalle nogare), (1..."


#### Reading URLs

In [47]:
def read_urls(urls):
    """
    Read the text of each hyperlink and concatenate it to the dataframe
    """

    df = pd.DataFrame(columns=['url','corpus', 'processed_corpus'])
    for url in urls:
            try:
                req = Request(
                            url=url,
                            headers={'User-Agent': 'Mozilla/5.0'})
                html = urlopen(req).read()
            except:
                continue
            soup = BeautifulSoup(html, features="html.parser")

            #Kill all script and style elements
            for script in soup(["script", "style"]):
                script.extract()

            #Get text
            text = soup.get_text()

            #Break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            
            #Break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

            #Drop blank lines
            text = '. '.join(chunk for chunk in chunks if chunk)
            text = clean_special_characters(text)
            corpus = get_corpus(text)
            processed_corpus = process_indexed_corpus(corpus)
            

            df = pd.concat([df, pd.DataFrame({'url':url,'corpus':[corpus], 'processed_corpus': [processed_corpus]}, index=[0])], ignore_index=True)
    return df

### Plagiarism

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
def append_to_dictionary(dic, key, index, element):
    dic.setdefault(key, {'n_sentence': index, 'plagiarism': []})
    dic[key]['plagiarism'] += element

In [71]:
def db_similarities(df, comparing_df, similarities):
    
    unprocessed_corpus = df['corpus']
    processed_corpus = df['processed_corpus']
    
    corpus = [slice[1] for slice in processed_corpus]
    
    for _, row in comparing_df.iterrows():


        row_unprocessed_corpus = row['corpus']
        row_indexed_corpus = row['processed_corpus']
        row_corpus = [slice[1] for slice in row_indexed_corpus]

        vectorizer = CountVectorizer()
        vectorizer.fit(row_corpus + corpus)

        for slice in processed_corpus:
            index, sentence = slice

            plagiarism=[]
            for id, row_slice in enumerate(row_indexed_corpus):
                row_index, row_sentence = row_slice
                vector = vectorizer.transform([row_sentence, sentence])
                score = cosine_similarity(vector)[0][1]
                if 0.95 > score > 0.70:
                    plagiarism.append({'plagiarized_sentence':row_unprocessed_corpus[row_index],
                                      'plagiarism_score':score, 
                                      'plagiarized_file':row['filename'],  
                                      'plagiarized_author':row['author'] 
                                    })
                    
            if len(plagiarism) > 0:
                append_to_dictionary(similarities, unprocessed_corpus[index], index, plagiarism)
                    
    return similarities


In [75]:
def url_similarities(df, urls, similarities):
    

    unprocessed_corpus = df['corpus']
    processed_corpus = df['processed_corpus']
    
    comparing_df = read_urls(urls)

    corpus = [slice[1] for slice in processed_corpus]
    
    for _, row in comparing_df.iterrows():


        row_unprocessed_corpus = row['corpus']
        row_indexed_corpus = row['processed_corpus']
        row_corpus = [slice[1] for slice in row_indexed_corpus]

        vectorizer = CountVectorizer()
        vectorizer.fit(row_corpus + corpus)

        for slice in processed_corpus:
            index, sentence = slice

            plagiarism=[]
            for id, row_slice in enumerate(row_indexed_corpus):
                row_index, row_sentence = row_slice
                vector = vectorizer.transform([row_sentence, sentence])
                score = cosine_similarity(vector)[0][1]
                if 0.95 > score > 0.70:
                    #similarity[str(id)]
                    plagiarism.append({'plagiarized_sentence':row_unprocessed_corpus[row_index],
                                      'plagiarism_score':score, 
                                      'plagiarized_website':row['url'] 
                                   })
                    #similarities.append(similarity)
            if len(plagiarism) > 0:
                append_to_dictionary(similarities, unprocessed_corpus[index], index, plagiarism)
                    
                    
    return similarities

In [76]:
def googled_topic_similarities(df, topic, similarities):
    
    urls = google_topics(topic)
    return url_similarities(df, urls, similarities)


In [None]:
def get_similarities(df, comparing_df):
    similarities = {}
    db_similarities(df[['corpus','processed_corpus']], comparing_df, similarities)

    if df['citations'] == True:
        url_similarities(df[['corpus', 'processed_corpus']], df['citations'],similarities)
        
    googled_topic_similarities(df[['corpus', 'processed_corpus']],df['topic'],similarities)
    return similarities



#### Run

In [None]:
df_test = df.iloc[273]

text = df_test['text']

topic = get_topic(text)
corpus = get_corpus(text)
processed_corpus = process_indexed_corpus(corpus)

#Filtering by the intersection of topics
filtered_df = df[df['topic'].apply(lambda x: len(set(x) & set(topic)) > 3)][df['filename'] != df_test['filename']]

similarity = get_similarities(df_test, filtered_df)

In [89]:
import json
with open('results.json', 'w') as results_file:
    json.dump(similarity, results_file, indent=4, sort_keys=True)