In [11]:
import cProfile
import pandas as pd
#import sys
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string
import concurrent.futures
import time
import optparse


## Implement the string data

def remove_punctuations(txt, punct = string.punctuation):
    '''
    This function will remove punctuations from the input text
    '''
    return ''.join([c for c in txt if c not in punct])

def remove_stopwords(txt, sw = list(stopwords.words('english'))):
    '''
    This function will remove the stopwords from the input txt
    '''
    return ' '.join([w for w in txt.split() if w.lower() not in sw])


en_nlp = spacy.load('en_core_web_sm')

def clean_text(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)
    txt = remove_stopwords(txt)
    return txt.lower()

def clean_text_vi(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)

    return txt.lower()
def lemma_text(text_line):
    #test_file_clean = []
    doc_spacy = en_nlp(text_line)
    line = [ a.lemma_ for a in doc_spacy]
    #sentence = " ".join(line)
    #test_file_clean.append(sentence)
    return " ".join(line)



vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)

class Checker():
    ## Connect to the MySQL server
    engine = create_engine('mysql://root:kakalot123@localhost:3306/db')
    # book_id =177
    # language = 'en'
    def __init__(self,version,score,test_id):
        self.version = version
        self.score = score
        self.test_id = test_id
        if(self.version =='esv'):
            self.book_id =177
            self.language ='en'
        elif(self.version =='vie2010'):
            self.book_id =397
            self.language ='vi'
        self.train_query = """select verses.id,verses.text from verses   where verses.book_id!={} and books.language='{}' and books.version='{}'""".format(self.book_id,self.language,self.version)
        self.train_model = pd.read_sql_query(self.train_query, self.engine)
        # print(self.test_id)
        # print(self.version)
        # print(self.book_id)
        # print(self.language)
        self.test_query = """select verses.id,verses.textfrom verses  inner join books on verses.book_id = books.id where verses.book_id={} and books.language='{}' and verses.id ={} and books.version='{}'""".format(self.book_id,self.language,self.test_id,self.version)
        self.test_model = pd.read_sql_query(self.test_query,self.engine).drop(['books_id','name','short_name','version','language'],axis=1)
        self.text = self.train_model['text'].tolist()
        self.verses_id = self.train_model['id'].tolist()
        self.s_vector = list(zip(self.verses_id,self.text))
        self.test_file = self.test_model['text'].apply(str)

    def check_plagiarism(self,train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score
    def show_result(self,train_file,test_file):

        score = self.check_plagiarism(train_file,test_file,vectorizer1)
        if score >float(self.score):
            score = self.check_plagiarism(train_file,test_file,vectorizer2)
        return score



    #def check_plagiarism_soft(self,test_file,s_vector):
    def check_plagiarism_soft(self,test_file,text,verses_id):
        self.test_file = test_file
        self.text = text
        self.verses_id = verses_id
        #self.s_vector = s_vector
        result = {"id":[],"score":[]}

        if(self.version=='esv'):
            self.text = [clean_text(txt) for txt in self.text]
            self.text = [lemma_text(txt) for txt in self.text ]
            self.test_file = clean_text(self.test_file)
            self.test_file = lemma_text(self.test_file)
        elif(self.version =='vie2010'):
            self.test_file = clean_text_vi(self.test_file)
        test_file_list =[]
        for i in range(len(self.text)):
            test_file_list.append(self.test_file)
        result['id'] = verses_id
        with concurrent.futures.ProcessPoolExecutor() as executor:
            score = executor.map(self.show_result,self.text,test_file_list)
        result['score'] = score

#         for a,v_vector in self.s_vector:
#             test_file = clean_text(self.test_file)

#             if(self.version=='esv'):
#                 v_vector = clean_text(v_vector)
#                 test_file = lemma_text(test_file)
#                 v_vector = lemma_text(v_vector)
#             elif(self.version =='vie2010'):
#                 v_vector = clean_text_vi(v_vector)
#             score = self.show_result(v_vector,test_file)
#             result['id'].append(a)
#                 #score.append(executor.submit(self.show_result, train_file=v_vector,test_file=test_file))
#             result['score'].append(score)

        #     result['score'] = score
        dt = pd.DataFrame(result)
        result_file =[dt.loc[dt['score']==max(dt['score'])]['id'].tolist()[0],dt.loc[dt['score']==max(dt['score'])]['score'].tolist()[0]]
        return result_file


    def show_final_result(self):
        #result = self.check_plagiarism_soft(self.test_file,self.s_vector)
        result = self.check_plagiarism_soft(self.test_file,self.text,self.verses_id)
        result_model = pd.read_sql_query(f"select verses.id,verses.book_id,verses.chapter,verses.number,verses.text from verses where verses.id={result[0]}", self.engine)
        result_model['score'] = result[1]
        df = result_model.to_json(orient='table',index=False)
        return df






def main():
    start = time.time()
    parser = optparse.OptionParser()

    # add options
    parser.add_option('-v','--version', dest = 'version',
                      type = 'string',
                      help = 'choose english or vietnamese version')
    parser.add_option('-t','--testid', dest = 'testid',
                      type = 'string',
                      help = 'please enter the test id')
    parser.add_option('-s','--score', dest = 'score',
                      type = 'string',
                      help = 'please enter the algorithm score')


    (options, args) = parser.parse_args()
    if (options.version == None):
            print ("Version is not null")
            exit(0)
    else:
            version = options.version

    if (options.testid == None):
            print ("Test id is not null")
            exit(0)
    else:
            testid = options.testid

    if (options.score == None):
            print ("The score is not null")
            exit(0)
    else:
            score = options.score
    my_checker = Checker(version,score,testid)
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    cProfile.run('main()')

         431 function calls in 0.000 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <ipython-input-11-0595df672f15>:164(main)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
       16    0.000    0.000    0.000    0.000 _collections_abc.py:657(get)
        4    0.000    0.000    0.000    0.000 gettext.py:212(_expand_lang)
        4    0.000    0.000    0.000    0.000 gettext.py:538(find)
        4    0.000    0.000    0.000    0.000 gettext.py:579(translation)
        4    0.000    0.000    0.000    0.000 gettext.py:660(dgettext)
        4    0.000    0.000    0.000    0.000 gettext.py:735(gettext)
        4    0.000    0.000    0.000    0.000 iostream.py:197(schedule)
        3    0.000    0.000    0.000    0.000 iostream.py:310(_is_master_process)
        3    0.000    0.000    0.000    0.000 iostream.py:323(_schedule_flush)
        3    0.000    0.000    0

Usage: ipykernel_launcher.py [options]

ipykernel_launcher.py: error: no such option: -f


In [3]:
def main():
    start = time.time()
    parser = optparse.OptionParser()

    # add options
    parser.add_option('-v','--version', dest = 'version',
                      type = 'string',
                      help = 'choose english or vietnamese version')
    parser.add_option('-t','--testid', dest = 'testid',
                      type = 'string',
                      help = 'please enter the test id')
    parser.add_option('-s','--score', dest = 'score',
                      type = 'string',
                      help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    import cProfile, pstats
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats('ncalls')
    stats.print_stats()

vie2010
0.3
199107
{"schema":{"fields":[{"name":"id","type":"integer"},{"name":"book_id","type":"integer"},{"name":"chapter","type":"integer"},{"name":"number","type":"integer"},{"name":"text","type":"string"},{"name":"score","type":"number"}],"pandas_version":"0.20.0"},"data":[{"id":199539,"book_id":398,"chapter":1,"number":1,"text":"Phao-l\u00f4, b\u1edfi \u00fd \u0111\u1ecbnh c\u1ee7a \u0110\u1ee9c Ch\u00faa Tr\u1eddi, \u0111\u01b0\u1ee3c k\u00eau g\u1ecdi l\u00e0m s\u1ee9 \u0111\u1ed3 c\u1ee7a \u0110\u1ea5ng Christ J\u00easus, c\u00f9ng v\u1edbi S\u1ed1t-then, anh em ch\u00fang ta,","score":1.0}]}
It takes:  54.878434896469116
         69998551 function calls (69813469 primitive calls) in 54.878 seconds

   Ordered by: call count

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
6186759/6094944    1.109    0.000    1.896    0.000 {built-in method builtins.isinstance}
4965278/4965083    0.407    0.000    0.407    0.000 {built-in method builtins.len}
  4553912 

        9    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\event\attr.py:264(__bool__)
        9    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\engine\result.py:125(__len__)
        9    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\MySQLdb\cursors.py:78(close)
        9    0.000    0.000    0.017    0.002 C:\Users\Computer\anaconda3\lib\site-packages\MySQLdb\connections.py:250(query)
        9    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\MySQLdb\connections.py:241(cursor)
        9    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\numexpr\necompiler.py:94(__init__)
      9/3    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\numexpr\necompiler.py:127(__hash__)
        9    0.000    0.000    0.000    0.000 {method 'affected_rows' of '_mysql.connection' objects}
     

        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:718(<dictcomp>)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:679(__init__)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:882(<genexpr>)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:884(<genexpr>)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:880(_bind_processors)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:906(construct_params)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\distutils\version.py:51(__lt__)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda

In [13]:
def main():
    start = time.time()
    parser = optparse.OptionParser()

    # add options
#     parser.add_option('-v','--version', dest = 'version',
#                       type = 'string',
#                       help = 'choose english or vietnamese version')
#     parser.add_option('-t','--testid', dest = 'testid',
#                       type = 'string',
#                       help = 'please enter the test id')
#     parser.add_option('-s','--score', dest = 'score',
#                       type = 'string',
#                       help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
#     import cProfile, pstats
#     profiler = cProfile.Profile()
#     profiler.enable()
    main()
#     profiler.disable()
#     stats = pstats.Stats(profiler).sort_stats('cumtime')
#     stats.print_stats()

vie2010
0.3
199107


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [18]:
import cProfile
import pandas as pd
#import sys
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string
import concurrent.futures
import time
import optparse


## Implement the string data

def remove_punctuations(txt, punct = string.punctuation):
    '''
    This function will remove punctuations from the input text
    '''
    return ''.join([c for c in txt if c not in punct])

def remove_stopwords(txt, sw = list(stopwords.words('english'))):
    '''
    This function will remove the stopwords from the input txt
    '''
    return ' '.join([w for w in txt.split() if w.lower() not in sw])


en_nlp = spacy.load('en_core_web_sm')

def clean_text(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)
    txt = remove_stopwords(txt)
    return txt.lower()

def clean_text_vi(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)

    return txt.lower()
def lemma_text(text_line):
    #test_file_clean = []
    doc_spacy = en_nlp(text_line)
    line = [ a.lemma_ for a in doc_spacy]
    #sentence = " ".join(line)
    #test_file_clean.append(sentence)
    return " ".join(line)



vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)

class Checker():
    ## Connect to the MySQL server
    engine = create_engine('mysql://root:kakalot123@localhost:3306/db')
    # book_id =177
    # language = 'en'
    def __init__(self,version,score,test_id):
        self.version = version
        self.score = score
        self.test_id = test_id
        if(self.version =='esv'):
            self.book_id =177
            self.language ='en'
        elif(self.version =='vie2010'):
            self.book_id =397
            self.language ='vi'
        self.train_query = """select verses.id,verses.book_id,verses.chapter,verses.number,verses.text,books.id as books_id,books.name_id,books.name,books.short_name,books.version,books.language from verses  inner join books on verses.book_id = books.id where verses.book_id!={} and books.language='{}' and books.version='{}'""".format(self.book_id,self.language,self.version)
        self.train_model = pd.read_sql_query(self.train_query, self.engine)
        # print(self.test_id)
        # print(self.version)
        # print(self.book_id)
        # print(self.language)
        self.test_query = """select verses.id,verses.book_id,verses.chapter,verses.number,verses.text,books.id as books_id,books.name_id,books.name,books.short_name,books.version,books.language from verses  inner join books on verses.book_id = books.id where verses.book_id={} and books.language='{}' and verses.id ={} and books.version='{}'""".format(self.book_id,self.language,self.test_id,self.version)
        self.test_model = pd.read_sql_query(self.test_query,self.engine).drop(['books_id','name','short_name','version','language'],axis=1)
        self.text = self.train_model['text'].tolist()
        self.verses_id = self.train_model['id'].tolist()
        self.s_vector = list(zip(self.verses_id,self.text))
        self.test_file = self.test_model['text'].apply(str)

    def check_plagiarism(self,train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score
    def show_result(self,train_file,test_file):

        score = self.check_plagiarism(train_file,test_file,vectorizer1)
        if score >float(self.score):
            score = self.check_plagiarism(train_file,test_file,vectorizer2)
        return score



    def check_plagiarism_soft(self,test_file,s_vector):
    #def check_plagiarism_soft(self,test_file):
        self.test_file = test_file
#         self.text = text
#         self.verses_id = verses_id
        self.s_vector = s_vector
        result = {"id":[],"score":[]}

#         if(self.version=='esv'):
#             self.text = [clean_text(txt) for txt in self.text]
#             self.text = [lemma_text(txt) for txt in self.text ]
#             self.test_file = clean_text(self.test_file)
#             self.test_file = lemma_text(self.test_file)
#         elif(self.version =='vie2010'):
#             self.test_file = clean_text_vi(self.test_file)
#         test_file_list =[]
#         for i in range(len(self.text)):
#             test_file_list.append(self.test_file)
#         result['id'] = list(self.verses_id)
#         with concurrent.futures.ProcessPoolExecutor() as executor:
#             score = executor.map(self.show_result,self.text,test_file_list)
#         result['score'] = score

        for a,v_vector in self.s_vector:
            test_file = clean_text(self.test_file)

            if(self.version=='esv'):
                v_vector = clean_text(v_vector)
                test_file = lemma_text(test_file)
                v_vector = lemma_text(v_vector)
            elif(self.version =='vie2010'):
                v_vector = clean_text_vi(v_vector)
            score = self.show_result(v_vector,test_file)
            result['id'].append(a)
                #score.append(executor.submit(self.show_result, train_file=v_vector,test_file=test_file))
            result['score'].append(score)

        #     result['score'] = score
        dt = pd.DataFrame(result)
        result_file =[dt.loc[dt['score']==max(dt['score'])]['id'].tolist()[0],dt.loc[dt['score']==max(dt['score'])]['score'].tolist()[0]]
        return result_file


    def show_final_result(self):
        result = self.check_plagiarism_soft(self.test_file,self.s_vector)
        #result = self.check_plagiarism_soft(self.test_file)
        result_model = pd.read_sql_query(f"select verses.id,verses.book_id,verses.chapter,verses.number,verses.text from verses where verses.id={result[0]}", self.engine)
        result_model['score'] = result[1]
        df = result_model.to_json(orient='table',index=False)
        return df

def main():
    start = time.time()
    #parser = optparse.OptionParser()

    # add options
#     parser.add_option('-v','--version', dest = 'version',
#                       type = 'string',
#                       help = 'choose english or vietnamese version')
#     parser.add_option('-t','--testid', dest = 'testid',
#                       type = 'string',
#                       help = 'please enter the test id')
#     parser.add_option('-s','--score', dest = 'score',
#                       type = 'string',
#                       help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    import cProfile, pstats
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats('cumtime')
    stats.print_stats()

vie2010
0.3
199107
{"schema":{"fields":[{"name":"id","type":"integer"},{"name":"book_id","type":"integer"},{"name":"chapter","type":"integer"},{"name":"number","type":"integer"},{"name":"text","type":"string"},{"name":"score","type":"number"}],"pandas_version":"0.20.0"},"data":[{"id":199539,"book_id":398,"chapter":1,"number":1,"text":"Phao-l\u00f4, b\u1edfi \u00fd \u0111\u1ecbnh c\u1ee7a \u0110\u1ee9c Ch\u00faa Tr\u1eddi, \u0111\u01b0\u1ee3c k\u00eau g\u1ecdi l\u00e0m s\u1ee9 \u0111\u1ed3 c\u1ee7a \u0110\u1ea5ng Christ J\u00easus, c\u00f9ng v\u1edbi S\u1ed1t-then, anh em ch\u00fang ta,","score":1.0}]}
It takes:  54.14648246765137
         69989304 function calls (69805502 primitive calls) in 54.146 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   54.146   54.146 <ipython-input-18-999c0e58d729>:159(main)
        1    0.001    0.001   51.889   51.889 <ipython-input-18-999c0e58d729>:151(show_fina

        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\indexes\range.py:152(_data)
        6    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\log.py:181(instance_logger)
       11    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\indexes\base.py:5969(_maybe_cast_with_dtype)
        3    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\pool\base.py:303(_return_conn)
       34    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\dtypes\common.py:1307(is_float_dtype)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\internals\managers.py:628(astype)
       32    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\generic.py:3409(_set_as_cached)
        6    0.000    0.000    0.000  

In [20]:
def check_plagiarism(train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score

In [24]:
vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)
import time
start_time = time.time()
check_plagiarism( 'including you who are called to belong to Jesus Christ,'
,'as to zeal, a persecutor of the church; as to righteousness, under the law blameless.'
,vectorizer1)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0020012855529785156 seconds ---


In [28]:
def show_result(train_file,test_file):
    score = check_plagiarism(train_file,test_file,vectorizer1)
    if score >0.2:
        score = check_plagiarism(train_file,test_file,vectorizer2)
    return score

        

In [32]:
start_time = time.time()
show_result( 'including you who are called to belong to Jesus Christ,'
,'as to zeal, a persecutor of the church; as to righteousness, under the law blameless.'
)
a = time.time() - start_time
print("--- %s seconds ---" % (a))
print("--- %s seconds ---" % (a *300000))

--- 0.0009992122650146484 seconds ---
--- 299.76367950439453 seconds ---


In [40]:
import cProfile
import pandas as pd
#import sys
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string
import concurrent.futures
import time
import optparse


## Implement the string data

def remove_punctuations(txt, punct = string.punctuation):
    '''
    This function will remove punctuations from the input text
    '''
    return ''.join([c for c in txt if c not in punct])

def remove_stopwords(txt, sw = list(stopwords.words('english'))):
    '''
    This function will remove the stopwords from the input txt
    '''
    return ' '.join([w for w in txt.split() if w.lower() not in sw])


en_nlp = spacy.load('en_core_web_sm')

def clean_text(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)
    txt = remove_stopwords(txt)
    return txt.lower()

def clean_text_vi(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)

    return txt.lower()
def lemma_text(text_line):
    #test_file_clean = []
    doc_spacy = en_nlp(text_line)
    line = [ a.lemma_ for a in doc_spacy]
    #sentence = " ".join(line)
    #test_file_clean.append(sentence)
    return " ".join(line)



vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)

class Checker():
    ## Connect to the MySQL server
    engine = create_engine('mysql://root:kakalot123@localhost:3306/db')
    # book_id =177
    # language = 'en'
    def __init__(self,version,score,test_id):
        self.version = version
        self.score = score
        self.test_id = test_id
        if(self.version =='esv'):
            self.book_id =177
            self.language ='en'
        elif(self.version =='vie2010'):
            self.book_id =397
            self.language ='vi'
        self.train_query = """select verses.id,verses.book_id,verses.chapter,verses.number,verses.text,books.id as books_id,books.name_id,books.name,books.short_name,books.version,books.language from verses  inner join books on verses.book_id = books.id where verses.book_id!={} and books.language='{}' and books.version='{}'""".format(self.book_id,self.language,self.version)
        self.train_model = pd.read_sql_query(self.train_query, self.engine)
        # print(self.test_id)
        # print(self.version)
        # print(self.book_id)
        # print(self.language)
        self.test_query = """select verses.id,verses.book_id,verses.chapter,verses.number,verses.text,books.id as books_id,books.name_id,books.name,books.short_name,books.version,books.language from verses  inner join books on verses.book_id = books.id where verses.book_id={} and books.language='{}' and verses.id ={} and books.version='{}'""".format(self.book_id,self.language,self.test_id,self.version)
        self.test_model = pd.read_sql_query(self.test_query,self.engine).drop(['books_id','name','short_name','version','language'],axis=1)
        self.text = self.train_model['text'].tolist()
        self.verses_id = self.train_model['id'].tolist()
        self.s_vector = list(zip(self.verses_id,self.text))
        self.test_file = self.test_model['text'].apply(str)

    def check_plagiarism(self,train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score
    def show_result(self,train_file,test_file):

        score = self.check_plagiarism(train_file,test_file,vectorizer1)
        if score >float(self.score):
            score = self.check_plagiarism(train_file,test_file,vectorizer2)
        return score



    #def check_plagiarism_soft(self,test_file,s_vector):
    def check_plagiarism_soft(self):
#         self.test_file = test_file
#         self.text = text
#         self.verses_id = verses_id
#         self.s_vector = s_vector
        result = {"id":[],"score":[]}

        if(self.version=='esv'):
            self.text = [clean_text(txt) for txt in self.text]
            self.text = [lemma_text(txt) for txt in self.text ]
            self.test_file = clean_text(self.test_file)
            self.test_file = lemma_text(self.test_file)
        elif(self.version =='vie2010'):
            self.test_file = clean_text_vi(self.test_file)
        test_file_list =[self.test_file for i in range(len(self.text))]
#         for i in range(len(self.text)):
#             test_file_list.append()
        result['id'] = self.verses_id
        #score = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            score= executor.map(self.show_result,self.text,test_file_list)
        result['score'] = score

#         for a,v_vector in self.s_vector:
#             test_file = clean_text(self.test_file)

#             if(self.version=='esv'):
#                 v_vector = clean_text(v_vector)
#                 test_file = lemma_text(test_file)
#                 v_vector = lemma_text(v_vector)
#             elif(self.version =='vie2010'):
#                 v_vector = clean_text_vi(v_vector)
#             score = self.show_result(v_vector,test_file)
#             result['id'].append(a)
#                 #score.append(executor.submit(self.show_result, train_file=v_vector,test_file=test_file))
#             result['score'].append(score)

        #     result['score'] = score
        dt = pd.DataFrame(result)
        result_file =[dt.loc[dt['score']==max(dt['score'])]['id'].tolist()[0],dt.loc[dt['score']==max(dt['score'])]['score'].tolist()[0]]
        return result_file


    def show_final_result(self):
        #result = self.check_plagiarism_soft(self.test_file,self.s_vector)
        result = self.check_plagiarism_soft()
        result_model = pd.read_sql_query(f"select verses.id,verses.book_id,verses.chapter,verses.number,verses.text from verses where verses.id={result[0]}", self.engine)
        result_model['score'] = result[1]
        df = result_model.to_json(orient='table',index=False)
        return df

def main():
    start = time.time()
    #parser = optparse.OptionParser()

    # add options
#     parser.add_option('-v','--version', dest = 'version',
#                       type = 'string',
#                       help = 'choose english or vietnamese version')
#     parser.add_option('-t','--testid', dest = 'testid',
#                       type = 'string',
#                       help = 'please enter the test id')
#     parser.add_option('-s','--score', dest = 'score',
#                       type = 'string',
#                       help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    import cProfile, pstats
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats('cumtime')
    stats.print_stats()

vie2010
0.3
199107
{"schema":{"fields":[{"name":"id","type":"integer"},{"name":"book_id","type":"integer"},{"name":"chapter","type":"integer"},{"name":"number","type":"integer"},{"name":"text","type":"string"},{"name":"score","type":"number"}],"pandas_version":"0.20.0"},"data":[{"id":199539,"book_id":398,"chapter":1,"number":1,"text":"Phao-l\u00f4, b\u1edfi \u00fd \u0111\u1ecbnh c\u1ee7a \u0110\u1ee9c Ch\u00faa Tr\u1eddi, \u0111\u01b0\u1ee3c k\u00eau g\u1ecdi l\u00e0m s\u1ee9 \u0111\u1ed3 c\u1ee7a \u0110\u1ea5ng Christ J\u00easus, c\u00f9ng v\u1edbi S\u1ed1t-then, anh em ch\u00fang ta,","score":1.0}]}
It takes:  36.98989486694336
         1108008 function calls (1107824 primitive calls) in 36.991 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   36.991   36.991 <ipython-input-40-1fc22f8ae98f>:160(main)
        1    0.000    0.000   34.481   34.481 <ipython-input-40-1fc22f8ae98f>:152(show_final_

      103    0.000    0.000    0.000    0.000 {pandas._libs.lib.is_float}
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\event\attr.py:353(__init__)
       11    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\weakref.py:395(__setitem__)
        2    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\numexpr\necompiler.py:535(getContext)
       12    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\ipykernel\iostream.py:310(_is_master_process)
        3    0.000    0.000    0.000    0.000 {built-in method pandas._libs.missing.isnaobj}
        3    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\log.py:63(_should_log_debug)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\engine\result.py:1164(scalar)
        6    0.000    0.000    0.000    0.000 C:\Users\Computer

In [38]:
import cProfile
import pandas as pd
#import sys
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string
import concurrent.futures
import time
import optparse


## Implement the string data

def remove_punctuations(txt, punct = string.punctuation):
    '''
    This function will remove punctuations from the input text
    '''
    return ''.join([c for c in txt if c not in punct])

def remove_stopwords(txt, sw = list(stopwords.words('english'))):
    '''
    This function will remove the stopwords from the input txt
    '''
    return ' '.join([w for w in txt.split() if w.lower() not in sw])


en_nlp = spacy.load('en_core_web_sm')

def clean_text(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)
    txt = remove_stopwords(txt)
    return txt.lower()

def clean_text_vi(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)

    return txt.lower()
def lemma_text(text_line):
    #test_file_clean = []
    doc_spacy = en_nlp(text_line)
    line = [ a.lemma_ for a in doc_spacy]
    #sentence = " ".join(line)
    #test_file_clean.append(sentence)
    return " ".join(line)



vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)

class Checker():
    ## Connect to the MySQL server
    engine = create_engine('mysql://root:kakalot123@localhost:3306/db')
    # book_id =177
    # language = 'en'
    def __init__(self,version,score,test_id):
        self.version = version
        self.score = score
        self.test_id = test_id
        if(self.version =='esv'):
            self.book_id =177
            self.language ='en'
        elif(self.version =='vie2010'):
            self.book_id =397
            self.language ='vi'
        self.train_query = """select verses.id,verses.text from verses  inner join books on verses.book_id = books.id where verses.book_id!={} and books.language='{}' and books.version='{}'""".format(self.book_id,self.language,self.version)
        self.train_model = pd.read_sql_query(self.train_query, self.engine)
        # print(self.test_id)
        # print(self.version)
        # print(self.book_id)
        # print(self.language)
        self.test_query = """select verses.id,verses.text from verses  inner join books on verses.book_id = books.id where verses.book_id={} and books.language='{}' and verses.id ={} and books.version='{}'""".format(self.book_id,self.language,self.test_id,self.version)
        self.test_model = pd.read_sql_query(self.test_query,self.engine)
        self.text = self.train_model['text'].tolist()
        self.verses_id = self.train_model['id'].tolist()
        self.s_vector = list(zip(self.verses_id,self.text))
        self.test_file = self.test_model['text'].apply(str)

    def check_plagiarism(self,train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score
    def show_result(self,train_file,test_file):

        score = self.check_plagiarism(train_file,test_file,vectorizer1)
        if score >float(self.score):
            score = self.check_plagiarism(train_file,test_file,vectorizer2)
        return score



    #def check_plagiarism_soft(self,test_file,s_vector):
    def check_plagiarism_soft(self):
#         self.test_file = test_file
#         self.text = text
#         self.verses_id = verses_id
#         self.s_vector = s_vector
        result = {"id":[],"score":[]}

        if(self.version=='esv'):
            self.text = [clean_text(txt) for txt in self.text]
            self.text = [lemma_text(txt) for txt in self.text ]
            self.test_file = clean_text(self.test_file)
            self.test_file = lemma_text(self.test_file)
        elif(self.version =='vie2010'):
            self.test_file = clean_text_vi(self.test_file)
        test_file_list =[self.test_file for i in range(len(self.text))]
#         for i in range(len(self.text)):
#             test_file_list.append()
        result['id'] = self.verses_id
        #score = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            score= executor.map(self.show_result,self.text, test_file_list)
        result['score'] = score

#         for a,v_vector in self.s_vector:
#             test_file = clean_text(self.test_file)

#             if(self.version=='esv'):
#                 v_vector = clean_text(v_vector)
#                 test_file = lemma_text(test_file)
#                 v_vector = lemma_text(v_vector)
#             elif(self.version =='vie2010'):
#                 v_vector = clean_text_vi(v_vector)
#             score = self.show_result(v_vector,test_file)
#             result['id'].append(a)
#                 #score.append(executor.submit(self.show_result, train_file=v_vector,test_file=test_file))
#             result['score'].append(score)

        #     result['score'] = score
        dt = pd.DataFrame(result)
        result_file =[dt.loc[dt['score']==max(dt['score'])]['id'].tolist()[0],dt.loc[dt['score']==max(dt['score'])]['score'].tolist()[0]]
        return result_file


    def show_final_result(self):
        #result = self.check_plagiarism_soft(self.test_file,self.s_vector)
        result = self.check_plagiarism_soft()
        result_model = pd.read_sql_query(f"select verses.id,verses.book_id,verses.chapter,verses.number,verses.text from verses where verses.id={result[0]}", self.engine)
        result_model['score'] = result[1]
        df = result_model.to_json(orient='table',index=False)
        return df

def main():
    start = time.time()
    #parser = optparse.OptionParser()

    # add options
#     parser.add_option('-v','--version', dest = 'version',
#                       type = 'string',
#                       help = 'choose english or vietnamese version')
#     parser.add_option('-t','--testid', dest = 'testid',
#                       type = 'string',
#                       help = 'please enter the test id')
#     parser.add_option('-s','--score', dest = 'score',
#                       type = 'string',
#                       help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    import cProfile, pstats
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats('cumtime')
    stats.print_stats()

vie2010
0.3
199107
{"schema":{"fields":[{"name":"id","type":"integer"},{"name":"book_id","type":"integer"},{"name":"chapter","type":"integer"},{"name":"number","type":"integer"},{"name":"text","type":"string"},{"name":"score","type":"number"}],"pandas_version":"0.20.0"},"data":[{"id":199539,"book_id":398,"chapter":1,"number":1,"text":"Phao-l\u00f4, b\u1edfi \u00fd \u0111\u1ecbnh c\u1ee7a \u0110\u1ee9c Ch\u00faa Tr\u1eddi, \u0111\u01b0\u1ee3c k\u00eau g\u1ecdi l\u00e0m s\u1ee9 \u0111\u1ed3 c\u1ee7a \u0110\u1ea5ng Christ J\u00easus, c\u00f9ng v\u1edbi S\u1ed1t-then, anh em ch\u00fang ta,","score":1.0}]}
It takes:  35.24558639526367
         1107168 function calls (1107012 primitive calls) in 35.245 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   35.245   35.245 <ipython-input-38-29646a735120>:160(main)
        1    0.000    0.000   33.097   33.097 <ipython-input-38-29646a735120>:152(show_final_

        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\sqlalchemy\sql\compiler.py:489(process)
       28    0.000    0.000    0.000    0.000 {method 'pop' of 'dict' objects}
        4    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\internals\managers.py:1699(<listcomp>)
        1    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\MySQLdb\connections.py:190(<listcomp>)
        7    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\dtypes\common.py:615(is_dtype_equal)
       83    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\internals\blocks.py:265(mgr_locs)
        3    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\typing.py:771(__subclasscheck__)
       52    0.000    0.000    0.000    0.000 C:\Users\Computer\anaconda3\lib\site-packages\pandas\core\internals\blocks.py:3

In [41]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  6


In [None]:
import cProfile
import pandas as pd
#import sys
from sqlalchemy import create_engine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string
import concurrent.futures
import time
import optparse


## Implement the string data

def remove_punctuations(txt, punct = string.punctuation):
    '''
    This function will remove punctuations from the input text
    '''
    return ''.join([c for c in txt if c not in punct])

def remove_stopwords(txt, sw = list(stopwords.words('english'))):
    '''
    This function will remove the stopwords from the input txt
    '''
    return ' '.join([w for w in txt.split() if w.lower() not in sw])


en_nlp = spacy.load('en_core_web_sm')

def clean_text(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)
    txt = remove_stopwords(txt)
    return txt.lower()

def clean_text_vi(txt):
    '''
    This function will clean the text being passed by removing specific line feed characters
    like '\n', '\r', and '\'
    '''

    txt = txt.replace('\n', ' ').replace('\r', ' ').replace('\'', '')
    txt = remove_punctuations(txt)

    return txt.lower()
def lemma_text(text_line):
    #test_file_clean = []
    doc_spacy = en_nlp(text_line)
    line = [ a.lemma_ for a in doc_spacy]
    #sentence = " ".join(line)
    #test_file_clean.append(sentence)
    return " ".join(line)



vectorizer1 = CountVectorizer(ngram_range=(2,4),stop_words="english")
vectorizer2 = CountVectorizer(ngram_range=(2,4),stop_words="english",min_df=0.55)

class Checker():
    ## Connect to the MySQL server
    engine = create_engine('mysql://root:kakalot123@localhost:3306/db')
    # book_id =177
    # language = 'en'
    def __init__(self,version,score,test_id):
        self.version = version
        self.score = score
        self.test_id = test_id
        if(self.version =='esv'):
            self.book_id =177
            self.language ='en'
        elif(self.version =='vie2010'):
            self.book_id =397
            self.language ='vi'
        self.train_query = """select verses.id,verses.text from verses  inner join books on verses.book_id = books.id where verses.book_id!={} and books.language='{}' and books.version='{}'""".format(self.book_id,self.language,self.version)
        self.train_model = pd.read_sql_query(self.train_query, self.engine)
        # print(self.test_id)
        # print(self.version)
        # print(self.book_id)
        # print(self.language)
        self.test_query = """select verses.id,verses.text from verses  inner join books on verses.book_id = books.id where verses.book_id={} and books.language='{}' and verses.id ={} and books.version='{}'""".format(self.book_id,self.language,self.test_id,self.version)
        self.test_model = pd.read_sql_query(self.test_query,self.engine)
        self.text = self.train_model['text'].tolist()
        self.verses_id = self.train_model['id'].tolist()
        self.s_vector = list(zip(self.verses_id,self.text))
        self.test_file = self.test_model['text'].apply(str)

    def check_plagiarism(self,train_file,test_file,vectorizer):
        documents = [train_file,test_file]
        sparse_matrix = vectorizer.fit_transform(documents)
        score = cosine_similarity(sparse_matrix)[0][1]
        return score
    def show_result(self,train_file,test_file):

        score = self.check_plagiarism(train_file,test_file,vectorizer1)
        if score >float(self.score):
            score = self.check_plagiarism(train_file,test_file,vectorizer2)
        return score



    #def check_plagiarism_soft(self,test_file,s_vector):
    def check_plagiarism_soft(self):
#         self.test_file = test_file
#         self.text = text
#         self.verses_id = verses_id
#         self.s_vector = s_vector
        result = {"id":[],"score":[]}

        if(self.version=='esv'):
            self.text = [clean_text(txt) for txt in self.text]
            self.text = [lemma_text(txt) for txt in self.text ]
            self.test_file = clean_text(self.test_file)
            self.test_file = lemma_text(self.test_file)
        elif(self.version =='vie2010'):
            self.test_file = clean_text_vi(self.test_file)
        test_file_list =[self.test_file for i in range(len(self.text))]
#         for i in range(len(self.text)):
#             test_file_list.append()
        result['id'] = self.verses_id
        #score = []
#         with concurrent.futures.ThreadPoolExecutor() as executor:
#             score= executor.map(self.show_result,self.text,test_file_list)
        score = pool.starmap(self.show_result, zip(self.text,test_file_list))
        pool.close()  
        result['score'] = score

#         for a,v_vector in self.s_vector:
#             test_file = clean_text(self.test_file)

#             if(self.version=='esv'):
#                 v_vector = clean_text(v_vector)
#                 test_file = lemma_text(test_file)
#                 v_vector = lemma_text(v_vector)
#             elif(self.version =='vie2010'):
#                 v_vector = clean_text_vi(v_vector)
#             score = self.show_result(v_vector,test_file)
#             result['id'].append(a)
#                 #score.append(executor.submit(self.show_result, train_file=v_vector,test_file=test_file))
#             result['score'].append(score)

        #     result['score'] = score
        dt = pd.DataFrame(result)
        result_file =[dt.loc[dt['score']==max(dt['score'])]['id'].tolist()[0],dt.loc[dt['score']==max(dt['score'])]['score'].tolist()[0]]
        return result_file


    def show_final_result(self):
        #result = self.check_plagiarism_soft(self.test_file,self.s_vector)
        result = self.check_plagiarism_soft()
        result_model = pd.read_sql_query(f"select verses.id,verses.book_id,verses.chapter,verses.number,verses.text from verses where verses.id={result[0]}", self.engine)
        result_model['score'] = result[1]
        df = result_model.to_json(orient='table',index=False)
        return df

def main():
    start = time.time()
    #parser = optparse.OptionParser()

    # add options
#     parser.add_option('-v','--version', dest = 'version',
#                       type = 'string',
#                       help = 'choose english or vietnamese version')
#     parser.add_option('-t','--testid', dest = 'testid',
#                       type = 'string',
#                       help = 'please enter the test id')
#     parser.add_option('-s','--score', dest = 'score',
#                       type = 'string',
#                       help = 'please enter the algorithm score')


#     (options, args) = parser.parse_args()
#     if (options.version == None):
#             print ("Version is not null")
#             exit(0)
#     else:
#             version = options.version

#     if (options.testid == None):
#             print ("Test id is not null")
#             exit(0)
#     else:
#             testid = options.testid

#     if (options.score == None):
#             print ("The score is not null")
#             exit(0)
#     else:
#             score = options.score
    my_checker = Checker('vie2010','0.3','199107')
    print(my_checker.version)
    print(my_checker.score)
    print(my_checker.test_id)
    print(my_checker.show_final_result())

    end = time.time()
    print("It takes: ",end -start)


if __name__ == "__main__":
    import cProfile, pstats
    profiler = cProfile.Profile()
    profiler.enable()
    main()
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats('cumtime')
    stats.print_stats()

vie2010
0.3
199107


In [1]:
import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())
print(mp.cpu_count())

6
