In [3]:

import time
import spacy
import stanza
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
import os
from nltk.tokenize import sent_tokenize


In [4]:
# Load the source table; the first CSV column is used as the row index.
df_summary = pd.read_csv(
    filepath_or_buffer='final_version_cropped_first1000.csv',
    index_col=0,
)
# Preview the first five rows.
df_summary.head(n=5)

Unnamed: 0.1,Unnamed: 0,pdf_link,summary,text_extracted
0,0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...
1,1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
2,2,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission announc...,IN THE UNITED STATES DISTRICT COURT FOR THE NO...
3,3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...
4,4,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...


In [3]:
def split_text(text:str)->list:
    """
    Break a block of text into sentences.

    This is a thin wrapper that hands the text to NLTK's ``sent_tokenize``
    and returns whatever it produces.

    Args:
        text: the text to segment

    Returns:
        the list of sentences produced by ``sent_tokenize``
    """
    return sent_tokenize(text)

In [4]:
# Split the summary of the row labelled 0 into sentences and display them.
split_text(df_summary.loc[0, 'summary'])

['CORRECTEDThe Securities and Exchange Commission today charged the former President and CEO of Santa Clara, Calif. technology company Genesis Microchip, Inc. with insider trading.',
 "The Commission alleges that Elias Antoun, who resides in San Jose, bought Genesis stock in the brokerage accounts of a relative and a friend while in the midst of confidential merger negotiations with STMicroelectronics, one of the world's largest semiconductor companies.The SEC also charged Antoun's childhood friend, Samir Abed of Thousand Oaks, who purchased Genesis stock and options after learning of the merger negotiations from Antoun.",
 "Both Antoun and Abed, who netted profits of approximately $33,975 and $51,206, respectively, when the merger was announced, agreed to settle the SEC's charges without admitting or denying the Commission's allegations.The Commission's complaint, filed in federal district court in San Jose, alleges that Genesis, a supplier of image processors for flat-panel TVs and m

In [9]:
# Write the sentences of summary 10 (the range covers only i == 10) to
# falsify_data/summary_10.txt, each sentence followed by a blank line.
# open(..., 'w') does not create directories, so falsify_data/ must exist.
for i in range(10, 11):
    sentences = split_text(df_summary.loc[i, 'summary'])
    with open(f'falsify_data/summary_{i}.txt', mode='w') as out_file:
        out_file.writelines(sentence + '\n\n' for sentence in sentences)

In [18]:
# Print the number of sentences split_text finds in each of the summaries
# labelled 0-9 (one count per line).
for i in range(10):
    print(len(split_text(df_summary.loc[i, 'summary'])))

11
19
11
9
9
9
11
12
8
15


In [10]:
# keep the first 11 rows (positions 0-10) and show the result
df_summary = df_summary.head(11)
df_summary

Unnamed: 0.1,Unnamed: 0,pdf_link,summary,text_extracted
0,0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...
1,1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
2,2,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission announc...,IN THE UNITED STATES DISTRICT COURT FOR THE NO...
3,3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...
4,4,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...
5,5,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...
6,6,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...
7,7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...
8,8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...
9,9,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...


In [12]:
# Remove the 'Unnamed: 0' column. This mutates df_summary in place, so
# running the cell a second time raises KeyError (the column is gone).
df_summary.drop(['Unnamed: 0'], axis=1, inplace=True)
df_summary

Unnamed: 0,pdf_link,summary,text_extracted
0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...
1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
2,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission announc...,IN THE UNITED STATES DISTRICT COURT FOR THE NO...
3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...
4,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...
5,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...
6,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...
7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...
8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...
9,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...


In [13]:
# drop the row labelled 2 (the third row) in place; the index keeps a gap
# at 2 until it is renumbered
df_summary.drop(2, axis=0, inplace=True)
df_summary

Unnamed: 0,pdf_link,summary,text_extracted
0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...
1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...
4,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...
5,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...
6,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...
7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...
8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...
9,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...
10,https://www.sec.gov//litigation/complaints/200...,"On October 23, 2008, the United States Securit...",Robert Long \nAttorney for Plaintiff \nU.S. Se...


In [14]:
# Renumber the rows 0..n-1 in place, discarding the previous index labels.
df_summary.index = pd.RangeIndex(len(df_summary))
df_summary

Unnamed: 0,pdf_link,summary,text_extracted
0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...
1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
2,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...
3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...
4,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...
5,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...
6,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...
7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...
8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...
9,https://www.sec.gov//litigation/complaints/200...,"On October 23, 2008, the United States Securit...",Robert Long \nAttorney for Plaintiff \nU.S. Se...


In [27]:
# Ten lists of integer positions, one list per summary, in row order.
# NOTE(review): presumably these are the positions of the sentences that were
# altered in each falsified summary — confirm against the files in falsify_data/.
falsified_index = [
    [2, 5, 6, 8, 9],
    [0, 1, 2, 4, 5, 6, 8, 12, 13, 18],
    [1, 2, 5, 6, 8, 9],
    [0, 1, 4, 7, 8],
    [0, 1, 3, 4, 6, 7],
    [0, 1, 3, 6, 7],
    list(range(0, 6)),    # 0..5
    list(range(6, 12)),   # 6..11
    list(range(1, 6)),    # 1..5
    list(range(7, 15)),   # 7..14
]

In [23]:
# Read falsify_data/summary_0.txt ... summary_9.txt, in that order, and keep
# each file's full text as one list element.
falsified_summary_list = []
for i in range(10):
    with open(f'falsify_data/summary_{i}.txt', mode='r') as in_file:
        falsified_summary_list.append(in_file.read())
falsified_summary_list
    
        

["CORRECTEDThe Securities and Exchange Commission today charged the former President and CEO of Santa Clara, Calif. technology company Genesis Microchip, Inc. with insider trading. The Commission alleges that Elias Antoun, who resides in San Jose, bought Genesis stock in the brokerage accounts of a relative and a friend while in the midst of confidential merger negotiations with STMicroelectronics, one of the world's largest semiconductor companies.The SEC also charged Antoun's childhood friend, Samir Abed of Thousand Oaks, who purchased Genesis stock and options after learning of the merger negotiations from Antoun. Both Antoun and Abed, who netted profits of approximately $43,870 and $81,296, respectively, when the merger was announced, agreed to settle the SEC's charges without admitting or denying the Commission's allegations.The Commission's complaint, filed in federal district court in Los Angeles, alleges that Genesis, a supplier of image processors for flat-panel TVs and monito

In [24]:
# Store the loaded texts as a new column; element k of the list goes to the
# k-th row of df_summary.
df_summary['falsified_summary'] = pd.Series(
    falsified_summary_list, index=df_summary.index
)
df_summary

Unnamed: 0,pdf_link,summary,text_extracted,falsified_summary
0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...,CORRECTEDThe Securities and Exchange Commissio...
1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE...",The Canadian Securities and Exchange Commissio...
2,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...,"The Securities and Exchange Commission (""Commi..."
3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...,"The Securities and Exchange Commission (""Commi..."
4,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...,The Securities and Exchange Commission today f...
5,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...,"On October 30, the Securities and Exchange Com..."
6,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...,The Federal Trade Commission launched an unres...
7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...,The Securities and Exchange Commission today c...
8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...,The Securities and Exchange Commission today c...
9,https://www.sec.gov//litigation/complaints/200...,"On October 23, 2008, the United States Securit...",Robert Long \nAttorney for Plaintiff \nU.S. Se...,"On October 23, 2008, the United States Securit..."


In [28]:
# Store the per-summary index lists as a new column; element k of
# falsified_index goes to the k-th row of df_summary.
df_summary['falsified_index'] = pd.Series(
    falsified_index, index=df_summary.index
)
df_summary

Unnamed: 0,pdf_link,summary,text_extracted,falsified_summary,falsified_index
0,https://www.sec.gov//litigation/complaints/200...,CORRECTEDThe Securities and Exchange Commissio...,TRACY L. DAVIS (Cal. Bar No. 184129) Attorne...,CORRECTEDThe Securities and Exchange Commissio...,"[2, 5, 6, 8, 9]"
1,https://www.sec.gov//litigation/complaints/200...,The United States Securities and Exchange Comm...,"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE...",The Canadian Securities and Exchange Commissio...,"[0, 1, 2, 4, 5, 6, 8, 12, 13, 18]"
2,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",2006 SEP 30 AN 8: 24 \nU.S: COURT MIBDLE GISTR...,"The Securities and Exchange Commission (""Commi...","[1, 2, 5, 6, 8, 9]"
3,https://www.sec.gov//litigation/complaints/200...,"The Securities and Exchange Commission (""Commi...",IN THE UNITED STATES DISTRICT COURT FOR THE MI...,"The Securities and Exchange Commission (""Commi...","[0, 1, 4, 7, 8]"
4,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today f...,08-61524-CIV-DIMITROULEAS/ROSENBAUM \nUNITED S...,The Securities and Exchange Commission today f...,"[0, 1, 3, 4, 6, 7]"
5,https://www.sec.gov//litigation/complaints/200...,"On September 30, the Securities and Exchange C...",IN THE UNITED STATES DISTRICT COURT FOR THE EA...,"On October 30, the Securities and Exchange Com...","[0, 1, 3, 6, 7]"
6,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission filed a...,UNITED STATES DISTRICT COURT DISTRICT OF MASSA...,The Federal Trade Commission launched an unres...,"[0, 1, 2, 3, 4, 5]"
7,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,Scott L. Black (Bar Number 514792) \nAttorney ...,The Securities and Exchange Commission today c...,"[6, 7, 8, 9, 10, 11]"
8,https://www.sec.gov//litigation/complaints/200...,The Securities and Exchange Commission today c...,oOo OD DH FP WYN \n= er SO \nJOHN M. McCOY III...,The Securities and Exchange Commission today c...,"[1, 2, 3, 4, 5]"
9,https://www.sec.gov//litigation/complaints/200...,"On October 23, 2008, the United States Securit...",Robert Long \nAttorney for Plaintiff \nU.S. Se...,"On October 23, 2008, the United States Securit...","[7, 8, 9, 10, 11, 12, 13, 14]"


In [29]:
# Persist the table; with to_csv's defaults the row index is written as the
# first (unnamed) CSV column.
df_summary.to_csv(path_or_buf='falsified_summary.csv')

In [36]:
# Write the `summary` text of the row labelled 1 to summary_1_0.txt.
# The falsify_data/summary_1/ directory must already exist.
with open('falsify_data/summary_1/summary_1_0.txt', mode='w') as out_file:
    original_summary = df_summary.loc[1, 'summary']
    out_file.write(original_summary)

In [39]:
# Rebinds `falsified_index` (replacing the earlier ten-entry list) with five
# lists of integer positions; the first list is empty.
# NOTE(review): presumably one list per summary_1_<k>.txt variant — confirm.
falsified_index = [
    [],
    list(range(5)),                       # 0..4
    [0, 1, 2, 4, 5, 6, 8, 12, 13, 18],
    list(range(1, 15)),                   # 1..14
    list(range(19)),                      # 0..18
]

In [37]:
# Read falsify_data/summary_1/summary_1_0.txt ... summary_1_4.txt, in that
# order, and keep each file's full text as one list element.
falsified_summary_list_1 = []
for i in range(5):
    with open(f'falsify_data/summary_1/summary_1_{i}.txt', mode='r') as in_file:
        falsified_summary_list_1.append(in_file.read())
falsified_summary_list_1

['The United States Securities and Exchange Commission announced that on December 29, 2008, it filed an emergency action to halt a Ponzi scheme and affinity fraud conducted by Creative Capital Consortium, LLC and A Creative Capital Concept$, LLC (collectively, Creative Capital), and its principal, George L. Theodule. According to the Commission\'s complaint, the defendants raised at least $23.4 million from thousands of investors in the Haitian-American community nationwide through a network of purported investment clubs Theodule directs investors to form. Also on December 29, 2008 Judge Donald M. Middlebrooks, U.S. District Judge for the Southern District of Florida, issued an order placing Creative Capital under the control of a receiver to safeguard assets, as well as other emergency orders, including temporary restraining orders and asset freezes.The Commission\'s complaint alleges that starting in at least November 2007, Theodule, directly and through Creative Capital, raised at l

In [40]:
# Build a two-column table pairing each loaded text with its index list;
# both lists must have the same length.
variant_columns = {
    'falsified_summary': falsified_summary_list_1,
    'falsified_index': falsified_index,
}
falsified_summary_1 = pd.DataFrame(data=variant_columns)
falsified_summary_1

Unnamed: 0,falsified_summary,falsified_index
0,The United States Securities and Exchange Comm...,[]
1,The United Kingdom Financial Conduct Authority...,"[0, 1, 2, 3, 4]"
2,The Canadian Securities and Exchange Commissio...,"[0, 1, 2, 4, 5, 6, 8, 12, 13, 18]"
3,The United States Securities and Exchange Comm...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]"
4,The Canadian Financial Conduct Authority annou...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [41]:
# add original text: the `text_extracted` value of the df_summary row
# labelled 1 is a single string, so it is repeated on every row
source_text = df_summary.loc[1, 'text_extracted']
falsified_summary_1['text_extracted'] = source_text
falsified_summary_1

Unnamed: 0,falsified_summary,falsified_index,text_extracted
0,The United States Securities and Exchange Comm...,[],"ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
1,The United Kingdom Financial Conduct Authority...,"[0, 1, 2, 3, 4]","ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
2,The Canadian Securities and Exchange Commissio...,"[0, 1, 2, 4, 5, 6, 8, 12, 13, 18]","ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
3,The United States Securities and Exchange Comm...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."
4,The Canadian Financial Conduct Authority annou...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","ELECTRONIC \nDEC 29, 2008 \nSTEVEN M, LARIMORE..."


In [None]:
# Persist the variants table; with to_csv's defaults the row index is written
# as the first (unnamed) CSV column.
falsified_summary_1.to_csv(path_or_buf='falsified_summary_1.csv')