# DSAA 5002 - Data Mining and Knowledge Discovery in Data Science
---

# Task 1 (50 marks) Data Preprocessing and Analysis

**Background:** 
**Assuming you are a sentiment analyst at a securities firm, your task is to assess the impact of each news article on the A-share listed companies explicitly mentioned.**

# Q1. Data Preprocessing - Noise Removal
---

## 1. Search by Rules of Name Trigger Words for Training domain set
- Generate a "training domain set" (i.e., a "completely accurate set of relevancy") using rules under the tightest boundary.
- Record the relevant stock names in each news-row.

### 1.1. Trigger Words

In [2]:
import json
import jieba
import re

In [3]:
def create_company_lookup(company_list):
    # Use a set deduplicating the items
    company_set = set()
    for company in company_list:
        company_set.add(company["name"])
        
    print("The number of company name: {}".format(len(company_list)))
    print("The number of company name to be searched: {}".format(len(company_set)))
    return company_set

# Create the Trigger Words Set of A_share Company name
with open("News_input\\A_share_list\\new_A_share_list.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)
company_set = create_company_lookup(a_share_list)


The number of company name: 4654
The number of company name to be searched: 4625


In [4]:
def find_duplicate_companies(company_list):
    seen = set()
    duplicate_companies = set()

    for company in company_list:
        company_name = company["name"]
        if company_name in seen:
            duplicate_companies.add(company_name)
        else:
            seen.add(company_name)

    return duplicate_companies

# Call this function to find duplicate company names
duplicate_companies = find_duplicate_companies(a_share_list)

print("Duplicate company names:", duplicate_companies)
print("Number of duplicate company names:", len(duplicate_companies))


Duplicate company names: {'震安科技', '宏达新材', '爱迪尔', '科融环境', '东方明珠', '建研集团', '创业黑马', '金通灵', '厚普股份', '航发控制', '银泰资源', '风语筑', '中铁工业', '退市博元', '慈铭体检', '奇信股份', '当代明诚', '新黄浦', '恒林股份', '吉宏股份', '奥赛康', '一心堂', '华峰超纤', '三丰智能', '贤丰控股', '康达新材', '长江润发', '贝因美'}
Number of duplicate company names: 28


### 1.2. Noise Removal for Training domain set

In [4]:
from concurrent.futures import ThreadPoolExecutor
# from tqdm import tqdm
import pandas as pd
import threading

# Create a global lock to ensure thread safety
lock = threading.Lock()

# Function to process a single row of data
def process_row(row, company_set, result_list, drop_list):
    title = row["Title"]
    news_content = row["NewsContent"]
   
    if pd.notna(title) and pd.notna(news_content):      
        news = title + news_content
        if any(company in news for company in company_set):
            # Combine the four columns of this row into one
            combined_row = {
                "NewsID": row["NewsID"],
                "Title": title,
                "NewsContent": news_content,
                "NewsSource": row["NewsSource"],
            }
            with lock:
                result_list.append(combined_row)  # Add the row that meets the condition to the result list
        else:
            # Combine the four columns of this row into one
            combined_row = {
                "NewsID": row["NewsID"],
                "Title": title,
                "NewsContent": news_content,
                "NewsSource": row["NewsSource"]
            }
            with lock:
                drop_list.append(combined_row)  # Add the row that meets the condition to the drop list

# Main processing function
def process_data(start, end, full_data, company_set, result_list, drop_list):
    for i in range(start, end):
        if i % 1000 == 0:
            print("{} rows have been done".format(i))
            print("--- ")
        process_row(full_data.iloc[i], company_set, result_list, drop_list)

In [5]:
# Read an Excel file
full_data = pd.read_excel("News_input\\News.xlsx")

In [6]:
# Splitting data into chunks
num_threads = 21  # Specify the number of threads
chunk_size = len(full_data) // num_threads

threads = []
result_list = []
drop_list= []
print("Processing data with {} threads...".format(num_threads))

for i in range(num_threads):
    start = i * chunk_size
    end = (i + 1) * chunk_size if i < num_threads - 1 else len(full_data)
    thread = threading.Thread(target=process_data, args=(start, end, full_data, company_set, result_list, drop_list))
    threads.append(thread)
    print("Thread {} is processing rows {} to {}...".format(i + 1, start, end))
    
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()

print("All threads have finished processing.\n")

print("Before selected by Rule, number of news is: {}".format(len(full_data)))
print("After selected by Rule, number of news is: {}".format(len(result_list)))
print("Droped by Rule, number of news is: {}".format(len(drop_list)))

# Merging results
result_df = pd.concat([pd.DataFrame(result_list)])
drop_df = pd.concat([pd.DataFrame(drop_list)])


Processing data with 21 threads...
Thread 1 is processing rows 0 to 49382...
Thread 2 is processing rows 49382 to 98764...
Thread 3 is processing rows 98764 to 148146...
Thread 4 is processing rows 148146 to 197528...
Thread 5 is processing rows 197528 to 246910...
Thread 6 is processing rows 246910 to 296292...
Thread 7 is processing rows 296292 to 345674...
Thread 8 is processing rows 345674 to 395056...
Thread 9 is processing rows 395056 to 444438...
Thread 10 is processing rows 444438 to 493820...
Thread 11 is processing rows 493820 to 543202...
Thread 12 is processing rows 543202 to 592584...
Thread 13 is processing rows 592584 to 641966...
Thread 14 is processing rows 641966 to 691348...
Thread 15 is processing rows 691348 to 740730...
Thread 16 is processing rows 740730 to 790112...
Thread 17 is processing rows 790112 to 839494...
Thread 18 is processing rows 839494 to 888876...
Thread 19 is processing rows 888876 to 938258...
Thread 20 is processing rows 938258 to 987640...
Thr

307000 rows have been done
--- 
702000 rows have been done
--- 
554000 rows have been done
--- 
949000 rows have been done
--- 
11000 rows have been done
--- 
406000 rows have been done
--- 
801000 rows have been done
--- 
653000 rows have been done
--- 
110000 rows have been done
--- 
258000 rows have been done
--- 
505000 rows have been done
--- 
900000 rows have been done
--- 
209000 rows have been done
--- 
357000 rows have been done
--- 
752000 rows have been done
--- 
604000 rows have been done
--- 
999000 rows have been done
--- 
61000 rows have been done
--- 
456000 rows have been done
--- 
851000 rows have been done
--- 
160000 rows have been done
--- 
308000 rows have been done
--- 
703000 rows have been done
--- 
555000 rows have been done
--- 
950000 rows have been done
--- 
12000 rows have been done
--- 
407000 rows have been done
--- 
802000 rows have been done
--- 
654000 rows have been done
--- 
111000 rows have been done
--- 
259000 rows have been done
--- 
506000 rows

418000 rows have been done
--- 
813000 rows have been done
--- 
122000 rows have been done
--- 
665000 rows have been done
--- 
517000 rows have been done
--- 
270000 rows have been done
--- 
912000 rows have been done
--- 
221000 rows have been done
--- 
369000 rows have been done
--- 
764000 rows have been done
--- 
1011000 rows have been done
--- 
616000 rows have been done
--- 
73000 rows have been done
--- 
468000 rows have been done
--- 
172000 rows have been done
--- 
863000 rows have been done
--- 
320000 rows have been done
--- 
715000 rows have been done
--- 
567000 rows have been done
--- 
962000 rows have been done
--- 
24000 rows have been done
--- 
419000 rows have been done
--- 
814000 rows have been done
--- 
123000 rows have been done
--- 
666000 rows have been done
--- 
518000 rows have been done
--- 
271000 rows have been done
--- 
913000 rows have been done
--- 
222000 rows have been done
--- 
370000 rows have been done
--- 
765000 rows have been done
--- 
1012000 r

282000 rows have been done
--- 
924000 rows have been done
--- 
233000 rows have been done
--- 
381000 rows have been done
--- 
776000 rows have been done
--- 
1023000 rows have been done
--- 
628000 rows have been done
--- 
85000 rows have been done
--- 
184000 rows have been done
--- 
480000 rows have been done
--- 
875000 rows have been done
--- 
727000 rows have been done
--- 
332000 rows have been done
--- 
579000 rows have been done
--- 
974000 rows have been done
--- 
36000 rows have been done
--- 
431000 rows have been done
--- 
826000 rows have been done
--- 
135000 rows have been done
--- 
678000 rows have been done
--- 
530000 rows have been done
--- 
283000 rows have been done
--- 
925000 rows have been done
--- 
234000 rows have been done
--- 
382000 rows have been done
--- 
777000 rows have been done
--- 
1024000 rows have been done
--- 
629000 rows have been done
--- 
86000 rows have been done
--- 
185000 rows have been done
--- 
481000 rows have been done
--- 
876000 ro

640000 rows have been done
--- 
1035000 rows have been done
--- 
97000 rows have been done
--- 
196000 rows have been done
--- 
492000 rows have been done
--- 
887000 rows have been done
--- 
739000 rows have been done
--- 
344000 rows have been done
--- 
591000 rows have been done
--- 
986000 rows have been done
--- 
48000 rows have been done
--- 
147000 rows have been done
--- 
443000 rows have been done
--- 
838000 rows have been done
--- 
690000 rows have been done
--- 
542000 rows have been done
--- 
295000 rows have been done
--- 
937000 rows have been done
--- 
246000 rows have been done
--- 
789000 rows have been done
--- 
394000 rows have been done
--- 
641000 rows have been done
--- 
1036000 rows have been done
--- 
98000 rows have been done
--- 
197000 rows have been done
--- 
493000 rows have been done
--- 
888000 rows have been done
--- 
740000 rows have been done
--- 
345000 rows have been done
--- 
592000 rows have been done
--- 
987000 rows have been done
--- 
49000 row

In [7]:
# Save the filtered data to new Excel files
result_df.to_excel("News_output\\News_training_domain_set_withoutCompany.xlsx", index=False)
drop_df.to_excel("News_input\\News_2nd_filting_set.xlsx", index=False)

### 1.3. Apply explicit_company to each news inTraining domain set

In [20]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import threading
import json
import jieba
import re

# Define Trie dictionary tree node
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False
        self.company_name = None  # Store the company name

# Create a Trie dictionary tree
def build_trie(company_list):
    root = TrieNode()
    for company in company_list:
        node = root
        name = company["name"]
        for char in name:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.company_name = name  # Store the company name
    return root

# Function to extract companies from text
def extract_companies_from_text(text, trie_root):
    companies = set()

    def search_trie(text, node):
        for i in range(len(text)):
            char = text[i]
            if char in node.children:
                node = node.children[char]
                if node.is_end_of_word:
                    companies.add(node.company_name)
            else:
                break

    for i in range(len(text)):
        search_trie(text[i:], trie_root)

    return companies

In [21]:
# Read the JSON file containing company information
with open("News_input\\A_share_list\\new_A_share_list.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)

# Build a trie (prefix tree) using the data from the JSON file (assuming 'build_trie()' function is defined elsewhere)
a_share_trie = build_trie(a_share_list)


In [25]:
# Updated extraction_row function
def extraction_row(row, trie_root):
    title = row["Title"]
    news_content = row["NewsContent"]
    news = title + news_content

    matched_company_names = extract_companies_from_text(news, trie_root)

    # Convert the matched company names set to a comma-separated string
    matched_company_names_str = ", ".join(matched_company_names)

    return {
        "NewsID": row["NewsID"],
        "Title": title,
        "NewsContent": news_content,
        "NewsSource": row["NewsSource"],
        "Explicit_Company": matched_company_names_str  # Extracted matched company names
    }

# Main processing function
def extraction_data(start, end, full_data, trie_root, extraction_list):
    for i in range(start, end):
        if i % 100 == 0:
            print("--- {} rows have been done ---".format(i))
        combined_row = extraction_row(full_data.iloc[i], trie_root)
        with lock:
            extraction_list.append(combined_row)


In [29]:
withoutCompany_df = pd.read_excel("News_output\\News_training_domain_set_withoutCompany.xlsx")

In [32]:
# Create a global lock to ensure thread safety
lock = threading.Lock()

# Split data into chunks
num_threads = 20  # Specify the number of threads
chunk_size = len(withoutCompany_df) // num_threads

threads = []
extraction_list = []
print("Processing data with {} threads...".format(num_threads))

for i in range(num_threads):
    start = i * chunk_size
    end = (i + 1) * chunk_size if i < num_threads - 1 else len(withoutCompany_df)
    thread = threading.Thread(target=extraction_data, args=(start, end, withoutCompany_df, a_share_trie, extraction_list))
    threads.append(thread)
    print("Thread {} is processing rows {} to {}...".format(i + 1, start, end))
    
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()

print("All threads have finished processing.\n")

# Merge the results
extraction_df = pd.concat([pd.DataFrame(extraction_list)])


Processing data with 20 threads...
Thread 1 is processing rows 0 to 23762...
Thread 2 is processing rows 23762 to 47524...
Thread 3 is processing rows 47524 to 71286...
Thread 4 is processing rows 71286 to 95048...
Thread 5 is processing rows 95048 to 118810...
Thread 6 is processing rows 118810 to 142572...
Thread 7 is processing rows 142572 to 166334...
Thread 8 is processing rows 166334 to 190096...
Thread 9 is processing rows 190096 to 213858...
Thread 10 is processing rows 213858 to 237620...
Thread 11 is processing rows 237620 to 261382...
Thread 12 is processing rows 261382 to 285144...
Thread 13 is processing rows 285144 to 308906...
Thread 14 is processing rows 308906 to 332668...
Thread 15 is processing rows 332668 to 356430...
Thread 16 is processing rows 356430 to 380192...
Thread 17 is processing rows 380192 to 403954...
Thread 18 is processing rows 403954 to 427716...
Thread 19 is processing rows 427716 to 451478...
Thread 20 is processing rows 451478 to 475242...
--- 0 r

--- 191100 rows have been done ---
--- 167500 rows have been done ---
--- 404900 rows have been done ---
--- 286100 rows have been done ---
--- 49000 rows have been done ---
--- 1300 rows have been done ---
--- 357400 rows have been done ---
--- 25100 rows have been done ---
--- 428700 rows have been done ---
--- 120000 rows have been done ---
--- 262400 rows have been done ---
--- 309900 rows have been done ---
--- 72500 rows have been done ---
--- 381200 rows have been done ---
--- 214900 rows have been done ------ 238700 rows have been done ---

--- 452500 rows have been done ---
--- 143800 rows have been done ---
--- 96300 rows have been done ---
--- 333700 rows have been done ---
--- 191200 rows have been done ---
--- 167600 rows have been done ---
--- 405000 rows have been done ---
--- 286200 rows have been done ---
--- 49100 rows have been done ---
--- 1400 rows have been done ---
--- 357500 rows have been done ---
--- 25200 rows have been done ---
--- 428800 rows have been done

--- 334800 rows have been done ---
--- 192300 rows have been done ---
--- 168700 rows have been done ---
--- 406100 rows have been done ---
--- 287300 rows have been done ---
--- 2500 rows have been done ---
--- 50200 rows have been done ---
--- 358600 rows have been done ---
--- 429900 rows have been done ---
--- 26300 rows have been done ---
--- 121200 rows have been done ---
--- 263600 rows have been done ---
--- 311100 rows have been done ---
--- 73700 rows have been done ---
--- 382400 rows have been done ---
--- 239900 rows have been done ---
--- 216100 rows have been done ---
--- 453700 rows have been done ---
--- 145000 rows have been done ---
--- 97500 rows have been done ---
--- 334900 rows have been done ---
--- 192400 rows have been done ---
--- 168800 rows have been done ---
--- 406200 rows have been done ---
--- 287400 rows have been done ---
--- 2600 rows have been done ---
--- 50300 rows have been done ---
--- 358700 rows have been done ---
--- 430000 rows have been don

--- 336000 rows have been done ---
--- 193500 rows have been done ---
--- 169900 rows have been done ---
--- 407300 rows have been done ---
--- 288500 rows have been done ---
--- 51400 rows have been done ---
--- 3700 rows have been done ---
--- 359800 rows have been done ---
--- 431100 rows have been done ---
--- 27500 rows have been done ---
--- 122400 rows have been done ---
--- 312300 rows have been done ---
--- 264800 rows have been done ---
--- 74900 rows have been done ---
--- 383600 rows have been done ---
--- 241100 rows have been done ------ 217300 rows have been done ---

--- 146200 rows have been done ---
--- 454900 rows have been done ---
--- 98700 rows have been done ---
--- 336100 rows have been done ---
--- 193600 rows have been done ---
--- 170000 rows have been done ---
--- 407400 rows have been done ---
--- 288600 rows have been done ---
--- 51500 rows have been done ---
--- 3800 rows have been done ---
--- 359900 rows have been done ---
--- 431200 rows have been don

--- 337200 rows have been done ---
--- 194700 rows have been done ---
--- 171100 rows have been done ---
--- 408500 rows have been done ---
--- 289700 rows have been done ---
--- 52600 rows have been done ---
--- 4900 rows have been done ---
--- 361000 rows have been done ---
--- 432300 rows have been done ------ 28700 rows have been done ---

--- 123600 rows have been done ---
--- 313500 rows have been done ---
--- 266000 rows have been done ---
--- 76100 rows have been done ---
--- 384800 rows have been done ---
--- 218500 rows have been done ---
--- 242300 rows have been done ---
--- 147400 rows have been done ---
--- 456100 rows have been done ---
--- 99900 rows have been done ---
--- 337300 rows have been done ---
--- 194800 rows have been done ---
--- 171200 rows have been done ---
--- 408600 rows have been done ---
--- 52700 rows have been done ---
--- 289800 rows have been done ---
--- 5000 rows have been done ---
--- 361100 rows have been done ---
--- 28800 rows have been done

--- 338400 rows have been done ---
--- 195900 rows have been done ---
--- 172300 rows have been done ---
--- 409700 rows have been done ---
--- 53800 rows have been done ------ 290900 rows have been done ---

--- 6100 rows have been done ---
--- 362200 rows have been done ---
--- 433500 rows have been done ------ 29900 rows have been done ---

--- 124800 rows have been done ---
--- 314700 rows have been done ---
--- 267200 rows have been done ---
--- 77300 rows have been done ---
--- 386000 rows have been done ---
--- 243500 rows have been done ---
--- 219700 rows have been done ---
--- 148600 rows have been done ---
--- 457300 rows have been done ---
--- 101100 rows have been done ---
--- 338500 rows have been done ---
--- 196000 rows have been done ---
--- 172400 rows have been done ---
--- 409800 rows have been done ---
--- 291000 rows have been done ------ 53900 rows have been done ---

--- 6200 rows have been done ---
--- 362300 rows have been done ---
--- 30000 rows have been don

--- 339600 rows have been done ---
--- 197100 rows have been done ---
--- 173500 rows have been done ---
--- 410900 rows have been done ---
--- 55000 rows have been done ------ 292100 rows have been done ---

--- 7300 rows have been done ---
--- 363400 rows have been done ---
--- 434700 rows have been done ------ 31100 rows have been done ---

--- 126000 rows have been done ---
--- 315900 rows have been done ---
--- 268400 rows have been done ---
--- 78500 rows have been done ---
--- 387200 rows have been done ---
--- 220900 rows have been done ---
--- 244700 rows have been done ---
--- 149800 rows have been done ---
--- 458500 rows have been done ---
--- 102300 rows have been done ---
--- 339700 rows have been done ---
--- 197200 rows have been done ---
--- 173600 rows have been done ---
--- 411000 rows have been done ---
--- 292200 rows have been done ------ 55100 rows have been done ---

--- 7400 rows have been done ---
--- 363500 rows have been done ---
--- 31200 rows have been don

--- 459600 rows have been done ---
--- 103400 rows have been done ---
--- 340800 rows have been done ---
--- 198300 rows have been done ---
--- 412100 rows have been done ---
--- 174700 rows have been done ---
--- 293300 rows have been done ------ 56200 rows have been done ---
--- 8500 rows have been done ---

--- 364600 rows have been done ---
--- 435900 rows have been done ------ 32300 rows have been done ---

--- 127200 rows have been done ---
--- 317100 rows have been done ---
--- 269600 rows have been done ---
--- 79700 rows have been done ---
--- 388400 rows have been done ---
--- 245900 rows have been done ---
--- 222100 rows have been done ---
--- 151000 rows have been done ---
--- 459700 rows have been done ---
--- 103500 rows have been done ---
--- 340900 rows have been done ---
--- 198400 rows have been done ---
--- 412200 rows have been done ---
--- 174800 rows have been done ---
--- 56300 rows have been done ------ 8600 rows have been done ---
--- 293400 rows have been don

--- 460800 rows have been done ------ 104600 rows have been done ---

--- 342000 rows have been done ---
--- 199500 rows have been done ---
--- 413300 rows have been done ---
--- 175900 rows have been done ---
--- 9700 rows have been done ---
--- 57400 rows have been done ---
--- 294500 rows have been done ---
--- 365800 rows have been done ---
--- 437100 rows have been done ------ 33500 rows have been done ---

--- 128400 rows have been done ---
--- 318300 rows have been done ---
--- 270800 rows have been done ---
--- 80900 rows have been done ---
--- 389600 rows have been done ---
--- 247100 rows have been done ---
--- 152200 rows have been done ---
--- 223300 rows have been done ---
--- 104700 rows have been done ---
--- 460900 rows have been done ---
--- 342100 rows have been done ---
--- 199600 rows have been done ---
--- 413400 rows have been done ---
--- 176000 rows have been done ---
--- 9800 rows have been done ---
--- 57500 rows have been done ---
--- 294600 rows have been do

--- 248200 rows have been done ---
--- 224400 rows have been done ---
--- 105800 rows have been done ---
--- 462000 rows have been done ---
--- 343200 rows have been done ---
--- 200700 rows have been done ---
--- 414500 rows have been done ------ 177100 rows have been done ---

--- 58600 rows have been done ---
--- 295700 rows have been done ---
--- 10900 rows have been done ---
--- 367000 rows have been done ---
--- 129600 rows have been done ---
--- 438300 rows have been done ---
--- 34700 rows have been done ---
--- 319500 rows have been done ---
--- 272000 rows have been done ---
--- 82100 rows have been done ---
--- 390800 rows have been done ---
--- 153400 rows have been done ---
--- 248300 rows have been done ---
--- 224500 rows have been done ---
--- 105900 rows have been done ---
--- 462100 rows have been done ---
--- 343300 rows have been done ---
--- 200800 rows have been done ---
--- 177200 rows have been done ---
--- 414600 rows have been done ---
--- 58700 rows have been

--- 391900 rows have been done ---
--- 154500 rows have been done ---
--- 249400 rows have been done ---
--- 225600 rows have been done ---
--- 463200 rows have been done ---
--- 107000 rows have been done ---
--- 344400 rows have been done ---
--- 201900 rows have been done ---
--- 178300 rows have been done ---
--- 415700 rows have been done ---
--- 296900 rows have been done ------ 12100 rows have been done ---

--- 59800 rows have been done ---
--- 368200 rows have been done ---
--- 439500 rows have been done ---
--- 320700 rows have been done ---
--- 35900 rows have been done ---
--- 130800 rows have been done ---
--- 273200 rows have been done ---
--- 83300 rows have been done ---
--- 392000 rows have been done ---
--- 154600 rows have been done ---
--- 249500 rows have been done ------ 225700 rows have been done ---

--- 463300 rows have been done ---
--- 107100 rows have been done ---
--- 344500 rows have been done ---
--- 202000 rows have been done ---
--- 178400 rows have bee

--- 393100 rows have been done ---
--- 155700 rows have been done ---
--- 250600 rows have been done ---
--- 226800 rows have been done ---
--- 464400 rows have been done ---
--- 108200 rows have been done ---
--- 345600 rows have been done ---
--- 203100 rows have been done ---
--- 179500 rows have been done ---
--- 416900 rows have been done ---
--- 61000 rows have been done ------ 298100 rows have been done ---
--- 13300 rows have been done ---

--- 369400 rows have been done ---
--- 321900 rows have been done ------ 37100 rows have been done ---
--- 132000 rows have been done ---

--- 440700 rows have been done ---
--- 274400 rows have been done ---
--- 84500 rows have been done ---
--- 393200 rows have been done ---
--- 155800 rows have been done ---
--- 250700 rows have been done ---
--- 226900 rows have been done ---
--- 464500 rows have been done ---
--- 108300 rows have been done ---
--- 345700 rows have been done ---
--- 203200 rows have been done ---
--- 179600 rows have bee

--- 85600 rows have been done ---
--- 394300 rows have been done ---
--- 156900 rows have been done ---
--- 228000 rows have been done ---
--- 251800 rows have been done ---
--- 465600 rows have been done ---
--- 109400 rows have been done ---
--- 346800 rows have been done ---
--- 204300 rows have been done ---
--- 180700 rows have been done ---
--- 418100 rows have been done ---
--- 299300 rows have been done ------ 62200 rows have been done ---
--- 14500 rows have been done ---

--- 370600 rows have been done ---
--- 323100 rows have been done ------ 441900 rows have been done ---
--- 133200 rows have been done ---

--- 38300 rows have been done ---
--- 275600 rows have been done ---
--- 85700 rows have been done ---
--- 394400 rows have been done ---
--- 157000 rows have been done ---
--- 228100 rows have been done ------ 251900 rows have been done ---
--- 465700 rows have been done ---

--- 109500 rows have been done ---
--- 346900 rows have been done ---
--- 204400 rows have been

--- 86800 rows have been done ---
--- 395500 rows have been done ---
--- 158100 rows have been done ---
--- 253000 rows have been done ------ 229200 rows have been done ---
--- 466800 rows have been done ---

--- 110600 rows have been done ---
--- 348000 rows have been done ---
--- 205500 rows have been done ---
--- 181900 rows have been done ---
--- 419300 rows have been done ---
--- 15700 rows have been done ------ 63400 rows have been done ---
--- 300500 rows have been done ---

--- 371800 rows have been done ---
--- 443100 rows have been done ---
--- 39500 rows have been done ---
--- 134400 rows have been done ---
--- 324300 rows have been done ---
--- 276800 rows have been done ---
--- 86900 rows have been done ---
--- 395600 rows have been done ---
--- 158200 rows have been done ---
--- 229300 rows have been done ---
--- 466900 rows have been done ---
--- 253100 rows have been done ---
--- 110700 rows have been done ---
--- 348100 rows have been done ---
--- 205600 rows have been

--- 88000 rows have been done ---
--- 396700 rows have been done ---
--- 159300 rows have been done ---
--- 254200 rows have been done ---
--- 468000 rows have been done ---
--- 230400 rows have been done ---
--- 111800 rows have been done ---
--- 349200 rows have been done ---
--- 420500 rows have been done ------ 206700 rows have been done ---
--- 183100 rows have been done ---

--- 16900 rows have been done ---
--- 64600 rows have been done ---
--- 301700 rows have been done ---
--- 373000 rows have been done ---
--- 444300 rows have been done ------ 135600 rows have been done ---
--- 40700 rows have been done ---

--- 325500 rows have been done ---
--- 278000 rows have been done ---
--- 88100 rows have been done ---
--- 396800 rows have been done ---
--- 159400 rows have been done ---
--- 254300 rows have been done ---
--- 468100 rows have been done ---
--- 230500 rows have been done ---
--- 111900 rows have been done ---
--- 349300 rows have been done ---
--- 206800 rows have been

--- 89200 rows have been done ---
--- 397900 rows have been done ---
--- 160500 rows have been done ---
--- 255400 rows have been done ---
--- 469200 rows have been done ---
--- 231600 rows have been done ---
--- 113000 rows have been done ---
--- 350400 rows have been done ---
--- 184300 rows have been done ---
--- 207900 rows have been done ---
--- 421700 rows have been done ---
--- 18100 rows have been done ---
--- 65800 rows have been done ---
--- 302900 rows have been done ---
--- 374200 rows have been done ---
--- 41900 rows have been done ---
--- 326700 rows have been done ---
--- 136800 rows have been done ---
--- 445500 rows have been done ---
--- 279200 rows have been done ---
--- 89300 rows have been done ---
--- 398000 rows have been done ---
--- 160600 rows have been done ---
--- 255500 rows have been done ---
--- 469300 rows have been done ---
--- 231700 rows have been done ---
--- 113100 rows have been done ---
--- 350500 rows have been done ---
--- 184400 rows have been

--- 90400 rows have been done ---
--- 399100 rows have been done ---
--- 161700 rows have been done ---
--- 256600 rows have been done ---
--- 470400 rows have been done ---
--- 232800 rows have been done ---
--- 114200 rows have been done ---
--- 351600 rows have been done ---
--- 185500 rows have been done ---
--- 209100 rows have been done ---
--- 422900 rows have been done ---
--- 19300 rows have been done ------ 304100 rows have been done ---
--- 67000 rows have been done ---

--- 375400 rows have been done ---
--- 138000 rows have been done ------ 43100 rows have been done ---
--- 327900 rows have been done ---
--- 446700 rows have been done ---

--- 280400 rows have been done ---
--- 90500 rows have been done ---
--- 399200 rows have been done ---
--- 161800 rows have been done ---
--- 256700 rows have been done ---
--- 470500 rows have been done ---
--- 232900 rows have been done ---
--- 114300 rows have been done ---
--- 351700 rows have been done ---
--- 185600 rows have been

--- 91600 rows have been done ---
--- 400300 rows have been done ---
--- 162900 rows have been done ---
--- 257800 rows have been done ---
--- 471600 rows have been done ---
--- 234000 rows have been done ---
--- 115400 rows have been done ---
--- 352800 rows have been done ---
--- 210300 rows have been done ---
--- 186700 rows have been done ---
--- 424100 rows have been done ---
--- 20500 rows have been done ------ 305300 rows have been done ---
--- 68200 rows have been done ---

--- 376600 rows have been done ---
--- 139200 rows have been done ------ 447900 rows have been done ---

--- 44300 rows have been done ---
--- 329100 rows have been done ---
--- 281600 rows have been done ---
--- 91700 rows have been done ---
--- 400400 rows have been done ---
--- 163000 rows have been done ---
--- 257900 rows have been done ---
--- 471700 rows have been done ---
--- 234100 rows have been done ---
--- 115500 rows have been done ---
--- 352900 rows have been done ---
--- 210400 rows have been

--- 92800 rows have been done ---
--- 401500 rows have been done ---
--- 164100 rows have been done ---
--- 259000 rows have been done ---
--- 472800 rows have been done ---
--- 235200 rows have been done ---
--- 116600 rows have been done ---
--- 354000 rows have been done ---
--- 211500 rows have been done ---
--- 187900 rows have been done ---
--- 425300 rows have been done ---
--- 306500 rows have been done ------ 69400 rows have been done ---
--- 21700 rows have been done ---

--- 377800 rows have been done ---
--- 330300 rows have been done ---
--- 449100 rows have been done ---
--- 45500 rows have been done ---
--- 140400 rows have been done ---
--- 282800 rows have been done ---
--- 92900 rows have been done ---
--- 401600 rows have been done ---
--- 164200 rows have been done ---
--- 259100 rows have been done ---
--- 472900 rows have been done ---
--- 235300 rows have been done ---
--- 116700 rows have been done ---
--- 354100 rows have been done ---
--- 211600 rows have been

--- 94000 rows have been done ------ 402700 rows have been done ---

--- 165300 rows have been done ---
--- 260200 rows have been done ---
--- 474000 rows have been done ---
--- 236400 rows have been done ---
--- 117800 rows have been done ---
--- 355200 rows have been done ---
--- 212700 rows have been done ---
--- 189100 rows have been done ---
--- 426500 rows have been done ---
--- 307700 rows have been done ------ 70600 rows have been done ---
--- 22900 rows have been done ---

--- 379000 rows have been done ---
--- 331500 rows have been done ------ 46700 rows have been done ---
--- 450300 rows have been done ---

--- 141600 rows have been done ---
--- 284000 rows have been done ---
--- 402800 rows have been done ------ 94100 rows have been done ---

--- 165400 rows have been done ---
--- 260300 rows have been done ---
--- 474100 rows have been done ---
--- 236500 rows have been done ---
--- 117900 rows have been done ---
--- 355300 rows have been done ---
--- 212800 rows have been

In [33]:
extraction_df.to_excel("News_output\\News_training_domain_set_withCompany.xlsx", index=False)

## 2. Search by Rules of Similarity-judged Trigger Words for Application Domain Set

### 2.1. Abbreviation Generation

In [97]:
import json
import jieba
import re

# Reading the JSON File containing "A-share Company Information"
with open("News_input\\A_share_list\\A_share_list.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)

# String Processing and Abbreviation Generation for All the Company Names
for company in a_share_list:
    # 1. Simplifying the full name
    fullname = company["fullname"]
    if "股份有限公司" in fullname:
        partname = fullname.replace("股份有限公司", "")
    else:
        partname = fullname
    company["partname"] = partname
    
    # 2. Generating Nickname (abbreviation)
    partname = re.sub(r'[(（][^)）]+[)）]', '', partname)
    if len(company["partname"]) <= 3: 
        company["abbreviation"] = company["partname"]
    else:  # Generating Nickname using JIEBA Tokenization when the simplified full name is not simple enough
        words = jieba.lcut(partname)  
        name = re.sub(r'[a-zA-Z@#$%^&*()！~\[\]{};:,.<>?/\\|]', '', company["name"]) 
        if "退市" in name:
            name = name.replace("退市", "")
        if len(words) == 1:
            # If there's only one word, keep abbreviation consistent with the partname
            company["abbreviation"] = partname
        elif len(words[0]) >= 2 and words[0][0] != name[0]:
            # If the first word has at least 2 characters and doesn't match the first letter of the name, remove it
            company["abbreviation"] = "".join(words[1:])  
        elif len(words[0]) == 1 and words[0][0] != name[0]:
            # If the first word has 1 character and doesn't match the first two letters of the name, remove it
            company["abbreviation"] = "".join(words[2:])
        else:
            # Otherwise, assuming all words are keywords, keep the partname unchanged
            partname = "".join(words)
            company["abbreviation"] = partname
    
    # Removing words like "集团", "控股", "科技" from the abbreviation if present
    del_words = ["集团","控股","科技"]
    for del_word in del_words:
        if del_word in company["abbreviation"]:
            abbreviation_correct = company["abbreviation"].replace(del_word, "")
            if len(abbreviation_correct) > 2:
                # If it's less than or equal to 2 characters, don't remove; otherwise, remove
                company["abbreviation"] = abbreviation_correct
                
    # Handling cautiously when the length of abbreviation is less than or equal to 3 resembling common words like "雪人"
    if len(company["abbreviation"]) <= 3:
        # Removing English and special characters from name, removing "股份" from name
        if not re.search(r'[a-zA-Z]', partname):
            name_correct = re.sub(r'[a-zA-Z@#$%^&*()！~\[\]{};:,.<>?/\\|]', '', company["name"])  
        if "股份" in name_correct:
            name_correct = name_correct.replace("股份", "")
        # If name is not contained in abbreviation, be cautious
        if name_correct not in company["abbreviation"]:
            company["abbreviation"] = name_correct
    
    # Outputting the construction of this dictionary entry
    print(f'{company["name"]}: from {company["partname"]} to {company["abbreviation"]}')

# Creating a new JSON file and writing the updated data into it
with open("News_input\\A_share_list\\A_share_list_with_abbreviation.json", "w", encoding="utf-8") as new_file:
    json.dump(a_share_list, new_file, ensure_ascii=False, indent=2)

print("New JSON file created with 'partname' and 'abbreviation' fields.")


邵阳液压:from 邵阳维克液压to邵阳维克液压
同益中:from 北京同益中新材料科技to同益中新材料
华瓷股份:from 湖南华联瓷业to华联瓷业
鸿富瀚:from 深圳市鸿富瀚科技to鸿富瀚
高铁电气:from 中铁高铁电气装备to高铁电气装备
严牌股份:from 浙江严牌过滤技术to严牌过滤技术
百胜智能:from 江西百胜智能科技to百胜智能
青岛食品:from 青岛食品to青岛食品
德昌股份:from 宁波德昌电机to德昌电机
中自科技:from 中自环保科技to中自环保
富吉瑞:from 北京富吉瑞光电科技to富吉瑞光电
新瀚新材:from 江苏新瀚新材料to新瀚新材料
春雪食品:from 春雪食品集团to春雪食品
孩子王:from 孩子王儿童用品to孩子王儿童用品
丽臣实业:from 湖南丽臣实业to丽臣实业
珠海冠宇:from 珠海冠宇电池to珠海冠宇电池
百普赛斯:from 北京百普赛斯生物科技to百普赛斯生物
多瑞医药:from 西藏多瑞医药to多瑞医药
亚康股份:from 北京亚康万玮信息技术to亚康万玮信息技术
凯盛新材:from 山东凯盛新材料to凯盛新材料
中国能建:from 中国能源建设to中国能源建设
大地海洋:from 杭州大地海洋环保to大地海洋环保
华尔泰:from 安徽华尔泰化工to华尔泰化工
中捷精工:from 江苏中捷精工科技to中捷精工
星华反光:from 杭州星华反光材料to星华反光材料
君亭酒店:from 浙江君亭酒店管理to君亭酒店管理
纽威数控:from 纽威数控装备（苏州）to纽威数控装备
上海港湾:from 上海港湾基础建设（集团）to上海港湾基础建设
显盈科技:from 深圳市显盈科技to显盈科技
万事利:from 杭州万事利丝绸文化to万事利丝绸文化
开勒股份:from 开勒环境科技（上海）to开勒环境
力量钻石:from 河南省力量钻石to力量钻石
海锅股份:from 张家港海锅新能源装备to海锅新能源装备
金三江:from 金三江（肇庆）硅材料to金三江硅材料
兰卫医学:from 上海兰卫医学检验所to兰卫医学检验所
匠心家居:from 常州匠心独具智能家居to匠心独具智能家居
禾信仪器:from 广州禾信仪器to禾信仪器
振华新材:from 贵州振华新材料to振华新材料
本立科技:from 浙江本

New JSON file created with 'partname' and 'abbreviation' fields.


### 2.2. Abbreviation Quality Assessment

#### 1) Quality Assessment with Normal condition

In [6]:
# Create the Trigger Words Set of A_share Company name
with open("News_input\\A_share_list\\A_share_list_with_abbreviation.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)
LEN2_abbreviation = []
LEN3_abbreviation = []
LEN4_abbreviation = []
LEN5_abbreviation = []
LEN6_abbreviation = []
LEN7_abbreviation = []
LEN8plus_abbreviation = []

for company in a_share_list:
    if len(company['abbreviation'])==2:
        LEN2_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])==3:
        LEN3_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])==4:
        LEN4_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])==5:
        LEN5_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])==6:
        LEN6_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])==7:
        LEN7_abbreviation.append(company['abbreviation'])
    if len(company['abbreviation'])>=8:
        LEN8plus_abbreviation.append(company['abbreviation'])

        
print(f'Length 2 abbreviation: {len(LEN2_abbreviation)}')
print(f'Length 3 abbreviation: {len(LEN3_abbreviation)}')
print(f'Length 4 abbreviation: {len(LEN4_abbreviation)}')
print(f'Length 5 abbreviation: {len(LEN5_abbreviation)}')
print(f'Length 6 abbreviation: {len(LEN6_abbreviation)}')
print(f'Length 7 abbreviation: {len(LEN7_abbreviation)}')
print(f'Length greater than or equal to 8 abbreviation: {len(LEN8plus_abbreviation)}')

# Manually filter common words that are still in aliases, common words are generally two characters long:
print("\nCommon words are generally two characters long, abbreviation with two characters:")
print(LEN2_abbreviation)

Length 2 abbreviation: 120
Length 3 abbreviation: 192
Length 4 abbreviation: 2405
Length 5 abbreviation: 438
Length 6 abbreviation: 898
Length 7 abbreviation: 247
Length greater than or equal to 8 abbreviation: 354

Common words are generally two characters long, abbreviation with two characters:
['久祺', '新柴', '爱慕', '恒帅', '泰坦', '欣贺', '美畅', '久量', '博深', '舜喆', '中潜', '仙鹤', '威华', '综艺', '大港', '万马', '跃岭', '凯马', '天宸', '拓邦', '雪人', '红相', '开润', '大业', '软控', '华纺', '瀚叶', '金鹰', '仁智', '海亮', '巨化', '闰土', '航民', '兰生', '丰华', '龙头', '申能', '朗姿', '华业', '荣联', '沙钢', '康盛', '龙昌', '哈慈', '龙涤', '瑞德', '锌业', '聚龙', '振静', '尚纬', '川润', '华升', '猴王', '大洲', '九有', '宏业', '中浩', '高乐', '融捷', '兆驰', '兆驰', '特力', '林海', '恒宝', '永鼎', '起步', '信联', '万盛', '银江', '慈星', '海立', '中路', '紫光', '鞍钢', '大商', '辽港', '嘉陵', '锡业', '宏达', '通威', '银亿', '节能', '诚志', '赣能', '光电', '椰岛', '润建', '福达', '华帝', '青松', '安妮', '建发', '鸿博', '永新', '朗源', '仙坛', '东港', '丽鹏', '汉缆', '地矿', '九阳', '万润', '歌尔', '胜利', '联发', '弘业', '杉杉', '波导', '永高', '金固', '华鑫', '申达', '亚通', '世茂', '三孚', '坊展', '河钢',

In [17]:
# Observe and obtain common words, manually remove
general_words_list = ['爱慕', '仙鹤', '综艺', '雪人', '大洲', '起步', '锡业', '光电', '地矿', '胜利']
print(general_words_list)

['爱慕', '仙鹤', '综艺', '雪人', '大洲', '起步', '锡业', '光电', '地矿', '胜利']


In [23]:
def is_high_quality_with_normal_condition(company_name,company_abbreviation,general_words_list): 
    normal_condition_1 = company_abbreviation not in general_words_list # 如果abbreviation是常用词，则不对本条生成别名对应字典条目
    normal_condition_2 = company_name not in company_abbreviation # 如果name包含在abbreviateviation里面，则不对本条生成别名对应字典条目
    normal_condition_3 = "退市" not in company_name # 如果name里有退市，则不对本条生成别名对应字典条目
    return normal_condition_1 and normal_condition_2 and normal_condition_3

#### 2) Quality Assessment Using Similarity

In [7]:
# Create the Trigger Words Set of A_share Company name
with open("News_input\\A_share_list\\A_share_list_with_abbreviation.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)

In [8]:
company_names = [company["name"] for company in a_share_list]

In [9]:
company_abbreviation = [company["abbreviation"] for company in a_share_list]

In [11]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
from tqdm import tqdm

# Load BERT model and tokenizer
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Convert stock names and abbreviations to embeddings
company_names = [company["name"] for company in a_share_list]
company_abbreviations = [company["abbreviation"] for company in a_share_list]

# Define a function to compute embeddings
def get_embedding(text):
    tokens = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(tokens)
        last_hidden_states = outputs.last_hidden_state
    return last_hidden_states.mean(dim=1).squeeze().numpy()

# Obtain embeddings for stock names and abbreviations
print("Name Embeddings Generation Begins")
name_embeddings = []
for i in tqdm(range(len(company_names))):
    name_embedding_i = get_embedding(company_names[i])
    name_embeddings.append(name_embedding_i)

print("Abbreviation Embeddings Generation Begins")
abbreviation_embeddings = []
for i in tqdm(range(len(company_names))):
    abbreviations_embedding_i = get_embedding(company_abbreviations[i])
    abbreviation_embeddings.append(abbreviations_embedding_i)

Name Embeddings Generation Begins


100%|██████████████████████████████████████████████████████████████████████████████| 4654/4654 [02:40<00:00, 28.96it/s]


Abbreviation Embeddings Generation Begins


100%|██████████████████████████████████████████████████████████████████████████████| 4654/4654 [02:56<00:00, 26.37it/s]


In [21]:
# Calculate similarity
similarity_dict = {}
for i, name_embedding in enumerate(name_embeddings):
    abbr_embedding = abbreviation_embeddings[i]
    # Similarity Calculation
    similarity = cosine_similarity([name_embedding], [abbr_embedding])[0][0]
    similarity_dict[company_names[i]] = similarity

In [13]:
def is_high_quality_with_similarity_condition(company_name,similarity_dict): 
    similarity_dict[company_name] > 0.7
    return True

#### 3) Generate High-Quality Abbreviation Using the Above Two Judgment Methods

In [24]:
import json

# Read in the JSON File of "A_share Company Information"
with open("News_input\\A_share_list\\A_share_list_with_abbreviation.json", "r", encoding="utf-8") as file:
    a_share_list = json.load(file)

# Generate the second lecel filter dictionary
company_dict = {}
for company in a_share_list:
    company_name = company["name"]
    company_abbreviation = company["abbreviation"]
    if is_high_quality_with_similarity_condition(company_name,similarity_dict) and is_high_quality_with_normal_condition(company_name,company_abbreviation,general_words_list):
        company_dict[company_name] = company_abbreviation

In [25]:
print(f"Size of the second lecel filter dictionary is {len(company_dict)}")

Size of the second lecel filter dictionary is 1574


In [26]:
# Prevent duplicates
def find_duplicate_companies(company_dict):
    seen = set()
    duplicate_companies = set()
    for company_name, company_abbreviation in company_dict.items():
        if company_name in seen:
            duplicate_companies.add(company_name)
        else:
            seen.add(company_name)

    return duplicate_companies

duplicate_companies = find_duplicate_companies(company_dict)
print("Number of duplicate company names:", len(duplicate_companies))

Number of duplicate company names: 0


In [112]:
# Invert the dictionary for easier mapping to display explicit_company
inverted_company_dict = {v: k for k, v in company_dict.items()}

### 2.3. Second Level filter

In [111]:
company_set = set()
for company_name, company_abbreviation in company_dict.items():
    company_set.add(company_abbreviation)

print("The number of company name: {}".format(len(company_set)))
print("The number of company name to be searched: {}".format(len(company_set)))

The number of company name: 1568
The number of company name to be searched: 1568


In [114]:
from concurrent.futures import ThreadPoolExecutor
# from tqdm import tqdm
import pandas as pd
import threading

# Create a global lock for ensuring thread safety
lock = threading.Lock()

# Function to process a row of data
def process_row(row, company_set, result_list, drop_list):
    title = row["Title"]
    news_content = row["NewsContent"]
   
    if pd.notna(title) and pd.notna(news_content):      
        news = title + news_content
        if any(company in news for company in company_set):
            # Combine the four columns of this row into one
            combined_row = {
                "NewsID": row["NewsID"],
                "Title": title,
                "NewsContent": news_content,
                "NewsSource": row["NewsSource"],
            }
            with lock:
                result_list.append(combined_row)  # Add rows meeting the condition to the result list
        else:
            # Combine the four columns of this row into one
            combined_row = {
                "NewsID": row["NewsID"],
                "Title": title,
                "NewsContent": news_content,
                "NewsSource": row["NewsSource"]
            }
            with lock:
                drop_list.append(combined_row)  # Add rows meeting the condition to the result list

# Main processing function
def process_data(start, end, full_data, company_set, result_list, drop_list):
    for i in range(start, end):
        if i % 1000 == 0:
            print("{} rows have been done".format(i))
            print("--- ")
        process_row(full_data.iloc[i], company_set, result_list, drop_list)

In [115]:
# read Excel file News_2nd_filting_set
full_data = pd.read_excel("News_input\\News_2nd_filting_set.xlsx")

In [116]:
# Splitting the data into chunks
num_threads = 21  # Specifying the number of threads
chunk_size = len(full_data) // num_threads

threads = []
result_list = []
drop_list = []
print("Processing data with {} threads...".format(num_threads))

for i in range(num_threads):
    start = i * chunk_size
    end = (i + 1) * chunk_size if i < num_threads - 1 else len(full_data)
    thread = threading.Thread(target=process_data, args=(start, end, full_data, company_set, result_list, drop_list))
    threads.append(thread)
    print("Thread {} is processing rows {} to {}...".format(i + 1, start, end))
    
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()

print("All threads have finished processing.\n")

print("Before selection by Rule, number of news is: {}".format(len(full_data)))
print("After selection by Rule, number of news is: {}".format(len(result_list)))
print("Dropped by Rule, number of news is: {}".format(len(drop_list)))

# Merging the results
result_df = pd.concat([pd.DataFrame(result_list)])
drop_df = pd.concat([pd.DataFrame(drop_list)])

Processing data with 21 threads...
Thread 1 is processing rows 0 to 26732...
Thread 2 is processing rows 26732 to 53464...
Thread 3 is processing rows 53464 to 80196...
Thread 4 is processing rows 80196 to 106928...
Thread 5 is processing rows 106928 to 133660...
Thread 6 is processing rows 133660 to 160392...
Thread 7 is processing rows 160392 to 187124...
Thread 8 is processing rows 187124 to 213856...
Thread 9 is processing rows 213856 to 240588...
Thread 10 is processing rows 240588 to 267320...
Thread 11 is processing rows 267320 to 294052...
Thread 12 is processing rows 294052 to 320784...
Thread 13 is processing rows 320784 to 347516...
Thread 14 is processing rows 347516 to 374248...
Thread 15 is processing rows 374248 to 400980...
Thread 16 is processing rows 400980 to 427712...
Thread 17 is processing rows 427712 to 454444...
Thread 18 is processing rows 454444 to 481176...
Thread 19 is processing rows 481176 to 507908...
Thread 20 is processing rows 507908 to 534640...
Threa

171000 rows have been done
--- 
518000 rows have been done
--- 
65000 rows have been done
--- 
438000 rows have been done
--- 
278000 rows have been done
--- 
545000 rows have been done
--- 
119000 rows have been done
--- 
38000 rows have been done
--- 
199000 rows have been done
--- 
358000 rows have been done
--- 
305000 rows have been done
--- 
465000 rows have been done
--- 
145000 rows have been done
--- 
385000 rows have been done
--- 
492000 rows have been done
--- 
332000 rows have been done
--- 
226000 rows have been done
--- 
93000 rows have been done
--- 
252000 rows have been done
--- 
412000 rows have been done
--- 
13000 rows have been done
--- 
172000 rows have been done
--- 
519000 rows have been done
--- 
66000 rows have been done
--- 
439000 rows have been done
--- 
279000 rows have been done
--- 
546000 rows have been done
--- 
120000 rows have been done
--- 
39000 rows have been done
--- 
200000 rows have been done
--- 
359000 rows have been done
--- 
306000 rows ha

131000 rows have been done
--- 
50000 rows have been done
--- 
211000 rows have been done
--- 
370000 rows have been done
--- 
317000 rows have been done
--- 
477000 rows have been done
--- 
157000 rows have been done
--- 
397000 rows have been done
--- 
504000 rows have been done
--- 
344000 rows have been done
--- 
238000 rows have been done
--- 
105000 rows have been done
--- 
264000 rows have been done
--- 
424000 rows have been done
--- 
25000 rows have been done
--- 
184000 rows have been done
--- 
531000 rows have been done
--- 
78000 rows have been done
--- 
451000 rows have been done
--- 
291000 rows have been done
--- 
558000 rows have been done
--- 
132000 rows have been done
--- 
51000 rows have been done
--- 
212000 rows have been done
--- 
371000 rows have been done
--- 
318000 rows have been done
--- 
478000 rows have been done
--- 
158000 rows have been done
--- 
398000 rows have been done
--- 
345000 rows have been done
--- 
505000 rows have been done
--- 
239000 rows 

In [117]:
result_df.to_excel("News_output\\News_application_domain_set_part1_withoutCompany.xlsx", index=False)
drop_df.to_excel("News_input\\News_3nd_filting_set.xlsx", index=False)

### 2.4. Apply explicit_company to each news in Application domain set

In [118]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import threading
import json
import jieba
import re

# Define Trie dictionary tree node
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False
        self.company_name = None  # Store the company name

# Create a Trie dictionary tree
def build_trie(company_dict):
    root = TrieNode()
    for key,company_abbreviation in company_dict.items():
        node = root
        name = company_abbreviation
        for char in name:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.company_name = name  # Store the company name
    return root

# Function to extract companies from text
def extract_companies_from_text(text, trie_root, inverted_company_dict):
    companies = set()
    def search_trie(text, node, inverted_company_dict):
        for i in range(len(text)):
            char = text[i]
            if char in node.children:
                node = node.children[char]
                if node.is_end_of_word:
                    companies.add(inverted_company_dict[node.company_name])
            else:
                break


    for i in range(len(text)):
        search_trie(text[i:], trie_root, inverted_company_dict)

    return companies


In [119]:
# Create a Trie dictionary tree
a_share_trie = build_trie(company_dict)

In [120]:
# Updated extraction_row function
def extraction_row(row, trie_root, inverted_company_dict):
    title = row["Title"]
    news_content = row["NewsContent"]
    news = title + news_content

    matched_company_names = extract_companies_from_text(news, trie_root, inverted_company_dict)

    # Converting the matched company names set into a comma-separated string
    matched_company_names_str = ", ".join(matched_company_names)

    return {
        "NewsID": row["NewsID"],
        "Title": title,
        "NewsContent": news_content,
        "NewsSource": row["NewsSource"],
        "Explicit_Company": matched_company_names_str  # Extracted matched company names
    }

# Main processing function
def extraction_data(start, end, full_data, trie_root, extraction_list, inverted_company_dict):
    for i in range(start, end):
        if i % 100 == 0:
            print("--- {} rows have been done ---".format(i))
        combined_row = extraction_row(full_data.iloc[i], trie_root, inverted_company_dict)
        with lock:
            extraction_list.append(combined_row)


In [121]:
withoutCompany_df = pd.read_excel("News_output\\News_application_domain_set_part1_withoutCompany.xlsx")

In [122]:
# Create a global lock for ensuring thread safety
lock = threading.Lock()
# Splitting data chunks
num_threads = 20  # Specify the number of threads
chunk_size = len(withoutCompany_df) // num_threads

threads = []
extraction_list = []
print("Processing data with {} threads...".format(num_threads))

for i in range(num_threads):
    start = i * chunk_size
    end = (i + 1) * chunk_size if i < num_threads - 1 else len(withoutCompany_df)
    thread = threading.Thread(target=extraction_data, args=(start, end, withoutCompany_df, a_share_trie, extraction_list, inverted_company_dict))
    threads.append(thread)
    print("Thread {} is processing rows {} to {}...".format(i + 1, start, end))
    
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()

print("All threads have finished processing.\n")

# Merge results
extraction_df = pd.concat([pd.DataFrame(extraction_list)])


Processing data with 20 threads...
Thread 1 is processing rows 0 to 2449...
Thread 2 is processing rows 2449 to 4898...
Thread 3 is processing rows 4898 to 7347...
Thread 4 is processing rows 7347 to 9796...
Thread 5 is processing rows 9796 to 12245...
Thread 6 is processing rows 12245 to 14694...
Thread 7 is processing rows 14694 to 17143...
Thread 8 is processing rows 17143 to 19592...
Thread 9 is processing rows 19592 to 22041...
Thread 10 is processing rows 22041 to 24490...
Thread 11 is processing rows 24490 to 26939...
Thread 12 is processing rows 26939 to 29388...
Thread 13 is processing rows 29388 to 31837...
Thread 14 is processing rows 31837 to 34286...
Thread 15 is processing rows 34286 to 36735...
Thread 16 is processing rows 36735 to 39184...
Thread 17 is processing rows 39184 to 41633...
Thread 18 is processing rows 41633 to 44082...
Thread 19 is processing rows 44082 to 46531...
Thread 20 is processing rows 46531 to 48984...
--- 0 rows have been done ---
--- 2500 rows ha

--- 16200 rows have been done ---
--- 22900 rows have been done ---
--- 32500 rows have been done ---
--- 27700 rows have been done ---
--- 37400 rows have been done ---
--- 13600 rows have been done ---
--- 42300 rows have been done ---
--- 47200 rows have been done ---
--- 20600 rows have been done ---
--- 7000 rows have been done ---
--- 1600 rows have been done ---
--- 25400 rows have been done ---
--- 18500 rows have been done ---
--- 11200 rows have been done ---
--- 9600 rows have been done ---
--- 4100 rows have been done ---
--- 30100 rows have been done ---
--- 35000 rows have been done ---
--- 39900 rows have been done ---
--- 44800 rows have been done ---
--- 16300 rows have been done ---
--- 23000 rows have been done ---
--- 32600 rows have been done ---
--- 27800 rows have been done ---
--- 37500 rows have been done ---
--- 13700 rows have been done ---
--- 42400 rows have been done ---
--- 47300 rows have been done ---
--- 20700 rows have been done ---
--- 7100 rows have

--- 36400 rows have been done ------ 31500 rows have been done ---

--- 41300 rows have been done ---
--- 46200 rows have been done ---
--- 24400 rows have been done ---
--- 34000 rows have been done ---
--- 29200 rows have been done ---
--- 38900 rows have been done ---
--- 43800 rows have been done ---
--- 48700 rows have been done ---
--- 26900 rows have been done ---
--- 31600 rows have been done ------ 36500 rows have been done ---

--- 41400 rows have been done ---
--- 46300 rows have been done ---
--- 34100 rows have been done ---
--- 29300 rows have been done ---
--- 39000 rows have been done ---
--- 43900 rows have been done ---
--- 48800 rows have been done ---
--- 36600 rows have been done ------ 31700 rows have been done ---

--- 41500 rows have been done ---
--- 46400 rows have been done ---
--- 34200 rows have been done ---
--- 39100 rows have been done ---
--- 44000 rows have been done ---
--- 48900 rows have been done ---
--- 31800 rows have been done ---
--- 36700 rows

In [123]:
extraction_df.to_excel("News_output\\News_application_domain_set_part1_withCompany.xlsx", index=False)

In [2]:
import pandas as pd

# Merge News_application_domain_set_part1_withCompany.xlsx and News_training_domain_set_withCompany.xlsx
# Read the first Excel file
df_part1 = pd.read_excel("News_output\\News_application_domain_set_part1_withCompany.xlsx")

# Read the second Excel file
df_part2 = pd.read_excel("News_output\\News_training_domain_set_withCompany.xlsx")

# Concatenate the two DataFrames
df_combined = pd.concat([df_part1, df_part2])

# Sort the combined DataFrame by the "NewsID" column in ascending order
df_combined_sorted = df_combined.sort_values("NewsID", ascending=True)

# Write the merged DataFrame into a new Excel file
df_combined_sorted.to_excel("News_output\\News_application_domain_set_withCompany.xlsx", index=False)


### 2.5.filter_rate

In [128]:
# Read the Qrigin file
df_total_news = pd.read_excel("News_input\\News.xlsx")
# Read the final file of task1 Q1
df_filtered_news= pd.read_excel("News_output\\News_application_domain_set_withCompany.xlsx")

filter_rate = len(df_filtered_news)/len(df_total_news)

In [129]:
print(f"Number of total news: {len(df_total_news)}")
print(f"Number of filtered news: {len(df_filtered_news)}")
print(f'Filter rate: {filter_rate:.4f}')

Number of total news: 1037035
Number of filtered news: 524226
Filter rate: 0.5055


---

## 3.[Unacceptible] Search by Similarity Slides Window for A Boader Application Domain Set

In [None]:
# import pandas as pd
# import jieba
# from transformers import BertTokenizer, BertModel
# import torch
# import numpy as np

# # Load the Chinese BERT model
# tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# model = BertModel.from_pretrained("bert-base-chinese")

# # Get embeddings for company names
# company_names = [company["name"] for company in a_share_list] 
# name_embeddings = {}
# for company_name in company_names:
#     inputs = tokenizer(company_name, return_tensors="pt", padding=True, truncation=True)
#     outputs = model(**inputs)
#     name_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
#     name_embeddings[company_name] = name_embedding

# # Get embeddings for generated abbreviations
# company_abbreviations = [company["abbreviation"] for company in a_share_list]
# abbreviation_embeddings = {}
# for company_abbreviation in company_abbreviations:
#     inputs = tokenizer(company_abbreviation, return_tensors="pt", padding=True, truncation=True)
#     outputs = model(**inputs)
#     abbreviation_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
#     abbreviation_embeddings[company_name] = abbreviation_embedding    
    
    
# # Define similarity threshold
# similarity_threshold = 0.8


# # Iterate through each news item
# for index, row in news_data.iterrows():
#     # Get news title and content
#     title = row["Title"]
#     news_content = row["NewsContent"]

#     # Concatenate title and content
#     text = title + " " + news_content

#     # Tokenize the text
#     words = jieba.cut(text)

#     # Join the tokenized words into a string
#     text = " ".join(words)

#     # Sliding window processing
#     window_size = 5  # Adjust window size as needed
#     word_list = text.split()
#     for i in range(len(word_list) - window_size + 1):
#         window_text = " ".join(word_list[i:i+window_size])
        
#         # Calculate similarity between the text window and company names
#         max_similarity = 0
#         related_company = None
#         for company_name, company_embedding in company_embeddings.items():
#             inputs = tokenizer(window_text, return_tensors="pt", padding=True, truncation=True)
#             outputs = model(**inputs)
#             window_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy()
            
#             similarity = np.dot(company_embedding, window_embedding.T).max()

#             if similarity > similarity_threshold and similarity > max_similarity:
#                 max_similarity = similarity
#                 related_company = company_name

#         if related_company is not None:
#             related_words.append(related_company)
#         else:
#             related_words.append(None)

# # Add the column of related words to the data
# news_data["RelatedCompany"] = related_words

# # Save the filtered data to a new Excel file
# news_data.to_excel("NewsWithRelatedCompany.xlsx", index=False)
