## NER and Preprocessing
This notebook preprocesses the articles from the LexisNexis API and performs NER for getting other companies mentioned together with the NAATBatt listed ones

### Functions Required
Here, also the Stanza model is downloaded

In [2]:
import pandas as pd
import csv
import json
import stanza
import numpy as np
import os
import glob
import torch
import gc

myLangModel= stanza.Pipeline(lang='en',processors='tokenize,ner')

2024-07-27 02:52:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-27 02:52:34 INFO: Downloaded file to /home/h11922823/stanza_resources/resources.json
2024-07-27 02:52:35 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-07-27 02:52:35 INFO: Using device: cuda
2024-07-27 02:52:35 INFO: Loading: tokenize
2024-07-27 02:52:35 INFO: Loading: mwt
2024-07-27 02:52:35 INFO: Loading: ner
2024-07-27 02:52:35 INFO: Done loading processors!


In [3]:
def cleaning(entity):
    '''
        This function removes from the NERs the company endings.
        It takes as input a company name and returns it cleaned from its suffix.
    '''
    entity = entity.replace('Inc','')
    entity = entity.replace("'s",'')            
    entity = entity.replace('INC','') 
    entity = entity.replace('AG','')
    entity = entity.replace('SA','')
    entity = entity.replace('Corp','')
    entity = entity.replace('BWA','')
    entity = entity.replace('GmBH','')
    entity = entity.replace('LLC','')
    entity = entity.replace('Group','')
    entity = entity.replace('Co ','')
    entity = entity.replace('Ltd','')  
    entity = entity.replace('LP','')
    entity = entity.replace('.','')   
    entity = entity.replace(',','')

    return entity

In [4]:
def filter_values(entity, initial_company):
    '''
        The purpose of this funciton is to remove unwanted NERs from the output
        This function takes as input a NER and the NAATBatt listed company. It 
        compares the NER with a list of exclusions. If all checks are passed, 
        it returns true, otherwise false.
    '''   

    #lists of exclusions
    full_terms = ['EV', 'European Comission', 'SEC', 'WEC', 'Europe', 'European Union', 'United States', 'Gigafactory', 'US Army', 'Q1', 'Q2', 'R&D', 'R & D', 'LFP', 'Green Deal']
    noise_filter = ['show', 'news', 'retirement', 'market', 'wealth', 'advisors', 'assets under', '2024', '2023', '2022', 'press releases', 'sector', 'stock dashboard', 'nyse', 'oem', 'pension', 'pensional', 'free report', 'investment board', 'council', 'government', 'ukri', 'university', 'trojan', 'the ev industry', 'AnalysisEurope']

    #checks if the NER is the NAATBatt listed company
    if initial_company in entity.lower():
        return False

    #checks if a str from noise_filters is present in the name
    for term in noise_filter:
        if term in entity.lower():
            return False
    
    #checks if a value from full_terms is equal with the NER
    for term in full_terms:
        if entity == term:
            return False

    #if all checks are passed return true
    return True

In [5]:
def preprocess_func (initial_company, entities):    
    '''
        This function takes as input NERs for one company listed in
        the NAATBatt database and preprocesses them. It selects only
        organisations and applies the cleaning and filter_values from
        above
    '''
    #keeping only the ORG in the list - not products, cities, countries etc..
    entities = [entity for entity in entities if entity[1] == "ORG"]
    
    #keeping only the name of the entities and not other parameters
    entities = [entity[0] for entity in entities]

    #cleaning the string
    entities = [cleaning(entity) for entity in entities]

    entities = list(filter(lambda entity: filter_values(entity, initial_company), entities))

    #keeping unique results
    entities = list(set(entities))
 
    return entities


In [6]:
def extract_entities(stanza_output):
    import re
    entities = []
    current_entity = []
    current_type = None

    for sentence in stanza_output.sentences:
        for token in sentence.tokens:
            ner_tag = token.ner
            if ner_tag == 'O':
                if current_entity:
                    entities.append((' '.join(current_entity), current_type))
                    current_entity = []
                    current_type = None
            else:
                entity_type = ner_tag.split('-')[1]
                if current_type and current_type != entity_type:
                    entities.append((' '.join(current_entity), current_type))
                    current_entity = []
                current_entity.append(token.text)
                current_type = entity_type
        
        # Append the last entity in the sentence if it exists
        if current_entity:
            current_entity = ' '.join(current_entity)
            tuples = (current_entity, current_type)
            
            entities.append(tuples)
            current_entity = []
            current_type = None

    return entities

In [13]:
def explode_companies (initial_company, data):
    result_text = []
    to_drop = []
    for index,string in enumerate(data.sentences):
        #for logging
        if index % 500 == 0:
            print(index)
            #clear GPU memory
            gc.collect()
            torch.cuda.set_device(1)
            torch.cuda.empty_cache()
            torch.cuda.set_device(0)
            torch.cuda.empty_cache()
        string = string.replace('""', '"')
        
        if string != '{}' and string is not np.nan:
            try:
                dicty = json.loads(string)
            except json.JSONDecodeError as e:
                to_drop.append(index) #will be dropped as the data is corrupted
                continue  # Skip to the next string
            
            pairing = []
            
            for key in dicty.keys():
                text = dicty[key]
    
                res_copy = myLangModel(text)
    
                entities = extract_entities(res_copy)
    
                entities = preprocess_func(
                                        initial_company = initial_company.lower(),
                                        entities = entities)
    
                company_pair = [(entity, initial_company) for entity in entities]
                
                pairing.append((text, company_pair))
            
            result_text.append(pairing)

        else: 
            to_drop.append(index) #if sentences is empty, nothing we can do
            
    
    
    data = data.drop(to_drop, axis =0)
    
    print(initial_company, to_drop, len(result_text), '\n', '\n')
    
    data['prompt_prep'] = result_text

    return data

In [14]:
def format_companies_explosion (data):
    #first keeping unique sentences where we have more detected companies  
    dicty = {}
    for item in data.prompt_prep:
        for sub_item in item:
            key = sub_item[0]
            value = sub_item[1]
    
            if key in dicty:
                dicty[key].append(value)
            else:
                dicty[key] = [value] 
    for key in dicty.keys():
        values = dicty[key]
        flat_values = [item for sublist in values for item in sublist]
        dicty[key] = flat_values

    prompts_list = []
    for key in dicty.keys():
        for tuple in dicty[key]:
            if tuple != []:
                return_value = (key, tuple[1], tuple[0])
                prompts_list.append(return_value)
    
    return prompts_list
    

In [15]:
def run_loop (path, folder):
    print(path)
    data = pd.read_csv(path, 
                       sep = ";",
                       encoding='utf-8',
                       usecols=['title','source','doc_id', 'date', 'sentences'],
                       on_bad_lines = 'skip',
                       engine = 'pyarrow')        
    
    initial_company = folder
    
    result = explode_companies(initial_company,data)

    prompts_list = format_companies_explosion(result)
    print(prompts_list)
    with open('data/'+folder+'/'+folder+'_prompt_ready.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['text', 'initial_company', 'new_company'])  # Write header
        writer.writerows(prompts_list) 
    print('\n')

In [16]:
# Loop through all subfolders in the data folder
data_folder_path ='data/' 
for subfolder in os.listdir(data_folder_path):
    subfolder_path = os.path.join(data_folder_path, subfolder)
    
    # Check if it is a directory
    if os.path.isdir(subfolder_path):
        # Find the CSV file ending with ews.csv in the current subfolder
        csv_files = glob.glob(os.path.join(subfolder_path, '*ews.csv'))
        
        # There should be exactly one CSV file per subfolder as per the given structure
        if csv_files:
            csv_file_path = csv_files[0]  # Get the path of the CSV file
            subfolder_name = os.path.basename(subfolder_path)  # Get the name of the subfolder
            initial_company = subfolder_name.replace('_', '')
            # Print the subfolder name and the first few rows of the dataframe as an example
            try:
                run_loop(path = csv_file_path, 
                         folder = subfolder_name)
            except Exception as e:
                print(e)
                continue
    break

data/Rolled__Ribbon_Battery/Rolled__Ribbon_Battery_0_news.csv
Column 'doc_id' in include_columns does not exist in CSV file
data/Tenergy/Tenergy_0_news.csv
0
name 'sentences' is not defined
data/Morgan_Advanced_Materials/Morgan_Advanced_Materials_1000_news.csv
0
name 'sentences' is not defined
data/Nanoramic_Laboratories/Nanoramic_Laboratories_0_news.csv
0
name 'sentences' is not defined
data/American_Battery_Factory/American_Battery_Factory_100_news.csv
0
name 'sentences' is not defined
data/Calogy_Solutions/Calogy_Solutions_0_news.csv
Column 'doc_id' in include_columns does not exist in CSV file
data/Trojan_Battery/Trojan_Battery_0_news.csv
0
name 'sentences' is not defined
data/Enersys/Enersys_500_news.csv
0
name 'sentences' is not defined
data/BMZ_USA/BMZ_USA_0_news.csv
Column 'doc_id' in include_columns does not exist in CSV file
data/LION_Electric/LION_Electric_700_news.csv
0
name 'sentences' is not defined
data/Amphenol/Amphenol_350_news.csv
0
name 'sentences' is not defined
dat