## Inferring Results with the OpenAI GPT 3.5

### Prerequisites

In [123]:
import pandas as pd
import os
import glob
import re
import openai
import tiktoken

### Prompts

In [179]:
# Template for the user message sent to the model. The excerpt and the two
# company names are wrapped in XML-style tags so the system instruction can
# refer to them unambiguously. Placeholders: {text}, {comp_initial}, {comp_new}.
# The closing tag of company B previously read "</comapny B>", which left the
# block unbalanced; it is now well-formed.
prompt_template = """<article> {text} </article>
<company A> {comp_initial} </company A>
<company B> {comp_new} </company B>
"""

In [178]:
# System message for the classifier: describes the task, enumerates the five
# allowed labels and gives four few-shot examples in the same XML layout as
# the user prompt. Fixes relative to the earlier text:
#   - every "</comapny B>" closing tag is now "</company B>" (balanced tags);
#   - the "supplier" label now says company A is part of company B's supply
#     chain (it previously referred to company A's own supply chain).
system_instruction = """ You are a classifier that analyzes a given excerpt of a news article (delimited with XML tags) and extracts the relationship between two companies: company A and B (delimited with XML tags). The answer will only be based on the information provided in the excerpt. You reply with only one word, which labels the type of interaction between the two companies. 

The labels you can output are:
- "supplier": if company A supplies products to company B and is part of company B's supply chain.
- "buyer": if company A is buying any parts in its production from company B.
- "merger": if companies A and B are involved in a mergers & acquisitions transaction.
- "partnership": if company A and company B are collaborating in any other way
- "nothing": if there is no connection between company A and company B from the excerpt (for example they appear together in an enumeration)

Few shot examples:
<examples>
<user>
<article> General Motors' (GM) GM Ventures joined in an $11 million Series A funding round in North Carolina lithium-metal battery maker Soelect </article>
<company A> Soelect </company A> <company B> GM </company B>
</user>
<assistant> merger </assistant>
<user>
<article> Ford Motor Company and Changan Automobile recently announced their commitment to strengthening their strategic cooperation and will innovate the business models and increase cooperation efficiency. </article>
<company A> Ford Motor </company A> <company B> Changan Automobile </company B>
</user>
<assistant> partnership </assistant>
<user>
<article> The South Korean battery maker signed an agreement with Akasol, the leading manufacturer of high-performance lithium-ion battery systems in Frankfurt for two orders for global commercial vehicles. Under the agreement, Samsung SDI will supply its battery cells and modules to the German. </article>
<company A> Akasol </company A> <company B> Samsung SDI </company B>
</user>
<assistant> buyer </assistant>
<user>
<article> Key Players Profiled in the study includes:- Cabot,Cytec Solvay,HEG,Hexcel,Mersen S.A,Mitsubishi Rayon,Morgan Advanced Materials,SEC Carbon,IBIDEN,GrafTechCataloging the Competitive Terrain of the Carbon & Graphite Market </article>
<company A> Morgan Advanced Materials </company A> <company B> Mitsubishi </company B>
</user>
<assistant> nothing </assistant>
</examples>
"""

#### Obtaining a list of Automotive OEMs

In [176]:
# NOTE(review): third-party import kept local to this cell; consider moving it
# to the imports cell at the top so a fresh-kernel run surfaces it early.
import sparql_dataframe

# SPARQL endpoint that the query below is sent to (Wikidata query service)
endpoint = "https://query.wikidata.org/sparql"

# SPARQL query for car manufacturers and their revenue.
# It selects entities that have wdt:P31 = wd:Q786820 (annotated in the query
# as "instance of car manufacturer") together with their wdt:P2139 value
# (annotated as the revenue property), asks the label service for a readable
# label, sorts by revenue in descending order and keeps at most 300 rows.
query = """
SELECT ?manufacturer ?manufacturerLabel ?revenue WHERE {
  ?manufacturer wdt:P31 wd:Q786820;  # Instance of car manufacturer
                wdt:P2139 ?revenue.   # Revenue property
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
ORDER BY DESC(?revenue)
LIMIT 300
"""

# wdt:P31 with wd:Q786820 - "instance of" restricted to the car-manufacturer item
# wdt:P2139 - revenue property
# Run the query and collect the result in `df` (a pandas DataFrame).
# post=True presumably submits the query via HTTP POST - TODO confirm against
# the sparql_dataframe documentation.
df = sparql_dataframe.get(endpoint, query, post=True)

In [170]:
# Manually curated list of car manufacturer / brand names, derived from the
# Wikidata result above and cleaned by hand. It contains several spellings of
# the same company on purpose (e.g. 'VW' / 'Volkswagen', 'Citroen' / 'Citroën',
# 'Rolls-Royce' / 'Rolls Royce') because select_oems matches these names as
# whole words inside the company names found in the news data.
list_car_manufacturers = [
    'Volkswagen', 'VW', 'VW Group', 'Audi', 'Skoda', 'Seat', 'Cupra', 'KTM', 'Puch', 
    'Nova', 'Nova Bus', 'BRP', 'Campagna', 'Lion Electric', 'BYD', 'Geely', 'Great Wall', 'NIO', 
    'Xpeng', 'SAIC', 'Changan', 'FAW', 'Hongqi', 'Trumpchi', 'GAC', 'Chery', 'Li Auto', 'Rimac', 
    'Tatra', 'Zenvo', 'Electric Raceabout', 'Aixam', 'Alpine', 'DS', 'Citroen', 'Citroën', 'Bugatti', 
    'Peugeot', 'PSA', 'Renault', 'Renault Trucks', 'Alpina', 'BMW', 'Daimler', 'Mercedes', 
    'Mercedes-Benz', 'Opel', 'RUF', 'Porsche', 'Smart', 'Maybach', 'Ashok Leyland', 'Bajaj', 
    'Eicher', 'Hero MotoCorp', 'Mahindra', 'Maruti Suzuki', 'SML Isuzu', 'Tata', 'Tata Motors', 
    'TVS', 'Atul Auto', 'Hindustan', 'ICML', 'Omega Seiki Mobility', 'KAL', 'Esemka', 'Pindad', 
    'Abarth', 'Alfa Romeo', 'Ferrari', 'Fiat', 'Stellantis', 'Lancia', 'Lamborghini', 'Maserati', 
    'Pagani', 'Piaggio', 'Acura', 'Daihatsu', 'Honda', 'Infiniti', 'Isuzu', 'Lexus', 'Mazda', 
    'Mitsubishi', 'Mitsubishi Motors', 'Nissan', 'Subaru', 'Suzuki', 'Toyota', 'Spyker', 'Solaris', 
    'Arrinera', 'Dacia', 'Aurus', 'GAZ', 'Daewoo', 'Hyundai', 'KIA', 'Koenigsegg', 'Polestar', 
    'Volvo', 'Volvo Cars', 'Saab', 'Otokar', 'Aston Martin', 'Bentley', 'Jaguar', 'Lagonda', 
    'Land Rover', 'Range Rover', 'Lotus', 'MINI', 'Morgan', 'Rolls-Royce', 'Rolls Royce', 'McLaren', 
    'TVR', 'Vauxhall', 'Ford', 'GM', 'Chevrolet', 'Cadillac', 'Dodge', 'Jeep', 'Buick', 'GMC', 
    'Chrysler', 'Lincoln', 'Hennessey', 'Shelby', 'SRT', 'Tesla', 'Rivian', 'Lucid Motors', 
    'Fisker', 'Faraday', 'Nikola Motor', 'Canoo', 'BrightDrop', 'Polaris', 'Aptera'
]

144

#### Function for selecting only OEMs from the companies identified with NER

In [125]:
def select_oems(data, list_car_manufacturers):
    '''
        Keep only the rows whose `new_company` looks like an automotive OEM.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame parsed from the ner_stanza_preproc notebook. It must have a
            `new_company` column holding the named entity found in the text.
            The frame is NOT modified; the function works on a copy.
        list_car_manufacturers : iterable of str
            Manually cleaned manufacturer / brand names retrieved from
            WikiData.

        Returns
        -------
        pandas.DataFrame
            The rows of `data` (original capitalisation and index labels kept)
            that
              * contain no missing value in any column,
              * have a `new_company` that contains at least one manufacturer
                name as a whole word (case-insensitive), and
              * have a `new_company` shorter than 40 characters.
            The NAATBatt companies (the other columns) stay as they are.
            If `list_car_manufacturers` is empty, no row is returned.
    '''

    import re

    #some values are NAs and we cannot use them; dropna returns a new frame,
    #so the caller's dataframe is left untouched
    cleaned = data.dropna(axis = 0).copy()
    cleaned['new_company'] = cleaned['new_company'].astype(str)

    #we will match based on lower characters
    manufacturers = [producer.lower() for producer in list_car_manufacturers]

    #an empty alternation would be an empty regex, which matches every row;
    #with no manufacturers to look for there is nothing to keep
    if not manufacturers:
        return cleaned.iloc[0:0]

    new_company_lower = cleaned['new_company'].str.lower()

    # One alternation of whole-word patterns; re.escape protects names with
    # regex metacharacters (e.g. the hyphen/dot in some brand names)
    pattern = '|'.join(f'\\b{re.escape(manufacturer)}\\b' for manufacturer in manufacturers)

    # Boolean mask of partial matches; masking (instead of re-selecting by
    # index label) keeps the capitalisation and is safe with duplicate labels
    is_oem = new_company_lower.str.contains(pattern, case=False, na=False)
    data_filtered = cleaned[is_oem]

    #removing new_company names of 40 characters or more - the list has max 20
    #and such long entities are almost certainly false results
    data_filtered = data_filtered[data_filtered['new_company'].str.len() < 40]

    return data_filtered

### Main code snippet for creating the prompts

#### Filtering to only keep NAATBatt companies and automotive OEMs

In [177]:
data_folder_path = 'data/'

#column layout shared by every per-company file and by the combined frame
prompt_columns = ['text', 'initial_company', 'new_company']

#collect the filtered frames in a list and concatenate once at the end
#(concatenating inside the loop copies the growing frame on every iteration).
#The leading empty frame guarantees the expected columns even if no file is found.
oem_frames = [pd.DataFrame({column: [] for column in prompt_columns})]

#iterating over the per-company subfolders
for subfolder in os.listdir(data_folder_path):
    subfolder_path = os.path.join(data_folder_path, subfolder)

    # Skip anything in the data folder that is not a directory
    if not os.path.isdir(subfolder_path):
        continue

    # Find the CSV file ending with _prompt_ready.csv in the current subfolder
    csv_files = glob.glob(os.path.join(subfolder_path, '*_prompt_ready.csv'))

    # There should be exactly one such CSV file per subfolder as per the given
    # structure; subfolders without one are skipped and only the first match is read
    if not csv_files:
        continue

    csv_file_path = csv_files[0]  # Get the path of the CSV file
    data = pd.read_csv(csv_file_path,
                       sep = ',',
                       usecols = prompt_columns,
                       on_bad_lines = 'skip',
                       engine = 'pyarrow')

    #filtering the oems from the predefined list
    filtered_oems = select_oems(data, list_car_manufacturers)

    #queueing for the combined dataframe which will include all companies
    oem_frames.append(filtered_oems)

prompt_raw_data = pd.concat(oem_frames, ignore_index=True)

#these create a lot of self loops so we remove them: a row is dropped when the
#same filter string occurs in both company names (case-insensitive substring)
filter_strings = ['ford', 'morgan', 'rivian']
initial_lower = prompt_raw_data['initial_company'].astype(str).str.lower()
new_lower = prompt_raw_data['new_company'].astype(str).str.lower()
is_self_loop = pd.Series(False, index=prompt_raw_data.index)
for s in filter_strings:
    is_self_loop |= (initial_lower.str.contains(s, regex=False)
                     & new_lower.str.contains(s, regex=False))
prompt_raw_data = prompt_raw_data[~is_self_loop]

#persist the intermediate result so the next section can start from disk
prompt_raw_data.to_csv('oem_raw_prompt.csv')

#### Creating the Prompts

In [127]:
#reading the intermediary results written by the filtering step above;
#index_col = 0 restores the index that to_csv stored as the first column
prompt_raw_data = pd.read_csv('oem_raw_prompt.csv', index_col = 0)

In [180]:
#encoding used by the target model; built once and reused for every prompt
token_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def count_tokens(text):
    '''
        Returns the number of tokens the text of a prompt is encoded into for
        gpt-3.5-turbo. Only the given text is counted (no system instruction
        and no per-message overhead). disallowed_special=() makes the encoder
        treat special-token look-alikes in the article as ordinary text
        instead of raising an error.
    '''
    return len(token_encoding.encode(str(text), disallowed_special=()))

#matches URLs (http/https or www.) together with any characters glued in front
#of them; compiled once because it does not change between rows
url_pattern = re.compile(r'(\S*https?://\S+|\S*www\.\S+)')

#temporary list of prompts
list_prompts = []

for index,row in prompt_raw_data.iterrows():
    #cleaning the company name
    #NOTE(review): double spaces are removed entirely (not collapsed to one
    #space), which glues the neighbouring words together - confirm this is intended
    comp = row['initial_company'].replace('_',' ').replace('  ','').replace('+','')

    #cleaning the text: drop line breaks, backticks and URLs
    text = row['text'].replace('\n','').replace('`','')
    text = url_pattern.sub('', text)

    #adding the companies in the prompts
    prompt = prompt_template.format(text = text,
                                    comp_initial = comp,
                                    comp_new = row['new_company'])

    #adding in the temporary list of prompts
    list_prompts.append((comp, prompt, row['new_company']))

#parsing the prompts lists to csv
prompts = pd.DataFrame(list_prompts, columns = ['company', 'prompt', 'connection'])
prompts['tokens'] = prompts.prompt.apply(count_tokens)
prompts.to_csv('oem_prompts.csv')

KeyboardInterrupt: 

### Running the Inference with GPT 3.5 turbo

In [144]:
#oem_prompts.csv was written with its index as the first column; reading it
#back as the index avoids a stray 'Unnamed: 0' column (same as oem_raw_prompt.csv)
prompts = pd.read_csv('oem_prompts.csv', index_col = 0)

In [146]:
#setting the token: the key is read from the OPENAI_API_KEY environment
#variable so that no credential is stored in the notebook; a missing variable
#fails here with a KeyError instead of at the first API call
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [181]:
def run_openai(prompt):
    '''
         Sends one prompt to the OpenAI chat completions API through the
         module-level `client` and returns the text of the first choice.

         The conversation consists of the shared `system_instruction` as the
         system message followed by `prompt` as the user message.
    '''
    # Conversation: fixed system instruction first, then the prompt itself
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": prompt},
    ]

    # Sampling / length settings of the request
    request_options = dict(
        model="gpt-3.5-turbo",
        temperature=0.1,
        max_tokens=50,
        top_p=1,
        frequency_penalty=2,
        presence_penalty=0,
    )

    completion = client.chat.completions.create(messages=conversation, **request_options)

    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content



In [None]:

#columns of every checkpoint file, in output order
result_columns = ['company', 'prompt', 'connection', 'tokens', 'result']

def _empty_results():
    '''Returns a fresh accumulator: one empty list per result column.'''
    return {column: [] for column in result_columns}

def _save_results(results, label):
    '''Writes the accumulated results to final_results/<label>-streaming.csv.'''
    pd.DataFrame(results).to_csv('final_results/'+str(label)+'-streaming.csv')

#the checkpoint folder has to exist before the first write
os.makedirs('final_results', exist_ok=True)

#for parsing the results
results = _empty_results()

#index of the last prompt that was answered successfully
last_done = None

#looping through the list of prompts to get streaming answers from all of them
for i, row in prompts.iterrows():
    company_a = row.company
    prompt = row.prompt
    company_b = row.connection
    tokens = row.tokens
    try:
        result = run_openai(prompt)
    except Exception as e:
        #stop at the first API failure; whatever was collected so far is
        #still written by the final flush below
        print(e)
        break

    results['company'].append(company_a)
    results['prompt'].append(prompt)
    results['connection'].append(company_b)
    results['tokens'].append(tokens)
    results['result'].append(result)
    last_done = i

    #checkpoint every 1500 prompts: write the batch and start a new one
    if i % 1500 == 0:
        _save_results(results, i)
        results = _empty_results()
        print(i)

#final flush: rows answered after the last checkpoint (end of the data or an
#early stop) would otherwise never be written to disk
if results['result']:
    _save_results(results, last_done)
    print(last_done)

15000
