In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import scipy
import statsmodels.api as smplot

In [9]:
data = pd.read_csv('key developments.csv')
print(data.head())

     keydevid  companyid   companyname  \
0  1821559127      18511  3i Group plc   
1  1822662100      18511  3i Group plc   
2  1867777550      18511  3i Group plc   
3  1825170395      18511  3i Group plc   
4  1830432803      18511  3i Group plc   

                                            headline  keydeveventtypeid  \
0                        3i Group plc - Special Call                194   
1  Morellato S.p.A. agreed to acquire CHRIST Juwe...                 80   
2  Morellato S.p.A. completed the acquisition of ...                 81   
3                        3i Group plc - Special Call                194   
4  3i Group plc Presents at Tech Innovation Confe...                 51   

                          eventtype  keydevtoobjectroletypeid objectroletype  \
0                     Special Calls                         1         Target   
1     M&A Transaction Announcements                         4         Seller   
2          M&A Transaction Closings                     

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1942227 entries, 0 to 1942226
Data columns (total 15 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   keydevid                  int64  
 1   companyid                 int64  
 2   companyname               object 
 3   headline                  object 
 4   keydeveventtypeid         int64  
 5   eventtype                 object 
 6   keydevtoobjectroletypeid  int64  
 7   objectroletype            object 
 8   announcedate              object 
 9   enterdate                 object 
 10  entereddateutc            object 
 11  lastmodifieddate          object 
 12  speffectivedate           object 
 13  sptodate                  float64
 14  gvkey                     float64
dtypes: float64(2), int64(4), object(9)
memory usage: 222.3+ MB


In [11]:
gvkey = pd.read_csv('gvkeys.csv')

# Group by gvkey and ticker and keep only gvkey and ticker columns
grouped_gvkey = gvkey.groupby(['gvkey', 'TICKER']).size().reset_index().rename(columns={0: 'count'})
grouped_gvkey = grouped_gvkey[['gvkey', 'TICKER']]

# Display the grouped gvkey dataframe
print(grouped_gvkey)

       gvkey TICKER
0       1004    AIR
1       1045    AAL
2       1050   CECE
3       1050   CECO
4       1075    PNW
...      ...    ...
4181  345980   WISH
4182  347007   IBRX
4183  349337   MDAI
4184  349530   NXTP
4185  349972   INDP

[4186 rows x 2 columns]


In [12]:
# Check for and remove duplicate entries in the grouped_gvkey dataframe
grouped_gvkey.drop_duplicates(subset=['gvkey'], inplace=True)

# Perform a left join using gvkey as the matching ID
data = pd.merge(data, grouped_gvkey[['gvkey', 'TICKER']], on='gvkey', how='left')

# Display the merged dataframe
print(data)

           keydevid   companyid   companyname  \
0        1821559127       18511  3i Group plc   
1        1822662100       18511  3i Group plc   
2        1867777550       18511  3i Group plc   
3        1825170395       18511  3i Group plc   
4        1830432803       18511  3i Group plc   
...             ...         ...           ...   
1942222  2147481082  2147481079           NaN   
1942223  1837753492  2147481182           NaN   
1942224  1843462326  2147481182           NaN   
1942225  2147482189  2147482184           NaN   
1942226  1837759204  2147483631           NaN   

                                                  headline  keydeveventtypeid  \
0                              3i Group plc - Special Call                194   
1        Morellato S.p.A. agreed to acquire CHRIST Juwe...                 80   
2        Morellato S.p.A. completed the acquisition of ...                 81   
3                              3i Group plc - Special Call                194   
4     

In [13]:
# Filter rows where the headline is not null
data = data.dropna(subset=['TICKER'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230934 entries, 441 to 1824199
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   keydevid                  230934 non-null  int64  
 1   companyid                 230934 non-null  int64  
 2   companyname               230934 non-null  object 
 3   headline                  230934 non-null  object 
 4   keydeveventtypeid         230934 non-null  int64  
 5   eventtype                 230934 non-null  object 
 6   keydevtoobjectroletypeid  230934 non-null  int64  
 7   objectroletype            230934 non-null  object 
 8   announcedate              230934 non-null  object 
 9   enterdate                 230934 non-null  object 
 10  entereddateutc            230934 non-null  object 
 11  lastmodifieddate          230934 non-null  object 
 12  speffectivedate           230934 non-null  object 
 13  sptodate                  0 non-null     

In [14]:
total_observations = len(data)

# Define the number of subsets
num_subsets = 50

# Calculate the length of each subset
subset_length = total_observations // num_subsets
remainder = total_observations % num_subsets  # Remainder for the last subset

# Create 50 subsets with approximately equal lengths
subsets = []
start_index = 0
for i in range(1, num_subsets + 1):
    # Determine the length of the current subset
    if i == num_subsets:
        # For the last subset, add the remainder to the subset length
        current_subset_length = subset_length + remainder
    else:
        current_subset_length = subset_length
    
    # Sample rows from the DataFrame with the calculated subset length
    subset = data.iloc[start_index:start_index + current_subset_length]
    
    # Assign the subset to a variable with a name like sub1, sub2, ..., sub50
    globals()[f'sub{i}'] = subset
    
    # Append the subset to the list of subsets
    subsets.append(subset)
    
    # Update the start index for the next subset
    start_index += current_subset_length

In [15]:
sub45.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4618 entries, 1317857 to 1362249
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   keydevid                  4618 non-null   int64  
 1   companyid                 4618 non-null   int64  
 2   companyname               4618 non-null   object 
 3   headline                  4618 non-null   object 
 4   keydeveventtypeid         4618 non-null   int64  
 5   eventtype                 4618 non-null   object 
 6   keydevtoobjectroletypeid  4618 non-null   int64  
 7   objectroletype            4618 non-null   object 
 8   announcedate              4618 non-null   object 
 9   enterdate                 4618 non-null   object 
 10  entereddateutc            4618 non-null   object 
 11  lastmodifieddate          4618 non-null   object 
 12  speffectivedate           4618 non-null   object 
 13  sptodate                  0 non-null      float64
 14 

In [7]:
from openai import OpenAI
import pandas as pd

# Define your OpenAI API key
api_key = ""
client = OpenAI(api_key=api_key)

# Assuming 'data' is your original DataFrame
# Create an empty list to store the results
results = []

# Loop through each headline in the dataset
for headline in sub39['headline']:
    # Create a request to the Completions endpoint
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=f"Compulsorily classify the following statement (entire) as either one of these - negative, positive, or neutral: {headline}",
        max_tokens=100
    )
    
    # Extract the generated completion text
    completion_text = response.choices[0].text.strip()
    
    # Append the completion text to the results list
    results.append(completion_text)

# Create a new column 'Sentiment' in the original DataFrame and assign sentiment analysis results
sub39['Sentiment'] = results

sub39.to_csv('sub39.csv', index=False)

# Print the first few rows of the DataFrame with the new 'Sentiment' column
sub39_sample.head()

Unnamed: 0,keydevid,companyid,companyname,headline,keydeveventtypeid,eventtype,keydevtoobjectroletypeid,objectroletype,announcedate,enterdate,entereddateutc,lastmodifieddate,speffectivedate,sptodate,gvkey,Sentiment
1497831,1842990978,674368498,Moonfire Ventures,Filigran announced that it has received €5 mil...,83,Private Placements,3,Buyer,2023-06-13,2023-06-13,14JUN2023:03:02:00.000000,13JUN2023:23:03:00.000000,14JUN2023:03:15:12.000000,,,Neutral.
1489947,1844766672,667566903,Buildots Limited,Buildots Names Jessica Herrala as Head of Nort...,16,Executive/Board Changes - Other,1,Target,2023-06-21,2023-06-21,22JUN2023:00:45:00.000000,21JUN2023:20:47:00.000000,22JUN2023:01:00:17.000000,,,Neutral
1477039,1826078328,657539967,"Zentiva Group, a.s.","Zentiva Group, a.s. Presents at BIO-Europe Spr...",51,Company Conference Presentations,1,Target,2023-02-15,2023-02-15,15FEB2023:17:03:00.000000,28FEB2023:11:03:00.000000,20MAR2023:06:15:14.000000,,,Neutral
1477395,1838796247,657882688,"E2open Parent Holdings, Inc.","E2open Parent Holdings, Inc., Annual General M...",62,Annual General Meeting,1,Target,2023-05-26,2023-05-26,26MAY2023:20:57:00.000000,26MAY2023:16:57:00.000000,07JUL2023:00:00:00.000000,,36299.0,Neutral
1489222,1833056827,667394509,Battronics,Battronics Presents at Electric & Hybrid Vehic...,51,Company Conference Presentations,1,Target,2023-04-13,2023-04-13,13APR2023:16:03:00.000000,13APR2023:12:03:00.000000,23MAY2023:00:00:00.000000,,,Positive


In [17]:
from openai import OpenAI
import pandas as pd

# Define your OpenAI API key
api_key = ""
client = OpenAI(api_key=api_key)

# Iterate over each subset
for i in range(1, 51):
    # Load the subset data (assuming it's stored in variables sub1, sub2, ..., sub50)
    subset = globals()[f"sub{i}"].copy()  # Make a copy to avoid modifying the original DataFrame
    
    # Create an empty list to store the results
    results = []

    # Loop through each headline in the subset
    for headline in subset['headline']:
        # Create a request to the Completions endpoint
        response = client.completions.create(
            model="gpt-3.5-turbo-instruct",
            prompt=f"Compulsorily classify the following statement (entire) as either one of these - negative, positive, or neutral: {headline}",
            max_tokens=100
        )

        # Extract the generated completion text
        completion_text = response.choices[0].text.strip()

        # Append the completion text to the results list
        results.append(completion_text)

    # Create a new column 'Sentiment' in the subset DataFrame and assign sentiment analysis results
    subset['Sentiment'] = results

    # Save the subset DataFrame to a CSV file with a unique filename
    subset.to_csv(f'sub{i}_sentiment.csv', index=False)

    # Print progress
    print(f"Subset {i} sentiment analysis completed.")

Subset 1 sentiment analysis completed.
Subset 2 sentiment analysis completed.
Subset 3 sentiment analysis completed.
Subset 4 sentiment analysis completed.
Subset 5 sentiment analysis completed.
Subset 6 sentiment analysis completed.
Subset 7 sentiment analysis completed.
Subset 8 sentiment analysis completed.
Subset 9 sentiment analysis completed.
Subset 10 sentiment analysis completed.
Subset 11 sentiment analysis completed.
Subset 12 sentiment analysis completed.
Subset 13 sentiment analysis completed.
Subset 14 sentiment analysis completed.
Subset 15 sentiment analysis completed.
Subset 16 sentiment analysis completed.
Subset 17 sentiment analysis completed.
Subset 18 sentiment analysis completed.
Subset 19 sentiment analysis completed.
Subset 20 sentiment analysis completed.
Subset 21 sentiment analysis completed.
Subset 22 sentiment analysis completed.
Subset 23 sentiment analysis completed.
Subset 24 sentiment analysis completed.
Subset 25 sentiment analysis completed.
Subset 26