<a href="https://colab.research.google.com/github/IrisMeng9/2024_thematic_investing/blob/main/Transcript_bullish_bearish_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face Transformers library and upgrade the OpenAI client.
!pip install transformers
!pip install --upgrade openai
# Remove and reinstall the PyTorch packages — presumably a workaround for the
# runtime's preinstalled build; TODO confirm this is still required.
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import glob
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import json

# Initialize model and tokenizer
# Both are loaded from the pretrained "ProsusAI/finbert" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Initialize sentiment pipeline
# truncation=True is passed so over-long inputs are truncated instead of
# failing (presumably to the model's maximum sequence length — confirm
# against the transformers pipeline documentation).
sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True)

In [None]:
# Read every parquet file in the transcript folder into one DataFrame.
folder_path = "/content/drive/MyDrive/capstone/transcript"

# Sort the matches so the row order of combined_df (and therefore the order of
# symbol_list) is reproducible; glob returns files in arbitrary order.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
if not file_paths:
    # Fail here with a clear message instead of a KeyError on 'symbol' below.
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Read each file once and concatenate in a single call; growing a DataFrame
# with concat inside a loop re-copies all previously loaded rows every time.
combined_df = pd.concat(
    [pd.read_parquet(file_path) for file_path in file_paths],
    ignore_index=True,
)

# Unique ticker symbols in order of first appearance.
symbol_list = combined_df['symbol'].unique().tolist()
print(symbol_list)

In [None]:
# Display the combined transcript DataFrame as the cell output for a visual check.
combined_df

In [None]:
# Function to adjust sentiment classification
def adjust_sentiment_classification(text, sentiment, score):
    """Override the model's label for sentences that describe rising costs.

    A sentence containing "costs increased" or "expenses increased" is forced
    to "negative". The phrase match ignores case, so a sentence that starts
    with "Costs increased ..." is caught as well.

    Args:
        text: The sentence that was classified. Non-string values (e.g. None)
            are treated as containing neither phrase.
        sentiment: The label predicted by the model.
        score: The probability the model reported for its predicted label.

    Returns:
        A ``(label, score)`` tuple. ``score`` is passed through unchanged, so
        after an override it is still the model's confidence in its original
        label, not in "negative".
    """
    lowered = text.lower() if isinstance(text, str) else ""
    if "costs increased" in lowered or "expenses increased" in lowered:
        return "negative", score
    return sentiment, score

# Score every sentence with the sentiment pipeline, apply the rule-based
# override, and keep each row's original fields next to its prediction.
results = []
for _, row in combined_df.iterrows():
    text = row['sentence_context']
    prediction = sentiment_pipeline(text)[0]  # first result for this input
    label, probability = adjust_sentiment_classification(
        text, prediction['label'], prediction['score']
    )

    record = dict(row)
    record["classified_text"] = {
        "predicted_value": label,
        "prediction_probability": probability,
        "model_id": 'ProsusAI/finbert',
    }
    results.append(record)

# One DataFrame row per classified sentence.
result_df = pd.DataFrame(results)

print("Sentiment classification completed")

# Flatten the nested prediction dict into top-level columns.
for field in ('predicted_value', 'prediction_probability'):
    result_df[field] = result_df['classified_text'].apply(lambda cell, key=field: cell[key])


In [None]:
# Save the classified sentences to an Excel workbook in the Drive transcript folder.
result_df.to_excel('/content/drive/MyDrive/capstone/transcript/result_df.xlsx')

## Generate bullish/bearish statements for each company

In [None]:
import openai
from openai import OpenAI

# Never store the API key in the notebook itself: a saved or shared notebook
# would leak it. Read it from the OPENAI_API_KEY environment variable and, if
# that is not set, prompt for it without echoing the input.
from getpass import getpass

_openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Kept for any code that still relies on the module-level key.
openai.api_key = _openai_api_key

client = OpenAI(api_key=_openai_api_key)

In [None]:
# Function to extract the highest-confidence positive and negative sentences by company
def extract_top_bottom_sentences(company_symbol, df, n=10):
    """Select a company's most confidently positive and negative sentences.

    Args:
        company_symbol: Value to match against the ``symbol`` column.
        df: DataFrame with ``symbol``, ``predicted_value`` and
            ``prediction_probability`` columns.
        n: Maximum number of sentences to return per sentiment (default 10).

    Returns:
        A ``(top_positive, top_negative)`` tuple of DataFrames. Each holds up
        to ``n`` rows of the company with that predicted label, ordered by
        ``prediction_probability`` from highest to lowest. Either frame is
        empty when the company has no sentences with that label.
    """
    company_df = df[df['symbol'] == company_symbol]

    def _top(label):
        # Rows of this company carrying `label`, highest probability first.
        matching = company_df[company_df['predicted_value'] == label]
        return matching.nlargest(n, 'prediction_probability')

    return _top('positive'), _top('negative')

# Function to create a summary using OpenAI
def create_summary(symbol, top_positive, top_negative):
    """Ask the chat model for a bullish/bearish write-up of one company.

    Uses the module-level ``client`` to send one chat-completion request.

    Args:
        symbol: Company ticker symbol, inserted into the prompt.
        top_positive: DataFrame whose ``sentence_context`` column holds the
            positive sentences to quote, in the order they should be numbered.
        top_negative: Same as ``top_positive`` for the negative sentences.

    Returns:
        The generated summary as a string with surrounding whitespace removed
        (an empty string if the response carries no text). Returning the text
        rather than the message object keeps printed and saved summaries free
        of the object's repr.
    """
    # Number the sentences 1..N with enumerate(); the index yielded by
    # iterrows() is the row's original label in the full dataset, not a rank.
    positive_sentences = "\n".join(
        f"Positive Sentence {rank}: {sentence}"
        for rank, sentence in enumerate(top_positive['sentence_context'], start=1)
    )
    negative_sentences = "\n".join(
        f"Negative Sentence {rank}: {sentence}"
        for rank, sentence in enumerate(top_negative['sentence_context'], start=1)
    )

    prompt = f"""
    Company Symbol: {symbol}

    Bullish (Positive) Sentences:
    {positive_sentences}

    Bearish (Negative) Sentences:
    {negative_sentences}

    Please provide a summarized bullish and bearish write-up for the company {symbol} in 150 words.
    """

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.5
    )

    # The message content can be None, so fall back to an empty string.
    summary = (response.choices[0].message.content or "").strip()
    return summary

In [None]:
# Extract and summarize sentences for each company
# `summaries` maps each ticker symbol to whatever create_summary returns for it.
summaries = {}
for symbol in symbol_list:
    # This company's highest-probability positive and negative sentences.
    top_positive, top_negative = extract_top_bottom_sentences(symbol, result_df)
    # One chat-completion request is sent per symbol.
    summary = create_summary(symbol, top_positive, top_negative)
    summaries[symbol] = summary
    print(f"\nSummary for {symbol}:\n{summary}\n")


In [None]:

# summary output
# Store the summary text for each symbol. A value may be a plain string or a
# chat message object exposing `.content`; taking the content avoids writing
# the object's repr into the JSON file.
string_summaries = {}
for key, value in summaries.items():
    content = getattr(value, "content", None)
    string_summaries[key] = content if isinstance(content, str) else str(value)

# Written to the current working directory.
with open('summaries.json', 'w') as f:
    json.dump(string_summaries, f, indent=4)

## Sector Summary

In [None]:
# Gather every sentence of the combined dataset into a plain Python list,
# in row order, by reading the column directly.
transcripts = combined_df['sentence_context'].tolist()

The bullish and bearish summaries of each company are held in the `summaries` dictionary generated above.

In [None]:
# Prompt for the sector-level write-up: the list of symbols, the collected
# sentences, and the questions the model should answer.
industry_writeup_prompt = f"""
 Here is the list of the companies in the AI sector we chose:
{", ".join(combined_df['symbol'].unique())}

Here are the transcripts of each company: {transcripts}

Based on the sentiment analysis above, please answer the following questions:
1. What are these AI companies saying about important industry trends?
2. What are companies in the AI sector saying about their growth outlook?
3. What opportunities and challenges do these companies face?
4. What risks should investors be aware of?
5. What are the AI companies focising on ESG criteria?
6. How does the AI companies performance vary between different market cycles?


Summarize your responses in a detailed industry write-up.
"""

# NOTE(review): the prompt embeds the repr of the entire `transcripts` list.
# On a large corpus this may not fit in the model's context window — confirm,
# and consider sending the per-company summaries instead.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": industry_writeup_prompt}
    ],
    # Prompt and completion share gpt-4's 8,192-token context window, so the
    # previous request for 10,000 completion tokens could never be satisfied.
    max_tokens=2000,
    temperature=0.5
)
print("Industry write-up completed")

In [None]:
# Take the generated text from the first choice of the response and trim
# surrounding whitespace before printing it.
industry_writeup = response.choices[0].message.content.strip()
print(industry_writeup)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Hard-coded example sentences used to demonstrate the topic-modelling steps.
documents = [
    "AI companies are investing heavily in machine learning and data science.",
    "Growth outlook for AI companies remains positive.",
    "Challenges in the AI sector include data privacy and security.",
    "Investors are optimistic about the opportunities in AI.",
    "AI companies face risks such as regulatory changes.",
]

# The example text is already clean, so no preprocessing step is applied.

# Build the document-term matrix of word counts, dropping English stop words.
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(documents)

# Fit a two-topic LDA model on the counts; random_state is fixed to 0.
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(dtm)

# Step 4: Display the topics
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object whose ``components_`` attribute yields one array of
            per-term weights for each topic.
        feature_names: Sequence mapping a term index to its string.
        no_top_words: How many terms to list per topic.

    For each topic a ``Topic <idx>:`` header is printed, followed by one line
    with the selected terms separated by spaces, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in strongest]
        print(" ".join(top_terms))

# Print the five highest-weighted words of each fitted topic.
display_topics(lda, vectorizer.get_feature_names_out(), 5)
