In [None]:
!pip install torch transformers neo4j matplotlib torch-geometric spacy networkx
!pip install stanza torchtext torch-scatter torch-sparse torch-cluster torch-spline-conv
from google.colab import drive
drive.mount('/content/drive')
import spacy
spacy.cli.download("en_core_web_sm")
import os
import re
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
from torch.optim import AdamW
from neo4j import GraphDatabase
from torch.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

# Provide default Neo4j connection settings for this notebook session.
# setdefault() is used so that values already supplied by the runtime
# (Colab secrets, a shell export, CI variables, ...) are NOT overwritten.
#
# SECURITY: a live password is committed below. Anyone with access to this
# file can reach the database. Rotate this credential and inject
# NEO4J_PASSWORD from a secret store instead of keeping the literal here.
os.environ.setdefault('NEO4J_URI', 'neo4j+s://1cc6fff5.databases.neo4j.io')
os.environ.setdefault('NEO4J_USER', 'neo4j')
os.environ.setdefault('NEO4J_PASSWORD', 'meX8mx5oh5L8uJvK8XYBtdO68aFJoDYtlTvhkMZWx-k')

# Define a custom dataset class for BERT
class BertDataset(Dataset):
    """Map-style dataset pairing tokenizer output with per-sample labels.

    Args:
        encodings: Mapping of field name to an indexable tensor whose first
            axis enumerates samples (the code indexes ``val[idx]`` and calls
            ``.clone().detach()`` on the result).
        labels: Indexable sequence of labels; its length defines the dataset
            length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence is the source of truth for the sample count.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of copied tensors plus its label."""
        sample = {}
        for field, tensor in self.encodings.items():
            # clone().detach() hands out a copy that is independent of the
            # stored encodings and carries no autograd history.
            sample[field] = tensor[idx].clone().detach()
        sample['labels'] = self.labels[idx]
        return sample

# Neo4j connection details
def connect_to_neo4j(uri, user, password):
    """Create a Neo4j driver and confirm that the server is reachable.

    Args:
        uri: Connection URI of the Neo4j instance.
        user: Username for basic authentication.
        password: Password for basic authentication.

    Returns:
        The driver object when the connection could be verified, otherwise
        ``None``. Any error is printed rather than raised.
    """
    driver = None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        # Constructing the driver does not by itself prove the server is
        # reachable or that the credentials are valid; verify_connectivity()
        # forces a round trip so the success message below is truthful.
        driver.verify_connectivity()
        print("Connected to Neo4j successfully.")
        return driver
    except Exception as e:
        print(f"An error occurred while connecting to Neo4j: {e}")
        if driver is not None:
            # Release the half-initialised driver instead of leaking it.
            driver.close()
        return None

# Updated graph creation function with meaningful node names and properties
def create_knowledge_graph(driver, entities, relationships):
    """Upsert entity nodes and CAUSES relationships into Neo4j.

    Args:
        driver: Object exposing ``session()`` that returns a context manager
            with a ``run(query, **params)`` method, or ``None``.
        entities: Iterable of dicts with a required ``'name'`` key and
            optional ``'type'`` / ``'year'`` keys (default ``'Unknown'``).
        relationships: Iterable of ``(from_name, to_name, rel_type)`` tuples.
            Every edge is stored with the label ``CAUSES``; ``rel_type`` is
            kept as the edge's ``type`` property.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        raised while running queries are caught and printed.
    """
    if driver is None:
        print("Cannot create graph, Neo4j driver not connected.")
        return

    node_query = """
                    MERGE (n:FinancialEntity {name: $name})
                    ON CREATE SET n.type = $type, n.year = $year, n.created = timestamp()
                    """
    # The creation timestamp must NOT be part of the MERGE pattern: a fresh
    # timestamp never matches an existing edge, so every run would add a
    # duplicate relationship. Match on the stable `type` property only and
    # stamp `created` when the edge is first created.
    relationship_query = """
                    MATCH (a:FinancialEntity {name: $from_name}), (b:FinancialEntity {name: $to_name})
                    MERGE (a)-[r:CAUSES {type: $rel_type}]->(b)
                    ON CREATE SET r.created = timestamp()
                    """

    with driver.session() as session:
        try:
            for entity in entities:
                session.run(
                    node_query,
                    name=entity['name'],
                    type=entity.get('type', 'Unknown'),
                    year=entity.get('year', 'Unknown'),
                )

            for from_name, to_name, rel_type in relationships:
                session.run(
                    relationship_query,
                    from_name=from_name,
                    to_name=to_name,
                    rel_type=rel_type,
                )
            print("Knowledge graph created successfully with named nodes and relationships.")
        except Exception as e:
            print(f"An error occurred while creating the knowledge graph: {e}")

# Load data from nested directory structure
def load_data_from_directory(base_dir):
    """Collect the text of every ``full-submission.txt`` below ``base_dir``.

    Args:
        base_dir: Root directory to walk recursively.

    Returns:
        A list of ``{'text': <file content>, 'label': 0}`` dicts, one per
        non-blank submission file. The list is empty when ``base_dir`` does
        not exist or nothing usable was found. The label is a placeholder
        (always 0).
    """
    data = []
    if not os.path.exists(base_dir):
        print(f"Path does not exist: {base_dir}")
        return data

    filename = 'full-submission.txt'
    for subdir, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # resulting sample order does not depend on the file system.
        dirs.sort()
        if filename not in files:
            continue

        file_path = os.path.join(subdir, filename)
        # Decode explicitly as UTF-8 and replace undecodable bytes: without
        # this the result depends on the platform locale and a single stray
        # byte aborts the whole load with UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()

        if text.strip():  # Ensure non-empty content
            data.append({'text': text, 'label': 0})  # Replace 'label' logic as needed
        else:
            print(f"Empty content in file: {filename} at {subdir}")

    if not data:
        print("No valid data loaded from the directory.")
    return data

# Combine data from multiple years
def combine_data_from_years(base_dir, selected_years):
    """Load and concatenate filing data for each requested year.

    Folders under ``<base_dir>/10-K`` are matched by the two-digit year that
    appears between dashes in their name (e.g. ``-23-`` for 2023).

    Args:
        base_dir: Company directory that contains the ``10-K`` folder.
        selected_years: Iterable of years (e.g. ``"2016"``). Surrounding
            whitespace is ignored and blank entries are skipped.

    Returns:
        A list with the samples from every matching folder, in the order the
        years were given. Empty when the ``10-K`` folder is missing or no
        folder matches.
    """
    ten_k_dir = os.path.join(base_dir, '10-K')
    try:
        # List once (not once per year) and sort so results are reproducible.
        folders = sorted(os.listdir(ten_k_dir))
    except OSError:
        # A missing or unreadable 10-K folder is reported, not raised.
        print(f"Path does not exist: {ten_k_dir}")
        return []

    combined_data = []
    for raw_year in selected_years:
        year = str(raw_year).strip()
        if not year:
            continue

        year_tag = f'-{year[-2:]}-'
        matching_folders = [f for f in folders if year_tag in f]
        if not matching_folders:
            print(f"No folder found for the year {year}.")
            continue

        # Use every filing of that year; picking only the first directory
        # entry would silently drop the rest in an arbitrary way.
        for year_folder in matching_folders:
            year_dir = os.path.join(ten_k_dir, year_folder)
            combined_data.extend(load_data_from_directory(year_dir))
    return combined_data

# Preprocessing function
def preprocess_data(data):
    """Preprocessing hook; currently a pass-through.

    No cleaning or tokenization is performed yet: the very same object that
    was passed in is handed back to the caller.
    """
    processed = data
    return processed

def plot_metrics(training_losses, validation_losses):
    """Draw both loss curves on one figure and display it.

    Args:
        training_losses: Sequence of per-epoch training losses.
        validation_losses: Sequence of per-epoch validation losses.
    """
    plt.figure(figsize=(10, 5))

    # One (series, legend label) pair per curve, drawn in this order.
    curves = (
        (training_losses, 'Training Loss'),
        (validation_losses, 'Validation Loss'),
    )
    for losses, legend_label in curves:
        plt.plot(losses, label=legend_label)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.show()

def train_bert(data, base_dir, epochs=3, batch_size=8, learning_rate=5e-5):
    """Fine-tune ``bert-base-uncased`` for sequence classification.

    The samples are split 80/20 into training and validation sets, tokenized
    (truncated to 512 tokens), trained for ``epochs`` epochs and the per-epoch
    average losses are logged to TensorBoard, printed and plotted. The model
    and tokenizer are saved to ``<base_dir>/bert_model``.

    Args:
        data: List of dicts with ``'text'`` and ``'label'`` keys.
        base_dir: Directory that receives the TensorBoard run
            (``runs/bert_experiment``) and the saved model.
        epochs: Number of passes over the training set.
        batch_size: Batch size for both data loaders.
        learning_rate: Learning rate passed to AdamW.

    Returns:
        None. Returns early (after printing a message) when fewer than two
        samples are given or the train/validation split fails.
    """
    # Validate first: nothing (log directory, model download) should be
    # created for input that cannot be trained on.
    if len(data) < 2:
        print(f"Not enough data to perform train-test split. Found {len(data)} sample(s).")
        return

    processed_data = preprocess_data(data)
    sentences = [item['text'] for item in processed_data]
    labels = [item['label'] for item in processed_data]

    # Split data into training and validation sets
    try:
        train_sentences, val_sentences, train_labels, val_labels = train_test_split(
            sentences, labels, test_size=0.2, random_state=42
        )
    except ValueError as e:
        print(f"Error in train-test split: {e}")
        return

    # Define tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    # The CUDA mixed-precision path below only makes sense when the model and
    # every batch actually live on the GPU, so move them explicitly.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model.to(device)

    # Tokenize data
    train_encodings = tokenizer(train_sentences, truncation=True, padding=True, max_length=512, return_tensors='pt')
    val_encodings = tokenizer(val_sentences, truncation=True, padding=True, max_length=512, return_tensors='pt')

    # Create data loaders
    train_loader = DataLoader(BertDataset(train_encodings, train_labels), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(BertDataset(val_encodings, val_labels), batch_size=batch_size, shuffle=False)

    # Training setup (optimizer is built after the model is on its device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scaler = GradScaler(enabled=use_cuda)  # Loss scaling is only used on CUDA

    training_losses = []
    validation_losses = []

    writer = SummaryWriter(log_dir=os.path.join(base_dir, 'runs/bert_experiment'))
    try:
        for epoch in range(epochs):
            # ---- training ----
            model.train()
            total_train_loss = 0
            for batch in train_loader:
                batch = {key: value.to(device) for key, value in batch.items()}
                optimizer.zero_grad()
                if use_cuda:
                    with autocast(device_type='cuda'):
                        outputs = model(**batch)
                        loss = outputs.loss
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs = model(**batch)
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()

                total_train_loss += loss.item()

            avg_train_loss = total_train_loss / len(train_loader)
            training_losses.append(avg_train_loss)
            writer.add_scalar('Training Loss', avg_train_loss, epoch)

            # ---- validation ----
            model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for batch in val_loader:
                    batch = {key: value.to(device) for key, value in batch.items()}
                    outputs = model(**batch)
                    total_val_loss += outputs.loss.item()

            avg_val_loss = total_val_loss / len(val_loader)
            validation_losses.append(avg_val_loss)
            writer.add_scalar('Validation Loss', avg_val_loss, epoch)

            print(f"Epoch {epoch + 1} completed - Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

    # Save the model
    model_save_path = os.path.join(base_dir, "bert_model")
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model and tokenizer saved to {model_save_path}")

    # Plot the training and validation losses
    plot_metrics(training_losses, validation_losses)

def get_available_years(base_dir):
    """List the filing years found under ``<base_dir>/10-K``.

    The year is taken from the two digits between dashes in each folder name
    (e.g. ``23`` in ``0001652044-23-000016``). Two-digit years up to the
    current year are read as 20xx, larger ones as 19xx.

    Args:
        base_dir: Company directory that contains the ``10-K`` folder.

    Returns:
        Sorted list of four-digit year strings; empty when the ``10-K``
        folder does not exist or no folder name matches.
    """
    from datetime import date  # local import: only needed for the pivot

    ten_k_path = os.path.join(base_dir, '10-K')
    if not os.path.exists(ten_k_path):
        print(f"Path does not exist: {ten_k_path}")
        return []

    # Pivot on the current year instead of a hard-coded 23, which would
    # label filings from 2024 onwards as 1924, 1925, ...
    current_two_digit_year = date.today().year % 100

    years = set()
    for folder_name in os.listdir(ten_k_path):
        match = re.search(r'-(\d{2})-', folder_name)
        if not match:
            continue
        two_digit_year = int(match.group(1))
        century = 2000 if two_digit_year <= current_two_digit_year else 1900
        years.add(str(century + two_digit_year))

    return sorted(years)

def main():
    """Interactive entry point: pick a company and years, train, build graph.

    Steps: prompt for a company and filing years, load the matching filings
    from Google Drive, fine-tune BERT on them, then write a small example
    causality graph to Neo4j and print sample Cypher queries.
    """
    # Prompt user for company selection
    companies = ['AAPL', 'AMZN', 'GOOGL', 'MSFT']
    print(f"Available companies: {companies}")
    company = input("Select the company for analysis: ").strip().upper()
    if company not in companies:
        print("Invalid company selected.")
        return

    base_dir = f'/content/drive/MyDrive/datasets/sec-edgar-filings/{company}'
    print(f"Searching for files in: {base_dir}")

    # Get available years for the selected company
    available_years = get_available_years(base_dir)
    print(f"Available years for {company}: {available_years}")
    if not available_years:
        print("No available years found.")
        return

    # Prompt user for multiple year selection. Each token is stripped and
    # blanks are dropped so input like "2016 , 2017," still matches folders.
    raw_years = input("Select the years for analysis (comma separated, e.g., 2016,2017): ")
    selected_years = [year.strip() for year in raw_years.split(',') if year.strip()]
    combined_data = combine_data_from_years(base_dir, selected_years)

    if not combined_data:
        print("No data to process. Exiting.")
        return

    # Train BERT model using combined data
    train_bert(combined_data, base_dir, epochs=30)

    # Connect to Neo4j and create the knowledge graph
    uri = os.getenv("NEO4J_URI")
    user = os.getenv("NEO4J_USER")
    password = os.getenv("NEO4J_PASSWORD")
    driver = connect_to_neo4j(uri, user, password)

    # Example entities and relationships for creating a causality knowledge graph
    entities = [
        {"name": "Revenue", "type": "Metric", "year": "2023"},
        {"name": "Profit", "type": "Metric", "year": "2023"},
        {"name": "Expenses", "type": "Metric", "year": "2023"},
        {"name": "Market Growth", "type": "External Factor", "year": "2023"}
    ]

    relationships = [
        ("Revenue", "Profit", "CAUSES"),
        ("Expenses", "Profit", "DECREASES"),
        ("Market Growth", "Revenue", "INCREASES")
    ]

    try:
        create_knowledge_graph(driver, entities, relationships)
    finally:
        # Release the driver's connections; it was previously never closed.
        if driver is not None:
            driver.close()

    # Hints for inspecting the result in Neo4j Browser
    print("\nProcessing complete. You can now visualize the graph in Neo4j Browser.")
    print("Sample Cypher queries:")
    print("\n1. For Market Growth as a Cause use:\n\n MATCH (n:FinancialEntity {name: 'Market Growth'})-[r:CAUSES]->(m:FinancialEntity) RETURN n, r, m;\n")
    print("\n2. Causal Relationships Among Core Financial Metrics use:\n\n MATCH (n:FinancialEntity)-[r:CAUSES]->(m:FinancialEntity) WHERE n.name IN ['Revenue', 'Profit', 'Expenses', 'Growth', 'Loss'] AND m.name IN ['Revenue', 'Profit', 'Expenses', 'Growth', 'Loss'] RETURN n, r, m;")
    print("\n3. For Revenue Relationships use: \n \nMATCH (n:FinancialEntity {name: 'Revenue'})-[r]->(m:FinancialEntity) RETURN n, r, m;")
    print("\n4. To view all relationships: \n\nMATCH (n) RETURN n LIMIT 25\n\n")


if __name__ == "__main__":
    main()


