## Financial Domain N-gram Language Model Pipeline for Audio Transcription CSV Data

### IMPORT LIBRARIES

### MODEL CONFIGURATION

In [None]:
@dataclass
class ModelConfig:
    """Hyperparameters shared by the preprocessor and the n-gram models."""

    # Order of the n-gram model (3 = trigram).
    n: int = 3
    # Smoothing scheme name: 'laplace', 'backoff' or 'kneser_ney'.
    smoothing: str = "backoff"
    # Additive constant used by Laplace smoothing.
    alpha: float = 0.1
    # Absolute discount used by Kneser-Ney smoothing.
    discount: float = 0.75
    # Words seen fewer times than this are left out of the vocabulary.
    min_frequency: int = 2
    # Upper bound on the number of regular (non-special) vocabulary entries.
    max_vocab_size: int = 50_000
    # Fraction of the data held out for evaluation.
    test_split: float = 0.2

### DATA PREPROCESSING

In [None]:
class FinancialDataPreprocessor:
    """Text preprocessor optimized for financial earnings call transcripts.

    Text is lower-cased, selected domain terms and numeric expressions are
    replaced by canonical or placeholder tokens, and the result is split on
    whitespace. The class also owns the word <-> ID vocabulary mappings.
    """

    def __init__(self, config: "ModelConfig"):
        """Store the configuration and compile the normalization patterns.

        Args:
            config: Object providing ``min_frequency`` and ``max_vocab_size``;
                both are read by ``build_vocabulary``.
        """
        self.config = config
        self.vocab = {}
        self.word_to_id = {}
        self.id_to_word = {}
        self.unk_token = "<UNK>"
        self.start_token = "<START>"
        self.end_token = "<END>"
        self.pad_token = "<PAD>"

        # All patterns below are applied to text that is ALREADY lower-cased.
        # The euro sign is written as an escape so the pattern does not depend
        # on how this file is encoded; the magnitude suffix is matched
        # case-insensitively so "$5M" (seen here as "$5m") is recognised.
        self._currency_pattern = re.compile(
            r'[$\u20ac][\d,.]+(?:\.\d+)?[mbk]?\b', re.IGNORECASE
        )
        # No trailing \b: "%" is not a word character, so "%\b" would only
        # match when a letter or digit directly follows the percent sign.
        self._percent_pattern = re.compile(r'\b\d[\d,.]*%')
        self._year_pattern = re.compile(r'\b(19|20)\d{2}\b')
        # Must start with a digit; otherwise the "." in "u.s." or the ","
        # in "a,b" would be treated as a number.
        self._number_pattern = re.compile(r'\b\d[\d,.]*\b')
        self._url_pattern = re.compile(r'http\S+|www\S+')

        # Common financial abbreviations and terms
        self._financial_terms = {
            'ebit': 'EBIT',
            'eps': 'EPS',
            'roa': 'ROA',
            'roe': 'ROE',
            'gdp': 'GDP',
            'npl': 'NPL',
            'ncl': 'NCL',
            'costco': '<COMPANY>',
            'citi': '<COMPANY>',
            'brexit': '<EVENT>',
        }
        # One alternation replaces all terms in a single pass over the text.
        self._financial_term_pattern = re.compile(
            r'\b(?:'
            + '|'.join(re.escape(term) for term in self._financial_terms)
            + r')\b'
        )

    def _normalize_financial_text(self, text: str) -> str:
        """Return ``text`` lower-cased with domain placeholders substituted.

        Substitution order matters: currency and percent expressions are
        replaced before years and plain numbers so their digits are not
        consumed by the more general patterns.
        """
        text = text.lower()

        # Canonicalize known abbreviations / entity names.
        text = self._financial_term_pattern.sub(
            lambda match: self._financial_terms[match.group(0)], text
        )

        text = self._currency_pattern.sub('<CURRENCY>', text)
        text = self._percent_pattern.sub('<PERCENT>', text)
        text = self._year_pattern.sub('<YEAR>', text)

        # Bucket large numbers but keep small counts verbatim.
        text = self._number_pattern.sub(self._normalize_number, text)

        # Collapse runs of whitespace.
        return re.sub(r'\s+', ' ', text).strip()

    def _normalize_number(self, match):
        """Map a matched numeric string to a size bucket.

        Returns ``'<LARGE_NUMBER>'`` for values >= 1,000,000,
        ``'<MEDIUM_NUMBER>'`` for values >= 1,000, the comma-stripped digits
        for smaller values, and ``'<NUMBER>'`` when the text is not a valid
        float (e.g. ``"1.2.3"``).
        """
        num_str = match.group(0).replace(',', '')

        try:
            num = float(num_str)
        except ValueError:
            return '<NUMBER>'

        if num >= 1000000:  # Millions
            return '<LARGE_NUMBER>'
        if num >= 1000:  # Thousands
            return '<MEDIUM_NUMBER>'
        return num_str  # Keep small numbers as is

    def tokenize(self, text: str) -> List[str]:
        """Normalize ``text`` and split it on whitespace.

        Punctuation stays attached to the neighbouring word; placeholder
        tokens such as ``<YEAR>`` are kept intact.
        """
        return self._normalize_financial_text(text).split()

    def build_vocabulary(self, texts: List[str]) -> Counter:
        """Build the word <-> ID mappings from ``texts``.

        The four special tokens always receive IDs 0-3 in the order
        PAD, UNK, START, END. Regular words follow in descending frequency,
        limited to ``config.max_vocab_size`` entries with at least
        ``config.min_frequency`` occurrences.

        Returns:
            Counter of every token seen, before filtering.
        """
        print("Building vocabulary from financial transcripts...")

        # Count incrementally instead of materialising every token of the
        # corpus in one list first.
        word_counts = Counter()
        for text in texts:
            word_counts.update(self.tokenize(text))

        print(f"Raw vocabulary size: {len(word_counts)}")

        # Filter vocabulary
        filtered_words = [
            word for word, count in word_counts.most_common(self.config.max_vocab_size)
            if count >= self.config.min_frequency
        ]

        # Build mappings
        special_tokens = [self.pad_token, self.unk_token, self.start_token, self.end_token]
        self.word_to_id = {token: idx for idx, token in enumerate(special_tokens)}

        for idx, word in enumerate(filtered_words, start=len(special_tokens)):
            self.word_to_id[word] = idx

        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}

        print(f"Final vocabulary size: {len(self.word_to_id)}")
        print(f"Top 15 financial terms: {word_counts.most_common(15)}")

        return word_counts

    def encode(self, text: str) -> List[int]:
        """Tokenize ``text`` and map each token to its ID (UNK if unknown)."""
        unk_id = self.word_to_id.get(self.unk_token, 1)
        return [self.word_to_id.get(token, unk_id) for token in self.tokenize(text)]

    def decode(self, ids: List[int]) -> str:
        """Map IDs back to tokens (UNK for unknown IDs), joined by spaces."""
        return ' '.join(self.id_to_word.get(idx, self.unk_token) for idx in ids)

### N-GRAM LANGUAGE MODEL

In [None]:
class NGramLanguageModel:
    """N-gram language model for financial text prediction.

    The scoring scheme is selected by ``config.smoothing``:

    * ``'laplace'``    - add-alpha smoothing over the full vocabulary.
    * ``'backoff'``    - relative frequency of the longest context in which
      the token was seen, falling back through shorter contexts down to the
      unigram and finally a uniform distribution. No discounting is applied,
      so these scores are not a normalized probability distribution.
    * ``'kneser_ney'`` - absolute discounting interpolated with a
      continuation distribution.
    """

    # Special-token IDs. They mirror the order in which
    # FinancialDataPreprocessor.build_vocabulary assigns PAD, UNK, START, END.
    PAD_ID = 0
    START_ID = 2
    END_ID = 3

    def __init__(self, config: "ModelConfig"):
        """Create an untrained model.

        Args:
            config: Object providing ``n``, ``smoothing``, ``alpha`` and
                ``discount``.

        Raises:
            ValueError: If ``config.n`` is smaller than 1.
        """
        if config.n < 1:
            raise ValueError(f"n must be at least 1, got {config.n}")

        self.config = config
        self.n = config.n
        self.smoothing = config.smoothing
        self.alpha = config.alpha
        self.discount = config.discount

        # Highest-order statistics: context (n-1 tokens) -> target -> count.
        self.ngram_counts = defaultdict(lambda: defaultdict(int))
        self.context_counts = defaultdict(int)
        # Same statistics for every shorter context (length 0 .. n-2);
        # these are what 'backoff' falls back to.
        self.lower_ngram_counts = defaultdict(lambda: defaultdict(int))
        self.lower_context_counts = defaultdict(int)
        # Kneser-Ney: number of distinct highest-order contexts each token
        # follows, and the total number of distinct n-gram types.
        self.continuation_counts = defaultdict(int)
        self.total_ngram_types = 0

        self.vocab_size = 0
        self._prob_cache = {}

    def train(self, texts: List[List[int]], vocab_size: int) -> None:
        """Accumulate n-gram statistics from encoded transcripts.

        Each sequence is left-padded with ``n - 1`` PAD IDs and terminated
        with the END ID. Sequences shorter than two tokens are skipped.
        Counts add up across repeated calls.

        Args:
            texts: Token-ID sequences.
            vocab_size: Vocabulary size used for uniform / Laplace terms.
        """
        print(f"Training {self.n}-gram model with {self.smoothing} smoothing...")

        self.vocab_size = vocab_size
        # Cached probabilities are stale as soon as the counts change.
        self._prob_cache.clear()

        order = self.n
        total_ngrams = 0

        for token_ids in texts:
            # Skip very short sequences
            if len(token_ids) < 2:
                continue

            padded = [self.PAD_ID] * (order - 1) + list(token_ids) + [self.END_ID]

            for end in range(order - 1, len(padded)):
                target = padded[end]
                context = tuple(padded[end - order + 1:end])

                followers = self.ngram_counts[context]
                if target not in followers:
                    # First time this (context, target) type is observed.
                    self.continuation_counts[target] += 1
                    self.total_ngram_types += 1
                followers[target] += 1
                self.context_counts[context] += 1
                total_ngrams += 1

                # Shorter contexts, from the unigram (length 0) upwards.
                for length in range(order - 1):
                    short_context = tuple(padded[end - length:end])
                    self.lower_ngram_counts[short_context][target] += 1
                    self.lower_context_counts[short_context] += 1

        print(f"Total n-grams extracted: {total_ngrams:,}")
        print(f"Unique contexts: {len(self.ngram_counts):,}")

    def _context_for(self, history) -> Tuple:
        """Return the last ``n - 1`` IDs of ``history``, PAD-padded on the left.

        This reproduces the padding applied in ``train`` so that short or
        empty histories still hit contexts the model has actually seen.
        """
        width = self.n - 1
        if width == 0:
            # history[-0:] would be the whole list, not an empty slice.
            return ()
        padded = [self.PAD_ID] * width + list(history)
        return tuple(padded[-width:])

    def _stats_for(self, context: Tuple):
        """Return ``(followers, total)`` for a context of any length."""
        if len(context) == self.n - 1:
            return (self.ngram_counts.get(context, {}),
                    self.context_counts.get(context, 0))
        return (self.lower_ngram_counts.get(context, {}),
                self.lower_context_counts.get(context, 0))

    def _laplace_probability(self, token: int, context: Tuple) -> float:
        """Add-alpha estimate over the highest-order counts."""
        count = self.ngram_counts.get(context, {}).get(token, 0)
        total = self.context_counts.get(context, 0)
        return (count + self.alpha) / (total + self.alpha * self.vocab_size)

    def _backoff_probability(self, token: int, context: Tuple) -> float:
        """Relative frequency in the longest context where ``token`` occurs.

        The context is shortened from the left until the token has been
        observed after it; if even the unigram count is zero the result is
        uniform over the vocabulary.
        """
        current = tuple(context)
        while True:
            followers, total = self._stats_for(current)
            count = followers.get(token, 0)
            if count > 0 and total > 0:
                return count / total
            if not current:
                return 1.0 / self.vocab_size
            current = current[1:]

    def _kneser_ney_probability(self, token: int, context: Tuple) -> float:
        """Interpolated absolute-discounting estimate.

        The discounted n-gram frequency is combined with a continuation
        probability (distinct contexts the token follows, divided by the
        number of distinct n-gram types) so that the mass removed by
        discounting is redistributed and the result sums to one.
        """
        followers = self.ngram_counts.get(context, {})
        total = self.context_counts.get(context, 0)
        if total <= 0:
            # Unseen context: nothing to discount, fall back to uniform.
            return 1.0 / self.vocab_size

        d = self.discount
        discounted = max(followers.get(token, 0) - d, 0) / total
        lambda_factor = d * len(followers) / total

        if len(context) > 0 and self.total_ngram_types > 0:
            backoff_prob = (self.continuation_counts.get(token, 0)
                            / self.total_ngram_types)
        else:
            backoff_prob = 1.0 / self.vocab_size

        return discounted + lambda_factor * backoff_prob

    def _get_probability(self, token: int, context: Tuple) -> float:
        """Return the (cached) score of ``token`` after ``context``.

        The value is floored at 1e-10 so that its logarithm is always finite.

        Raises:
            ValueError: If ``self.smoothing`` is not a known scheme.
        """
        cache_key = (token, context)
        cached = self._prob_cache.get(cache_key)
        if cached is not None:
            return cached

        if self.smoothing == 'laplace':
            prob = self._laplace_probability(token, context)
        elif self.smoothing == 'backoff':
            prob = self._backoff_probability(token, context)
        elif self.smoothing == 'kneser_ney':
            prob = self._kneser_ney_probability(token, context)
        else:
            raise ValueError(f"Unknown smoothing: {self.smoothing}")

        prob = max(prob, 1e-10)
        self._prob_cache[cache_key] = prob
        return prob

    def evaluate(self, texts: List[List[int]]) -> Dict[str, float]:
        """Score held-out sequences.

        Sequences shorter than ``n`` tokens are skipped. A "novel" n-gram is
        one whose (context, target) pair was never seen in training.

        Returns:
            Dict with ``perplexity``, ``avg_log_prob``, ``loss``,
            ``total_tokens``, ``novel_ngrams`` and ``novel_ngram_rate``.
        """
        print(f"Evaluating on {len(texts)} transcripts...")

        total_log_prob = 0.0
        total_tokens = 0
        novel_ngrams = 0
        total_ngrams = 0
        order = self.n

        for token_ids in texts:
            if len(token_ids) < order:
                continue

            padded = [self.PAD_ID] * (order - 1) + list(token_ids) + [self.END_ID]

            for end in range(order - 1, len(padded)):
                target = padded[end]
                context = tuple(padded[end - order + 1:end])

                if target not in self.ngram_counts.get(context, {}):
                    novel_ngrams += 1

                total_log_prob += np.log(self._get_probability(target, context))
                total_tokens += 1
                total_ngrams += 1

        # Calculate metrics
        avg_log_prob = total_log_prob / total_tokens if total_tokens > 0 else 0
        perplexity = np.exp(-avg_log_prob) if total_tokens > 0 else float('inf')
        novel_rate = novel_ngrams / total_ngrams if total_ngrams > 0 else 0

        metrics = {
            'perplexity': perplexity,
            'avg_log_prob': avg_log_prob,
            'loss': -avg_log_prob,
            'total_tokens': total_tokens,
            'novel_ngrams': novel_ngrams,
            'novel_ngram_rate': novel_rate
        }

        print(f"  Perplexity: {perplexity:.2f}")
        print(f"  Novel n-gram rate: {novel_rate:.4f} ({novel_ngrams}/{total_ngrams})")

        return metrics

    def predict_next_words(self, context_text: str, preprocessor, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the ``top_k`` most likely next words after ``context_text``.

        Args:
            context_text: Raw text; it is encoded with ``preprocessor``.
            preprocessor: Object providing ``encode``, ``id_to_word`` and
                ``unk_token``.
            top_k: Number of (word, score) pairs to return.

        PAD, START and END are never proposed.
        """
        context = self._context_for(preprocessor.encode(context_text))
        excluded = {self.PAD_ID, self.START_ID, self.END_ID}

        predictions = []
        for token_id in range(self.vocab_size):
            if token_id in excluded:
                continue
            word = preprocessor.id_to_word.get(token_id, preprocessor.unk_token)
            predictions.append((word, self._get_probability(token_id, context)))

        predictions.sort(key=lambda pair: pair[1], reverse=True)
        return predictions[:top_k]

    def generate(self, preprocessor, max_length: int = 50,
                 temperature: float = 1.0) -> str:
        """Sample a token sequence and return it as text.

        Generation starts from the same all-PAD context that training uses
        for the beginning of a transcript and stops after ``max_length``
        tokens or once END is sampled (END is included in the output).

        Args:
            preprocessor: Object providing ``id_to_word`` and ``unk_token``.
            max_length: Maximum number of tokens to sample.
            temperature: Values below 1 sharpen the distribution, values
                above 1 flatten it.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        # Never sample PAD or START; guard the index for tiny vocabularies.
        blocked = [idx for idx in (self.PAD_ID, self.START_ID)
                   if idx < self.vocab_size]
        generated = []

        for _ in range(max_length):
            context = self._context_for(generated)

            probs = np.array(
                [self._get_probability(token_id, context)
                 for token_id in range(self.vocab_size)],
                dtype=float,
            )

            if temperature != 1.0:
                probs = np.power(probs, 1.0 / temperature)

            for idx in blocked:
                probs[idx] = 0.0

            mass = probs.sum()
            if mass > 0:
                next_token = int(np.random.choice(self.vocab_size, p=probs / mass))
            else:
                next_token = self.END_ID

            generated.append(next_token)

            if next_token == self.END_ID:
                break

        tokens = [preprocessor.id_to_word.get(idx, preprocessor.unk_token)
                  for idx in generated]
        return ' '.join(tokens)

### PIPELINE MANAGER

In [None]:
class PipelineManager:
    """Complete pipeline for financial n-gram models.

    Loads transcripts, builds the vocabulary, trains and evaluates several
    n-gram configurations, and writes plots plus JSON/text reports to
    ``output_dir``.
    """

    def __init__(self, output_dir: str = "./financial_ngram_results"):
        """Create the output directory and initialise empty pipeline state."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.config = ModelConfig()
        self.preprocessor = None
        self.models = {}
        self.results = {}

    def load_financial_data(self, filepath: str) -> List[str]:
        """Load transcript strings from a pipe-delimited file.

        The ``transcript`` column is used when present, otherwise the third
        column. If pandas cannot parse the file, a line-based fallback takes
        the last ``|``-separated field of every line with at least three
        fields.

        Returns:
            List of transcript strings; empty if the file has no usable rows.
        """
        print(f"Loading data from {filepath}...")

        try:
            df = pd.read_csv(filepath, sep='|', header=0)

            if 'transcript' in df.columns:
                column = df['transcript']
            elif len(df.columns) >= 3:  # Assume 3rd column is transcript
                column = df.iloc[:, 2]
            else:
                raise ValueError("Could not find transcript column")

            # Coerce to str so numeric-looking cells can be sliced/tokenized.
            texts = column.dropna().astype(str).tolist()

            print(f"Loaded {len(texts)} financial transcripts")

            print("\nSample transcripts:")
            for i, sample in enumerate(texts[:3], start=1):
                print(f"  {i}. {sample[:80]}...")

            return texts

        except Exception as e:
            # Deliberately broad: any parsing problem triggers the
            # best-effort line-based reader below.
            print(f"Error loading CSV: {e}")
            print("Trying alternative loading method...")

            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            # Skip header if exists (an empty file has no first line).
            if lines and 'transcript' in lines[0].lower():
                lines = lines[1:]

            # Extract transcripts (assuming format: ...|transcript)
            texts = []
            for line in lines:
                parts = line.strip().split('|')
                if len(parts) >= 3:
                    texts.append(parts[-1])

            print(f"Loaded {len(texts)} transcripts (fallback method)")
            return texts

    def _encode_non_empty(self, texts: List[str]) -> List[List[int]]:
        """Encode ``texts`` and drop sequences that end up empty."""
        encoded = (self.preprocessor.encode(text) for text in texts)
        return [seq for seq in encoded if seq]

    def run_pipeline(self, data_file: str) -> Dict:
        """Run the complete pipeline on ``data_file``.

        Returns:
            ``self.results`` with a ``comparisons`` entry per configuration
            and, when at least one model produced a finite perplexity, a
            ``best_model`` entry. An empty dict if no data could be loaded.
        """
        print("=" * 80)
        print("FINANCIAL EARNINGS CALL N-GRAM PIPELINE")
        print("=" * 80)

        # Step 1: Load and preprocess data
        print("\n[1] Loading Financial Data")
        print("-" * 80)

        texts = self.load_financial_data(data_file)

        if not texts:
            print("No data loaded. Exiting.")
            return {}

        # Step 2: Preprocessing
        print("\n[2] Preprocessing Financial Text")
        print("-" * 80)

        # Split BEFORE building the vocabulary so that held-out transcripts
        # cannot influence which words are in-vocabulary.
        split_idx = int(len(texts) * (1 - self.config.test_split))
        train_raw = texts[:split_idx]
        test_raw = texts[split_idx:]

        self.preprocessor = FinancialDataPreprocessor(self.config)
        word_counts = self.preprocessor.build_vocabulary(train_raw)

        # Save vocabulary stats
        self._save_vocab_stats(word_counts)

        print("Encoding transcripts...")
        train_texts = self._encode_non_empty(train_raw)
        test_texts = self._encode_non_empty(test_raw)

        encoded_count = len(train_texts) + len(test_texts)
        token_total = (sum(len(seq) for seq in train_texts)
                       + sum(len(seq) for seq in test_texts))
        average_length = token_total / encoded_count if encoded_count else 0.0

        print(f"Total encoded sequences: {encoded_count}")
        print(f"Average sequence length: {average_length:.1f}")
        print(f"Train: {len(train_texts)}, Test: {len(test_texts)}")

        # Step 3: Train and evaluate different models
        print("\n[3] Training and Evaluating Models")
        print("-" * 80)

        self.results['comparisons'] = {}
        # Do not let the winner of a previous run survive into this one.
        self.results.pop('best_model', None)

        # Test different configurations
        configurations = [
            (2, 'backoff'),
            (2, 'laplace'),
            (3, 'backoff'),
            (3, 'kneser_ney'),
        ]

        best_perplexity = float('inf')
        best_model_info = None
        vocab_size = len(self.preprocessor.word_to_id)

        for n, smoothing in configurations:
            config = ModelConfig(
                n=n,
                smoothing=smoothing,
                alpha=self.config.alpha,
                discount=self.config.discount,
                min_frequency=self.config.min_frequency,
                max_vocab_size=self.config.max_vocab_size,
                test_split=self.config.test_split,
            )

            model_key = f"{n}-gram_{smoothing}"
            print(f"\nTraining {model_key}...")

            model = NGramLanguageModel(config)
            model.train(train_texts, vocab_size)

            print(f"Evaluating {model_key}...")
            metrics = model.evaluate(test_texts)

            self.results['comparisons'][model_key] = {
                'model': model,
                'metrics': metrics
            }

            # Track best model
            if metrics['perplexity'] < best_perplexity:
                best_perplexity = metrics['perplexity']
                best_model_info = (model_key, model, metrics)

        # Step 4: Detailed analysis with best model
        print("\n[4] Detailed Analysis with Best Model")
        print("-" * 80)

        if best_model_info:
            best_key, best_model, best_metrics = best_model_info
            self.results['best_model'] = {
                'key': best_key,
                'model': best_model,
                'metrics': best_metrics
            }

            print(f"\nBest Model: {best_key}")
            print(f"Perplexity: {best_metrics['perplexity']:.2f}")
            print(f"Novel n-gram rate: {best_metrics['novel_ngram_rate']:.4f}")

            print("\nGenerating financial text samples:")
            for i in range(3):
                generated = best_model.generate(self.preprocessor, max_length=30, temperature=0.8)
                print(f"  Sample {i+1}: {generated}")

            print("\nPredicting next words from actual context:")
            sample_contexts = [
                "revenue increased by",
                "earnings per share",
                "we are continuing to invest",
            ]

            for context in sample_contexts:
                predictions = best_model.predict_next_words(context, self.preprocessor, top_k=3)
                print(f"\n  Context: '{context}'")
                print(f"  Top predictions:")
                for word, prob in predictions:
                    print(f"    - {word}: {prob:.4f}")

        # Step 5: Create visualizations
        print("\n[5] Creating Visualizations")
        print("-" * 80)
        self._create_visualizations()

        # Step 6: Save results
        print("\n[6] Saving Results")
        print("-" * 80)
        self._save_results()

        print("\n" + "=" * 80)
        print("PIPELINE COMPLETE")
        print("=" * 80)

        return self.results

    def _save_vocab_stats(self, word_counts: Counter):
        """Write vocabulary size and frequency-band statistics to JSON."""
        frequencies = list(word_counts.values())
        stats = {
            'vocab_size': len(self.preprocessor.word_to_id),
            'total_unique_words': len(word_counts),
            'top_30_financial_terms': word_counts.most_common(30),
            'vocab_distribution': {
                '1-5 occurrences': sum(1 for count in frequencies if 1 <= count <= 5),
                '6-20 occurrences': sum(1 for count in frequencies if 6 <= count <= 20),
                '21-100 occurrences': sum(1 for count in frequencies if 21 <= count <= 100),
                '100+ occurrences': sum(1 for count in frequencies if count > 100),
            }
        }

        with open(self.output_dir / 'vocab_stats.json', 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2)

        print("Vocabulary statistics saved")

    @staticmethod
    def _draw_metric_bars(ax, labels, values, colors, ylabel, title, value_format):
        """Draw one labelled bar chart with the value printed above each bar."""
        positions = range(len(labels))
        bars = ax.bar(positions, values, color=colors, edgecolor='black')
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=13)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.grid(axis='y', alpha=0.3)

        for bar, val in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2., bar.get_height(),
                    format(val, value_format), ha='center', va='bottom')

    def _create_visualizations(self):
        """Save a 2x2 comparison figure (bars, scatter, summary table)."""
        comparisons = self.results.get('comparisons')
        if not comparisons:
            return

        models = list(comparisons.keys())
        perplexities = [comparisons[m]['metrics']['perplexity'] for m in models]
        novel_rates = [comparisons[m]['metrics']['novel_ngram_rate'] for m in models]

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('Financial N-Gram Model Comparison', fontsize=16, fontweight='bold')

        # Bigram models in blue, everything else in red.
        colors = ['skyblue' if '2-gram' in m else 'lightcoral' for m in models]

        # Perplexity comparison
        self._draw_metric_bars(
            axes[0, 0], models, perplexities, colors,
            'Perplexity', 'Perplexity by Model (Lower is Better)', '.1f',
        )

        # Novel n-gram rate
        self._draw_metric_bars(
            axes[0, 1], models, novel_rates, colors,
            'Novel N-gram Rate', 'Novel N-gram Rate (Generalization)', '.3f',
        )
        axes[0, 1].set_ylim([0, 1])

        # Relationship between perplexity and novel rate
        ax = axes[1, 0]
        ax.scatter(novel_rates, perplexities, s=100, alpha=0.6, edgecolors='black')
        ax.set_xlabel('Novel N-gram Rate', fontsize=12)
        ax.set_ylabel('Perplexity', fontsize=12)
        ax.set_title('Perplexity vs Novel Rate', fontsize=13)
        ax.grid(True, alpha=0.3)

        for model, rate, perplexity in zip(models, novel_rates, perplexities):
            ax.annotate(model, (rate, perplexity),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)

        # Summary table
        ax = axes[1, 1]
        ax.axis('off')

        table_data = [['Model', 'Perplexity', 'Novel Rate', 'Tokens']]
        for model in models:
            metrics = comparisons[model]['metrics']
            table_data.append([
                model,
                f"{metrics['perplexity']:.1f}",
                f"{metrics['novel_ngram_rate']:.3f}",
                f"{metrics['total_tokens']:,}"
            ])

        table = ax.table(cellText=table_data, cellLoc='center',
                         loc='center', colWidths=[0.3, 0.2, 0.2, 0.2])
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 1.5)

        # Header row dark with white bold text; even data rows light grey.
        for i in range(len(table_data)):
            for j in range(4):
                if i == 0:
                    table[(i, j)].set_facecolor('#40466e')
                    table[(i, j)].set_text_props(weight='bold', color='white')
                elif i % 2 == 0:
                    table[(i, j)].set_facecolor('#f5f5f5')

        plt.tight_layout()
        plt.savefig(self.output_dir / 'model_comparison.png', dpi=150, bbox_inches='tight')
        plt.close()

        print("Visualizations saved")

    def _save_results(self):
        """Write metrics (without model objects) to JSON plus a text report."""
        if 'best_model' in self.results:
            best = self.results.get('best_model', {})
            best_summary = {
                'key': best.get('key', ''),
                'metrics': best.get('metrics', {})
            }
        else:
            best_summary = {}

        results_to_save = {
            'best_model': best_summary,
            'comparisons': {
                model: {'metrics': data['metrics']}
                for model, data in self.results.get('comparisons', {}).items()
            }
        }

        with open(self.output_dir / 'results.json', 'w', encoding='utf-8') as f:
            json.dump(results_to_save, f, indent=2)

        # Save summary report
        self._create_summary_report()

        print(f"\nAll results saved to: {self.output_dir}")

    def _create_summary_report(self):
        """Write a human-readable summary of the best model and all models."""
        report_path = self.output_dir / 'summary_report.txt'

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("FINANCIAL N-GRAM LANGUAGE MODEL - SUMMARY REPORT\n")
            f.write("=" * 80 + "\n\n")

            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            # Best model
            if 'best_model' in self.results:
                best = self.results['best_model']
                f.write("BEST MODEL:\n")
                f.write("-" * 40 + "\n")
                f.write(f"Configuration: {best['key']}\n\n")

                metrics = best['metrics']
                f.write("Performance Metrics:\n")
                f.write(f"  Perplexity: {metrics['perplexity']:.2f}\n")
                f.write(f"  Novel n-gram rate: {metrics['novel_ngram_rate']:.4f}\n")
                f.write(f"  Total test tokens: {metrics['total_tokens']:,}\n")
                f.write(f"  Average log probability: {metrics['avg_log_prob']:.4f}\n\n")

            # All models comparison
            f.write("ALL MODELS COMPARISON:\n")
            f.write("-" * 40 + "\n")

            for model, data in self.results.get('comparisons', {}).items():
                metrics = data['metrics']
                f.write(f"\n{model}:\n")
                f.write(f"  Perplexity: {metrics['perplexity']:.2f}\n")
                f.write(f"  Novel rate: {metrics['novel_ngram_rate']:.4f}\n")
                f.write(f"  Test tokens: {metrics['total_tokens']:,}\n")

            f.write("\n" + "=" * 80 + "\n")

        print("Summary report saved")

### EXECUTION

In [None]:
def main(data_file: str = "/content/train.csv",
         output_dir: str = "./financial_ngram_results"):
    """Run the pipeline end to end and print a final summary.

    Args:
        data_file: Path to the pipe-delimited transcript file. Update the
            default with your actual file path.
        output_dir: Directory that receives plots and reports.

    Returns:
        The results dict produced by ``PipelineManager.run_pipeline``
        (empty when no data could be loaded).
    """
    pipeline = PipelineManager(output_dir=output_dir)
    results = pipeline.run_pipeline(data_file=data_file)

    # Nothing to summarise when the pipeline bailed out early.
    if not results:
        return results

    print("\n" + "=" * 80)
    print("FINAL SUMMARY")
    print("=" * 80)

    if 'best_model' in results:
        best = results['best_model']
        print(f"\nBest Model: {best['key']}")
        print(f"Perplexity: {best['metrics']['perplexity']:.2f}")
        print(f"Novel n-gram rate: {best['metrics']['novel_ngram_rate']:.4f}")

    print("\nModel Comparison:")
    for model, data in results.get('comparisons', {}).items():
        metrics = data['metrics']
        print(f"  {model}: Perplexity={metrics['perplexity']:.2f}, "
              f"Novel={metrics['novel_ngram_rate']:.3f}")

    return results

In [None]:
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

FINANCIAL EARNINGS CALL N-GRAM PIPELINE

[1] Loading Financial Data
--------------------------------------------------------------------------------
Loading data from /content/train.csv...
Loaded 1261521 financial transcripts

Sample transcripts:
  1. I have to say that I'm very encouraged by the underlying momentum across our fra...
  2. Our Fixed Income franchise continued to benefit from strong client engagement ac...
  3. As a sign of our commitment to Mexico, a market where we have real scale and con...

[2] Preprocessing Financial Text
--------------------------------------------------------------------------------
Building vocabulary from financial transcripts...
Raw vocabulary size: 158794
Final vocabulary size: 50004
Top 15 financial terms: [('the', 1632468), ('to', 1071467), ('and', 1026343), ('of', 873625), ('we', 782604), ('in', 707860), ('that', 596183), ('our', 584325), ('a', 553396), ('is', 316647), ('as', 283880), ('for', 280686), ('on', 280509), ('are', 252250), ('have