<a href="https://colab.research.google.com/github/MehrdadDastouri/tatoeba_text_analysis/blob/main/tatoeba_text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pandas numpy scikit-learn matplotlib seaborn nltk



In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download required NLTK data.
# 'punkt' is what older NLTK releases load for word_tokenize; NLTK 3.9+ reads
# the tokenizer models from 'punkt_tab' instead, so fetch both to work on
# either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

class TatoebaTextAnalyzer:
    """Analyze and preprocess multilingual text data laid out like Tatoeba.

    The DataFrame-based methods expect a ``text`` column holding sentences and
    a ``language`` column holding language names (the same names used as keys
    of ``self.stopwords``, e.g. ``'english'``).
    """

    def __init__(self):
        # Languages for which stopword removal is available.
        self.languages = ['english', 'spanish', 'french', 'german']
        # language name -> set of stopwords; sets keep membership tests O(1).
        self.stopwords = {lang: set(stopwords.words(lang)) for lang in self.languages}

    def load_sample_data(self) -> pd.DataFrame:
        """Return a four-row sample DataFrame with ``text`` and ``language`` columns."""
        data = {
            'text': [
                'Hello how are you today?',
                'Hola cómo estás hoy?',
                'Bonjour comment allez-vous aujourd\'hui?',
                'Hallo wie geht es dir heute?'
            ],
            'language': ['english', 'spanish', 'french', 'german']
        }
        return pd.DataFrame(data)

    def preprocess_text(self, text: str, language: str) -> str:
        """Lowercase ``text``, strip punctuation and drop stopwords.

        Args:
            text: The raw sentence.
            language: Language name; stopwords are removed only when it is a
                key of ``self.stopwords``, otherwise tokens are kept as-is.

        Returns:
            The remaining tokens joined by single spaces.
        """
        text = text.lower()

        # Delete every character that is neither a word character nor
        # whitespace. NOTE: characters are deleted, not replaced by a space,
        # so "allez-vous" becomes "allezvous".
        text = re.sub(r'[^\w\s]', '', text)

        tokens = word_tokenize(text)

        if language in self.stopwords:
            tokens = [word for word in tokens if word not in self.stopwords[language]]

        return ' '.join(tokens)

    def get_text_statistics(self, text: str) -> Dict:
        """Calculate basic statistics for a text.

        Args:
            text: The text to measure.

        Returns:
            A dict with ``word_count`` (number of tokens), ``char_count``
            (``len(text)``), ``avg_word_length`` (mean token length, ``0.0``
            when there are no tokens) and ``unique_words`` (distinct tokens).
        """
        words = word_tokenize(text)
        word_lengths = [len(word) for word in words]
        return {
            'word_count': len(words),
            'char_count': len(text),
            # np.mean([]) yields NaN plus a RuntimeWarning, so guard the
            # empty case explicitly.
            'avg_word_length': float(np.mean(word_lengths)) if word_lengths else 0.0,
            'unique_words': len(set(words))
        }

    def analyze_language_distribution(self, df: pd.DataFrame) -> pd.DataFrame:
        """Count how many rows each language has.

        Args:
            df: DataFrame with a ``language`` column.

        Returns:
            A DataFrame with columns ``language`` and ``count``, ordered by
            descending count.
        """
        # Name the columns explicitly: a bare value_counts().reset_index()
        # yields different column names depending on the pandas version.
        counts = df['language'].value_counts()
        return counts.rename_axis('language').reset_index(name='count')

    def get_most_common_words(self, texts: List[str], n: int = 10) -> List[Tuple[str, int]]:
        """Get the most common words across all texts.

        Words are extracted with a default ``CountVectorizer``.

        Args:
            texts: Iterable of documents (e.g. a list or a pandas Series).
            n: Maximum number of words to return.

        Returns:
            Up to ``n`` ``(word, count)`` pairs sorted by descending count;
            an empty list when there are no documents, no countable words,
            or ``n`` is not positive.

        Raises:
            ValueError: If ``texts`` is a single string rather than an
                iterable of strings.
        """
        if isinstance(texts, str):
            raise ValueError("texts must be an iterable of strings, not a single string")

        # Materialise first so emptiness can be tested for any iterable
        # (truth-testing a pandas Series directly raises).
        documents = list(texts)
        if not documents or n <= 0:
            return []

        vectorizer = CountVectorizer()
        try:
            X = vectorizer.fit_transform(documents)
        except ValueError as err:
            # Raised when no document contains a countable token.
            if 'empty vocabulary' in str(err):
                return []
            raise

        words = vectorizer.get_feature_names_out()
        # np.asarray(...).ravel() flattens the 1 x V column sums whether
        # .sum() returned an np.matrix or a plain ndarray.
        total_counts = np.asarray(X.sum(axis=0)).ravel()
        # Convert NumPy scalars to the plain str/int promised by the signature.
        word_freq = [(str(word), int(count)) for word, count in zip(words, total_counts)]
        return sorted(word_freq, key=lambda x: x[1], reverse=True)[:n]

    def visualize_language_distribution(self, df: pd.DataFrame):
        """Show a bar chart of the number of rows per language.

        Args:
            df: DataFrame with a ``language`` column.
        """
        plt.figure(figsize=(10, 6))
        sns.countplot(data=df, x='language')
        plt.title('Distribution of Languages in Dataset')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    def visualize_text_lengths(self, df: pd.DataFrame):
        """Show a box plot of text length (in characters) per language.

        Args:
            df: DataFrame with ``text`` and ``language`` columns. It is not
                modified; the length column is added to a copy.
        """
        # assign() returns a new frame, so the caller's DataFrame does not
        # gain a 'text_length' column as a side effect of plotting.
        plot_df = df.assign(text_length=df['text'].str.len())
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=plot_df, x='language', y='text_length')
        plt.title('Distribution of Text Lengths by Language')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

def main():
    """Run the demo: preprocess the sample data, print summaries, show plots."""
    text_analyzer = TatoebaTextAnalyzer()
    df = text_analyzer.load_sample_data()

    # Clean every sentence using the stopword list of its own language.
    df['processed_text'] = [
        text_analyzer.preprocess_text(sentence, lang)
        for sentence, lang in zip(df['text'], df['language'])
    ]

    # Per-sentence statistics on the raw text, summarised with describe().
    per_text_stats = df['text'].map(text_analyzer.get_text_statistics).tolist()
    summary = pd.DataFrame(per_text_stats).describe()
    print("\nText Statistics:")
    print(summary)

    # Number of rows per language.
    print("\nLanguage Distribution:")
    print(text_analyzer.analyze_language_distribution(df))

    # Most frequent words in the cleaned text.
    top_words = text_analyzer.get_most_common_words(df['processed_text'])
    print("\nMost Common Words:")
    for term, frequency in top_words:
        print(f"{term}: {frequency}")

    # Plots: language counts first, then text lengths per language.
    text_analyzer.visualize_language_distribution(df)
    text_analyzer.visualize_text_lengths(df)

# Run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()