All the necessary imports

In [2]:
import spacy
import pandas as pd
import os
from pathlib import Path

In [3]:
# Load Serbian language model
nlp = spacy.load('hr_core_news_lg')

In [26]:
def read_letters(folder_path):
    """
    Reads all letters from the specified folder
    Returns a DataFrame with filename and content
    """
    letters = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                letters.append({
                    'filename': filename,
                    'content': content
                })

    print(f"Loaded {len(letters)} letters.")
    
    return pd.DataFrame(letters)

In [27]:
letters_df = read_letters("letters")

Loaded 12 letters.


### Task 2:
Splitting the letters into sentences and performing the analysis.

In [30]:
def analyze_sentences(df):
    """
    Analyzes sentences in each letter, focusing on length and complexity
    Returns DataFrame with detailed sentence analysis
    """
    sentence_analysis = []
    
    for _, row in df.iterrows():
        doc = nlp(row['content'])
        
        # Analyze each sentence
        for sent in doc.sents:
            # Count words (excluding punctuation)
            word_count = len([token for token in sent if not token.is_punct])
            
            # Calculate average word length (excluding punctuation)
            avg_word_length = sum(len(token.text) for token in sent if not token.is_punct) / word_count if word_count > 0 else 0
            
            sentence_analysis.append({
                'filename': row['filename'],
                'sentence': sent.text,
                'word_count': word_count,
                'avg_word_length': round(avg_word_length, 2),
                'is_long': word_count > 20  # Flag for long sentences
            })
    
    return pd.DataFrame(sentence_analysis)

def get_sentence_statistics(sentence_df):
    """
    Calculates statistical summary of sentence analysis
    """
    stats = {
        'total_sentences': len(sentence_df),
        'avg_sentence_length': sentence_df['word_count'].mean(),
        'long_sentences_count': len(sentence_df[sentence_df['is_long']]),
        'max_sentence_length': sentence_df['word_count'].max(),
        'min_sentence_length': sentence_df['word_count'].min()
    }
    return stats


def analyze_letters_sentences(letters_df):
    """
    Main function to analyze all letters and their sentences
    """
    # Analyze sentences
    sentence_analysis = analyze_sentences(letters_df)
    
    # Get statistics
    stats = get_sentence_statistics(sentence_analysis)
    
    # Get notable sentences (longer than 20 words)
    long_sentences = sentence_analysis[sentence_analysis['is_long']].sort_values('word_count', ascending=False)
    
    return letters_df, sentence_analysis, stats, long_sentences

def print_analysis_summary(stats, long_sentences):
    print("\nSentence Analysis in Andrić's Letters:")
    print("=====================================")
    print(f"Total number of analyzed sentences: {stats['total_sentences']}")
    print(f"Average sentence length: {stats['avg_sentence_length']:.2f} words")
    print(f"Number of long sentences (>20 words): {stats['long_sentences_count']}")
    print(f"Longest sentence has {stats['max_sentence_length']} words")
    print("=====================================")
    
    len_long_sentences = stats['long_sentences_count']
    # check whether long sentences is not empty
    if len_long_sentences != 0:
        # print first min between len of long_sentences and 3
        print("\nExamples of the longest sentences:")
        print("=====================================")

        for _, row in long_sentences.head(min(len_long_sentences, 7)).iterrows():
            print(f"Letter: {row['filename']}")
            print(f"Sentence: {row['sentence']}")
            print(f"Word count: {row['word_count']}")
            print(f"Avg. word length: {row['avg_word_length']}")
            print()



Andrić writes relatively short sentences, there are only around 15% of sentences that have more than 20 words. The average sentence length is 12 words. His writting style is simplistic and easy to read, there are not many unnecessary adjectives in them.

In [32]:
letters_df, sentence_analysis, stats, long_sentences = analyze_letters_sentences(letters_df)
sentences = sentence_analysis['sentence']
print_analysis_summary(stats, long_sentences)


Sentence Analysis in Andrić's Letters:
Total number of analyzed sentences: 82
Average sentence length: 11.72 words
Number of long sentences (>20 words): 12
Longest sentence has 50 words

Examples of the longest sentences:
Letter: letter_2.txt
Sentence: U tome, kako treba dočekati bolest i podnositi Ьоl,
ја bih mogao biti Vaš učenik i primati Vaše savete а ne
Vi moje.
Sad, kad је sve па dobrom putu, nadam se da ćemo
se još ovoga leta videti i porazgovarati, i radujem se
unapred.
Word count: 50
Avg. word length: 3.66

Letter: letter_2.txt
Sentence: Gledaću svakako da dođem па jedan dan u
Zagreb, kad budem polazio па odmor; а dotle, držite se
dobro i budite tvrdi kao uvek, dragi, stari mој Tugomire.
Grli Vas i pozdravlja
Vaš
Ivo

Word count: 40
Avg. word length: 3.73

Letter: letter_5.txt
Sentence: ,
šaljem vam u prilogu 50 franaka i molim vas da
budete ljubazni i da mi pošaljete Antologiju Lirike "Misli"
(poslednje izdanje, sa slikаmа) i Antologiju Ljubavnе Lirike 
od g. Bož. Kovačevića

### Task 3:
Tokenizing the words and removing the stopwords.

In [40]:
def tokenize(sentences):
    """
    Tokenize the input sentences
    
    Parameters:
    sentences (list): List of sentences to tokenize
    
    Returns:
    list: List of lists where each inner list contains tokens for one sentence
    """
    tokenized_sentences = []
    
    for sentence in sentences:
        doc = nlp(sentence)
        # Get all tokens that aren't punctuation or whitespace
        tokens = [token.text.lower() for token in doc 
                if not token.is_punct and not token.is_space]
        tokenized_sentences.append(tokens)
    
    return tokenized_sentences

def remove_stop_words(sentences):
    """
    Remove stop words from tokenized sentences
    
    Parameters:
    sentences (list): List of lists containing tokenized sentences
    
    Returns:
    list: List of lists with stop words removed
    """
    cleaned_sentences = []
    
    for sentence in sentences:
        # Process each token to check if it's a stop word
        cleaned_tokens = [token for token in sentence 
                        if not nlp.vocab[token].is_stop]
        cleaned_sentences.append(cleaned_tokens)
    
    return cleaned_sentences

In [44]:
tokenized_sentences = tokenize(sentences)
cleaned_sentences = remove_stop_words(tokenized_sentences)
print("Tokenized sentence: \n")
print(tokenized_sentences[3])
print("##############################################")
print("After removing stop words: \n")
print(cleaned_sentences[3])

Tokenized sentence: 

['ја', 'sam', 'proveo', 'vrlo', 'rđavu', 'zimu', 'јеr', 'sam', 'dva', 'puta', 'рrеbоlео', 'grip', 'i', 'sada', 'vučem', 'posledice']
##############################################
After removing stop words: 

['ја', 'proveo', 'rđavu', 'zimu', 'јеr', 'dva', 'puta', 'рrеbоlео', 'grip', 'sada', 'vučem', 'posledice']
