### Analyze Text With Python
	 
#### Project Overview 💡

In this project, you'll build a simple text analyzer in Python. You'll learn how to process text, count words, sentences, and characters, and identify the most frequent word. This is a great exercise in string manipulation and working with dictionaries.

**Challenge Yourself!**

#### Task:

Write a Python script that takes a user-inputted block of text and analyzes it by calculating the number of characters, words, and sentences. Additionally, determine the most frequently used word and calculate the average word and sentence length.

#### Expected Output:

The program should output text statistics, including:

* Total Characters
* Total Words
* Total Sentences
* Most Frequent Word
* Average Word Length
* Average Sentence Length

In [34]:
import re
import string

class Parser:
    """Regex-only text parser: splits text into sentences and words and counts them.

    No linguistic model is used, so the splitting is purely character based.
    Known limitations: an abbreviation such as "Mr." is treated as the end of a
    sentence, and a contraction such as "It's" is split into two words because
    the apostrophe is part of ``string.punctuation``.
    """

    # One or more sentence terminators plus any whitespace that follows them.
    # Only ".", "!" and "?" end a sentence; commas, colons, hyphens etc. do not.
    _SENTENCE_END = re.compile(r"[.!?]+\s*")

    # Any run of whitespace and/or ASCII punctuation separates two words.
    _WORD_SEP = re.compile(r"[\s" + re.escape(string.punctuation) + r"]+")

    def SentParse(self, paragraph:str) -> list:
        """Split *paragraph* into sentences.

        Args:
            paragraph: The text to split.

        Returns:
            The sentences in order, without their terminating punctuation and
            without surrounding whitespace. Empty pieces (for example the one
            produced after the final terminator) are dropped, so an empty
            paragraph yields an empty list.
        """
        pieces = self._SENTENCE_END.split(paragraph)

        # re.split leaves a trailing '' when the text ends with a terminator;
        # keeping it would make the sentence count one too high.
        return [piece.strip() for piece in pieces if piece.strip()]

    def SentCount(self, sentence_list:list) -> int:
        """Return the number of sentences in *sentence_list*."""
        return len(sentence_list)

    def WordParse(self, paragraph:str) -> list:
        """Split *paragraph* into words.

        Args:
            paragraph: The text to split.

        Returns:
            The non-empty tokens found between runs of whitespace and ASCII
            punctuation, in their original order and casing.
        """
        # Consecutive or leading/trailing delimiters yield '' pieces; skip them.
        return [token for token in self._WORD_SEP.split(paragraph) if token]

    def WordCount(self, words:list) -> int:
        """Return the number of words in *words*."""
        return len(words)
        
"""
    def WordFreq(self, s:str) -> int:

    def AvgWordLen(self, s:str) -> int:

    def AvgSentLen(self, s:str) -> int:
""" 

'\n    def WordFreq(self, s:str) -> int:\n\n    def AvgWordLen(self, s:str) -> int:\n\n    def AvgSentLen(self, s:str) -> int:\n'

In [40]:
!pip install spacy



In [35]:
import string
import spacy
from collections import Counter

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')

class Parser:
    """Text statistics computed with the module-level spaCy pipeline ``nlp``.

    Sentence boundaries and tokens come from spaCy. Word-level statistics are
    computed after deleting every ASCII punctuation character from the text,
    so a contraction such as "Here's" is analysed as "Heres".
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def _tokens(self, paragraph:str) -> list:
        """Return the spaCy tokens of *paragraph* with punctuation removed.

        Whitespace-only tokens are dropped: deleting punctuation can leave
        runs of spaces (e.g. "a - b" becomes "a  b"), which spaCy emits as
        separate tokens that are not words.
        """
        stripped = paragraph.translate(self._PUNCT_TABLE)

        # Nothing but punctuation/whitespace: there are no words to tokenize.
        if not stripped.strip():
            return []

        return [token for token in nlp(stripped) if not token.is_space]

    def _content_words(self, paragraph:str) -> list:
        """Return the lower-cased non-stop-word tokens of *paragraph*."""
        return [
            token.text.lower()
            for token in self._tokens(paragraph)
            if not token.is_stop and not token.is_punct
        ]

    def SentParse(self, paragraph:str) -> list:
        """Return the text of each sentence spaCy detects in *paragraph*."""
        doc = nlp(paragraph)

        return [sent.text for sent in doc.sents]

    def SentCount(self, sentence_list:list) -> int:
        """Return the number of sentences in *sentence_list*."""
        return len(sentence_list)

    def WordParse(self, paragraph:str) -> list:
        """Return the words of *paragraph* in order.

        Punctuation is removed before tokenizing and whitespace-only tokens
        are skipped; stop words are kept. Text without any word yields ``[]``.
        """
        return [token.text for token in self._tokens(paragraph)]

    def WordCount(self, words:list) -> int:
        """Return the number of words in *words*."""
        return len(words)

    def MostFreqWord (self, paragraph:str) -> tuple:
        """Return the most frequent non-stop word and how often it occurs.

        Counting is case-insensitive (words are lower-cased first). Ties are
        resolved by ``Counter.most_common``, i.e. the word seen first wins.

        Args:
            paragraph: The text to analyse.

        Returns:
            A ``(word, count)`` tuple, or ``("", 0)`` when the text contains
            no countable word.
        """
        words = self._content_words(paragraph)

        # most_common(1) is empty for an empty Counter; indexing it would raise.
        if not words:
            return ("", 0)

        return Counter(words).most_common(1)[0]

    def AvgWordLen(self, paragraph:str) -> float:
        """Return the mean length, in characters, of the non-stop words.

        Stop words are excluded, matching the word set used by
        ``MostFreqWord``. Returns ``0.0`` when there is no word to average.
        """
        words = self._content_words(paragraph)

        if not words:
            return 0.0

        return sum(len(word) for word in words) / len(words)

    def AvgSentLen(self, paragraph:str) -> float:
        """Return the mean number of tokens per sentence in *paragraph*.

        Every spaCy token except whitespace is counted, so punctuation marks
        contribute to the total. Returns ``0.0`` for empty input or when no
        sentence is found.
        """
        if not paragraph:
            return 0.0  # Handle empty input

        sentences = list(nlp(paragraph).sents)

        if not sentences:
            return 0.0  # Handle case with no sentences

        total_tokens = sum(
            1 for sent in sentences for token in sent if not token.is_space
        )

        return total_tokens / len(sentences)
            

In [42]:
# Demo: run every Parser statistic on one sample paragraph and print the results.
parser = Parser()

# Sample text using ".", "!" and "?" as sentence terminators, a contraction
# ("Here's"), and an abbreviation ("Mr.") whose period is not a sentence end.
paragraph = r"This is the first sentence. Here's the second sentence! And this is the third? Some sentences might have abbreviations like Mr. Smith."
print(f"Paragraph: {paragraph}")

# Sentences: split first, then count the resulting list.
sentences = parser.SentParse(paragraph)
sent_count = parser.SentCount(sentences)
print(f"The sentence count is {sent_count} sentences.")

# Words: tokenize the punctuation-stripped text, then count the tokens.
words = parser.WordParse(paragraph)
word_count = parser.WordCount(words)
print(f"The word count is {word_count} words.")

# MostFreqWord returns a (word, count) tuple, which is printed as-is.
most_freq_word = parser.MostFreqWord(paragraph)
print(f"The most frequent word is {most_freq_word}")

# Average length in characters, computed over non-stop words only.
avg_word_length = parser.AvgWordLen(paragraph)
print(f"The average word length is {avg_word_length:.2f} characters.")

# Average number of non-whitespace spaCy tokens per sentence.
avg_sentence_length = parser.AvgSentLen(paragraph)
print(f"The average sentence length is {avg_sentence_length:.2f} words.")

Paragraph: This is the first sentence. Here's the second sentence! And this is the third? Some sentences might have abbreviations like Mr. Smith.
The sentence count is 4 sentences.
The word count is 22 words.
The most frequent word is ('sentence', 2)
The average word length is 6.67 characters.
The average sentence length is 6.75 words.
