1. Imports:

're'  :      The regular expression module used for text processing.  


'Counter' from 'collections'  :      A specialized dictionary for counting hashable objects, which simplifies counting word frequencies.

In [58]:
import re
from collections import Counter

2. Function Definition: extract_unigrams(file_path) :

file_path Parameter  : This is the path to the text file from which unigrams (single words) will be extracted.


3. Reading the File :

Opens the file in read mode with UTF-8 encoding and reads the entire content into the text variable.

 4. Tokenizing the Text :

* Converts the text to lowercase to normalize it (making the extraction case-insensitive).
 
* Uses a regular expression (\b\w+\b) to find all words. The \b denotes word boundaries, and \w+ matches sequences of word characters (letters, digits, and underscores).

5. Creating Unigrams :

In this case, the unigrams are just the words extracted from the text since unigrams are essentially single words.


6. Counting Frequencies :

Counter creates a dictionary where the keys are the unigrams and the values are their respective counts (how many times each unigram appears in the text).

7. Error Handling :

Catches specific exceptions like PermissionError (if you don't have permission to access the file) and FileNotFoundError (if the file path is incorrect). The general Exception handler catches any other unexpected errors.

8. Main Execution :

* Sets the path to your text file.
    
* Calls the extract_unigrams function with this path.

* If unigrams are successfully extracted, prints the unigram frequencies.

In [57]:
def extract_unigrams(file_path):
    """Count how often each word (unigram) occurs in a UTF-8 text file.

    The file is read in full, lowercased, and split into runs of word
    characters; each run is one unigram.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``Counter`` mapping each lowercased word to its number of
        occurrences, or ``None`` if the file could not be read or processed
        (in which case a message describing the problem is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        # Lowercasing before tokenizing makes the counts case-insensitive.
        lowered = contents.lower()
        tokens = re.findall(r'\b\w+\b', lowered)

        # Each token is already a unigram, so the token list is counted as-is.
        return Counter(tokens)
    except FileNotFoundError:
        print("File not found. Make sure the file path is correct.")
    except PermissionError:
        print("Permission denied: Cannot access the file. Check if you have read permissions.")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")
    # Reached only after one of the handlers above has reported a failure.
    return None
    # Path to your file
file_path = r'C:\Users\vishn\OneDrive\Desktop\gram\sample.txt'
unigrams = extract_unigrams(file_path)
# extract_unigrams returns None when it could not read the file (it has
# already printed the reason). Compare against None instead of relying on
# truthiness: an empty Counter is falsy, so a file that was read
# successfully but contains no words would otherwise print nothing at all.
if unigrams is not None:
    print(unigrams)




Counter({'quid': 4, 'est': 3, 'ipsum': 2, 'etiam': 2, 'in': 2, 'si': 2, 'quidem': 2, 'id': 2, 'utilitatis': 1, 'causa': 1, 'amicitia': 1, 'quaesita': 1, 'lorem': 1, 'dolor': 1, 'sit': 1, 'amet': 1, 'consectetur': 1, 'adipiscing': 1, 'elit': 1, 'collatio': 1, 'igitur': 1, 'ista': 1, 'te': 1, 'nihil': 1, 'iuvat': 1, 'honesta': 1, 'oratio': 1, 'socratica': 1, 'platonis': 1, 'primum': 1, 'nostrane': 1, 'potestate': 1, 'meminerimus': 1, 'duo': 1, 'reges': 1, 'constructio': 1, 'interrete': 1, 'iucunda': 1, 'memoria': 1, 'praeteritorum': 1, 'malorum': 1, 'inquit': 1, 'tollerem': 1, 'sed': 1, 'relinquo': 1, 'an': 1, 'nisi': 1, 'populari': 1, 'fama': 1, 'quamquam': 1, 'licebit': 1, 'iis': 1, 'existimare': 1, 'qui': 1, 'legerint': 1, 'summum': 1, 'a': 1, 'vobis': 1, 'bonum': 1, 'voluptas': 1, 'dicitur': 1, 'at': 1, 'hoc': 1, 'eo': 1, 'm': 1, 'refert': 1, 'tamen': 1, 'quo': 1, 'modo': 1, 'sequatur': 1, 'repugnet': 1, 'vident': 1, 'iam': 1, 'absurdum': 1, 'maximum': 1, 'malum': 1, 'neglegi': 1})
