# In this file we derive linguistic features from the transcripts, such as cue count, sentence length, etc.


### Importing Libraries

In [None]:
import os
import pandas as pd
import re
import spacy

# --- Explanation of Libraries ---
# os: To interact with the operating system, like reading file directories.
# pandas: The primary tool for creating and managing our data in tables (DataFrames).
# re: Stands for Regular Expressions, which we will use to clean text from the transcripts.
# spacy: A powerful Natural Language Processing (NLP) library. We will use it for:
#   - Sentence segmentation (to calculate average sentence length).
#   - Part-of-speech tagging (to identify nouns and pronouns).
#   - Dependency parsing (to analyze sentence structure for parse tree depth).

# --- Download spaCy English Model ---
# If you haven't downloaded the spaCy English model before, you will need to run this command.
# You can run it directly in a notebook cell by adding "!" at the beginning,
# or run it in your terminal.
# !python -m spacy download en_core_web_sm

# --- Load the spaCy Model ---
# We load the model into a variable, commonly named 'nlp'.
# If the model is not installed, 'nlp' is set to None so that the name is
# always defined: code that runs afterwards can test `nlp is None` instead of
# failing with a NameError far away from the real cause.
try:
    # Keep the guarded region minimal: only the load itself can raise here.
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model could not be loaded; stay best-effort (no crash) but make the
    # failure explicit in the value of 'nlp'.
    nlp = None
    print("spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
else:
    # Report success only once the load has actually completed.
    print("spaCy model 'en_core_web_sm' loaded successfully.")

: 