In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the NLTK data this notebook needs. nltk.download() reports
# "already up-to-date" for packages that are installed, so re-running is safe.
#   punkt      - tokenizer models looked up by word_tokenize on older NLTK
#   punkt_tab  - the replacement resource that word_tokenize looks up on
#                newer NLTK releases (3.9 and later); without it a fresh
#                environment fails with LookupError at the first tokenize call
#   stopwords  - the English stop-word list
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Optional: set of common English verb forms to exclude from keyword counts.
# Each string below holds one verb family (base form plus inflections);
# the families are concatenated and split on whitespace to build the set.
common_verbs = set(
    (
        "be am is are was were been being "                    # to be
        "have has had having "                                 # to have
        "do does did doing "                                   # to do
        "will would shall should can could may might must "    # modals
        "go goes went gone going "                             # to go
        "get gets got gotten getting "                         # to get
        "make makes made making "                              # to make
        "say says said saying "                                # to say
        "see saw seen seeing"                                  # to see
    ).split()
)

def clean_and_count_to_df(text: str) -> pd.DataFrame:
    """Count keyword frequencies in ``text`` and return them as a DataFrame.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is replaced by a space, and the result is tokenized with
    NLTK's ``word_tokenize``. Tokens that appear in the NLTK English
    stop-word list or in the module-level ``common_verbs`` collection are
    discarded before counting.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Keyword`` and ``Count``, one row per distinct
        remaining token, sorted by ``Count`` in descending order with a
        fresh 0-based index. Keywords with equal counts keep the order in
        which they first appeared in ``text``. The frame has zero rows when
        no token survives filtering.
    """
    # Substitute a space (instead of deleting) so that words separated only
    # by punctuation or digits stay separate: "and/or" -> "and or" rather
    # than "andor", and "don't" -> "don t", whose pieces are both stop words.
    normalized = re.sub(r'[^a-z\s]', ' ', text.lower())

    tokens = word_tokenize(normalized)

    # A single exclusion set gives one membership test per token.
    excluded = set(stopwords.words('english')).union(common_verbs)
    word_counts = Counter(token for token in tokens if token not in excluded)

    df = pd.DataFrame(list(word_counts.items()), columns=['Keyword', 'Count'])

    # Counter preserves first-insertion order; a stable sort keeps that order
    # among equal counts, so the ranking is reproducible between runs.
    df = df.sort_values(by='Count', ascending=False, kind='stable')
    return df.reset_index(drop=True)

# Example usage: run the keyword counter on a short placeholder text.
text = (
    "Your long input text goes here. "
    "It can be a full document, article, or anything."
)
df_result = clean_and_count_to_df(text)

# Show the ten most frequent keywords (fewer if the text has fewer).
print(df_result.head(n=10))


    Keyword  Count
0      long      1
1     input      1
2      text      1
3      full      1
4  document      1
5   article      1
6  anything      1


[nltk_data] Downloading package punkt to /Users/jayasri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayasri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df_result

Unnamed: 0,Keyword,Count
0,long,1
1,input,1
2,text,1
3,full,1
4,document,1
5,article,1
6,anything,1
