In [15]:
import pandas as pd
import nltk
import os
from nltk.data import load
from nltk import pos_tag, word_tokenize
from collections import Counter

# Fetch the NLTK resources this cell relies on: the POS tagger model,
# the tokenizer data, and the tagset help files. The download call is issued
# on every run, in this fixed order.
for nltk_resource in ('averaged_perceptron_tagger', 'punkt', 'tagsets'):
    nltk.download(nltk_resource)

# Step 1: Make sure the input CSV is available.
# The data folder is resolved relative to the current working directory:
# <cwd>/../data (created if it is missing).
base_folder = os.path.join(os.getcwd(), "..", "data")
os.makedirs(base_folder, exist_ok=True)
csv_path = os.path.join(base_folder, "data.csv")

# Seed the file with a handful of sample sentences, but only when no
# data.csv is present, so an existing file is never overwritten.
if not os.path.exists(csv_path):
    sample_sentences = [
        "I love programming in Python.",
        "AI is the future of technology.",
        "Data Science is exciting and useful.",
        "Natural Language Processing is amazing.",
        "Machine Learning can solve complex problems.",
    ]
    sample_data = pd.DataFrame({'text': sample_sentences})
    sample_data.to_csv(csv_path, index=False)
    print(" data.csv created successfully!")

# Step 2: Load the CSV; its first row is used as the column header.
data = pd.read_csv(csv_path, header=0)
print(" Loaded CSV:")
print(data.head())

#  Step 3: POS Tagging Functions
def get_tagsets():
    """Return the tag names defined in NLTK's ``upenn_tagset`` help resource.

    The resource ``help/tagsets/upenn_tagset.pickle`` is loaded through
    ``nltk.data.load`` and its keys are returned as a list, in the order
    the loaded mapping yields them.
    """
    upenn_tags = load('help/tagsets/upenn_tagset.pickle')
    return [*upenn_tags.keys()]

def get_pos_occurrence_freq(df, tag_list):
    """Count how often each POS tag occurs in every row of ``df['text']``.

    Each cell is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; the tags are then tallied per row.

    Args:
        df: DataFrame with a ``'text'`` column holding the sentences.
        tag_list: Iterable of POS tag names; one output column is produced
            per tag, in this order. Tags emitted by the tagger that are not
            in ``tag_list`` are ignored.

    Returns:
        A new DataFrame with one row per input row (default integer index)
        and one integer column per tag holding that tag's count.

    Raises:
        KeyError: If ``df`` has no ``'text'`` column.
    """
    per_row_counts = []
    for sentence in df['text']:
        # Empty CSV fields come back from pandas as NaN/None, which the
        # tokenizer cannot handle; count them as empty text (an all-zero
        # row) so the output stays aligned with the input rows.
        if isinstance(sentence, str):
            text = sentence
        elif pd.isna(sentence):
            text = ""
        else:
            # Other scalars (e.g. a purely numeric cell) are tagged by
            # their string form.
            text = str(sentence)

        tagged_tokens = pos_tag(word_tokenize(text))
        per_row_counts.append(Counter(tag for _, tag in tagged_tokens))

    # Counter returns 0 for tags that did not occur in a row. Building the
    # columns here (rather than appending per tag) also keeps column lengths
    # equal when tag_list contains a duplicated tag.
    return pd.DataFrame({
        tag: [row_counts[tag] for row_counts in per_row_counts]
        for tag in tag_list
    })

# Step 4: Build the POS count features for the loaded sentences.
# Columns follow the order of the tag names returned by get_tagsets().
tag_list = get_tagsets()
feature_df = get_pos_occurrence_freq(df=data, tag_list=tag_list)

# Show the first rows of the resulting feature table.
print("\n POS Features Extracted:", feature_df.head(), sep="\n")


 Loaded CSV:
                                           text
0                 I love programming in Python.
1               AI is the future of technology.
2          Data Science is exciting and useful.
3       Natural Language Processing is amazing.
4  Machine Learning can solve complex problems.

 POS Features Extracted:
   PRP$  VBG  FW  VB  POS  ''  VBP  VBN  JJ  WP  ...  IN  WP$  MD  NNPS  --  \
0     0    1   0   0    0   0    1    0   0   0  ...   1    0   0     0   0   
1     0    0   0   0    0   0    0    0   0   0  ...   1    0   0     0   0   
2     0    1   0   0    0   0    0    0   1   0  ...   0    0   0     0   0   
3     0    0   0   0    0   0    0    0   2   0  ...   0    0   0     0   0   
4     0    0   0   1    0   0    0    0   1   0  ...   0    0   1     0   0   

   JJS  JJR  SYM  UH  WDT  
0    0    0    0   0    0  
1    0    0    0   0    0  
2    0    0    0   0    0  
3    0    0    0   0    0  
4    0    0    0   0    0  

[5 rows x 45 columns]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
