In [1]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import contractions

In [8]:
# Raw news-article excerpt used as the sample document for the whole notebook.
# NOTE: the literal opens with four quote characters, so the text itself starts
# with a stray '"'; the continuation lines also carry their source indentation.
input_text = """"An Alabama mother who was killed by her boyfriend in a murder-suicide was shielded by her 10-year-old son,
                 who was seriously injured in the Sunday evening shooting, police said. Police in Sheffield, Ala., identified
                 the fatal victim as 38-year-old Ashley Lynn McClung and her boyfriend as 40-year-old Christopher Narmore, who
                 subsequently died by suicide, Chief Ricky Terry of the Sheffield Police Department confirms to PEOPLE.
                 (A GoFundMe and a memorial set in her name, shared by McClung's loved ones on social media, identify her as
                 Ashley Lynn Newton and Ashley Lynn Newton McClung, respectively.) Her son, who WAFF identified as 10-year-old
                 Cayson McClung citing a public statement by his school principal, was shot in the face and arm as he tried to
                 protect her, according to Terry. Ashley and Narmore were in an argument that “escalated” on Sunday evening,
                 Terry tells PEOPLE. “The boyfriend had a gun and tried to shoot the mother and [Cayson] got into a tussle
                 with the boyfriend,” he says. “He raised his arm up and that’s when the boyfriend shot the juvenile, then
                 he shot the mother, then he killed himself.” ” """

In [13]:
# Swap every hyphen for a space so hyphenated compounds such as "10-year-old"
# end up as separate words once the text is split and tokenized further down.
input_text = input_text.replace('-', ' ')

In [14]:
# Expand contractions one whitespace-delimited token at a time
# (contractions.fix is applied to each token, not to the text as a whole).
expanded_words = [contractions.fix(token) for token in input_text.split()]

# Re-join with single spaces; together with the split() above this flattens the
# newlines and indentation of the original literal into a single line.
expanded_text1 = ' '.join(expanded_words)

In [15]:
# --- Setup -------------------------------------------------------------------
# Sentence-split the contraction-expanded text
sentences = sent_tokenize(expanded_text1)

# One lemmatizer instance reused for every token
lemmatizer = WordNetLemmatizer()

# Leading letter of an NLTK POS tag -> matching WordNet POS constant
pos_tag_map = {
    'N': wordnet.NOUN,
    'V': wordnet.VERB,
    'R': wordnet.ADV,
    'J': wordnet.ADJ
}

# Column buffers: index i across the four lists describes one token
sentence_numbers, words, pos_tags, lemmas = [], [], [], []

# --- Token extraction --------------------------------------------------------
sentence_number = 0  # 1-based counter; stays 0 when there are no sentences
for sentence_number, sentence_text in enumerate(sentences, start=1):
    # Keep only purely alphanumeric tokens, then tag what is left.
    # NOTE: the tagger therefore sees each sentence without its punctuation.
    alnum_tokens = [tok for tok in word_tokenize(sentence_text) if tok.isalnum()]

    for token, tag in pos_tag(alnum_tokens):
        # Tags outside N/V/R/J have no mapping and use the lemmatizer's default POS
        wordnet_pos = pos_tag_map.get(tag[0].upper())
        if wordnet_pos is None:
            lemma_form = lemmatizer.lemmatize(token)
        else:
            lemma_form = lemmatizer.lemmatize(token, wordnet_pos)

        sentence_numbers.append(sentence_number)
        words.append(token)
        pos_tags.append(tag)
        lemmas.append(lemma_form)

# --- Assemble the token table (one row per kept token) -----------------------
data = {
    'Sentence Number': sentence_numbers,
    'Word': words,
    'POS Tag': pos_tags,
    'Lemmatization': lemmas,
}
df = pd.DataFrame(data)

In [16]:
# Preview the first 50 rows of the token table (word, POS tag and lemma per row)
df.head(50)

Unnamed: 0,Sentence Number,Word,POS Tag,Lemmatization
0,1,An,DT,An
1,1,Alabama,NNP,Alabama
2,1,mother,NN,mother
3,1,who,WP,who
4,1,was,VBD,be
5,1,killed,VBN,kill
6,1,by,IN,by
7,1,her,PRP$,her
8,1,boyfriend,NN,boyfriend
9,1,in,IN,in


In [17]:
# Export the token table so it can be labelled. The file is created exclusively
# (mode='x') instead of being overwritten: the 'dataset_Sample.csv' that is read
# back further down carries an 'Entity_tag' column that no cell in this notebook
# produces, so an unconditional rewrite during "Run All" would wipe those labels.
try:
    df.to_csv('dataset_Sample.csv', index=False, mode='x')
except FileExistsError:
    print("dataset_Sample.csv already exists; left untouched. "
          "Delete or rename it to export a fresh copy.")

In [18]:
# Show the contraction-expanded, single-line text that was passed to the sentence tokenizer
print(expanded_text1)

"An Alabama mother who was killed by her boyfriend in a murder suicide was shielded by her 10 year old son, who was seriously injured in the Sunday evening shooting, police said. Police in Sheffield, Ala., identified the fatal victim as 38 year old Ashley Lynn McClung and her boyfriend as 40 year old Christopher Narmore, who subsequently died by suicide, Chief Ricky Terry of the Sheffield Police Department confirms to PEOPLE. (A GoFundMe and a memorial set in her name, shared by McClung's loved ones on social media, identify her as Ashley Lynn Newton and Ashley Lynn Newton McClung, respectively.) Her son, who WAFF identified as 10 year old Cayson McClung citing a public statement by his school principal, was shot in the face and arm as he tried to protect her, according to Terry. Ashley and Narmore were in an argument that “escalated” on Sunday evening, Terry tells PEOPLE. “The boyfriend had a gun and tried to shoot the mother and [Cayson] got into a tussle with the boyfriend,” he says

In [4]:
# Reload the exported token table from disk and preview its first 15 rows.
# NOTE(review): the preview below shows an 'Entity_tag' column that no visible cell
# creates — presumably the CSV was annotated outside this notebook; confirm.
# This rebinds `df`, replacing the frame built in the tokenization cell.
df=pd.read_csv('dataset_Sample.csv')
df.head(15)

Unnamed: 0,Sentence Number,Word,POS Tag,Lemmatization,Entity_tag
0,1,An,DT,An,O
1,1,Alabama,NNP,Alabama,S-VIC
2,1,mother,NN,mother,O
3,1,who,WP,who,O
4,1,was,VBD,be,O
5,1,killed,VBN,kill,O
6,1,by,IN,by,O
7,1,her,PRP$,her,O
8,1,boyfriend,NN,boyfriend,O
9,1,in,IN,in,O
