<a href="https://colab.research.google.com/github/Meenalbagare/File-Transfer/blob/main/nlp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Read the contents of the file
with open('/content/Untitled document (1).txt', 'r') as file:
    data = file.read()

# Split the text into paragraphs using double newline characters as delimiters
paragraphs = data.split('\n\n')

# Create a DataFrame
df = pd.DataFrame({'Paragraphs': paragraphs})

# Print the DataFrame
df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Paragraphs
0,﻿John is a 25-year-old male who enjoys playing...
1,"\nSarah, a 30-year-old female, is active in va..."
2,"\nAlex, a 22-year-old non-binary individual, p..."
3,"\nMichael, a 40-year-old male, is an avid cycl..."
4,"\nEmily, a 28-year-old female, enjoys hiking a..."


In [3]:
from nltk.tokenize import word_tokenize

df['Tokens'] = df['Paragraphs'].apply(word_tokenize)
print(df)


                                            Paragraphs  \
0    ﻿John is a 25-year-old male who enjoys playing...   
1    \nSarah, a 30-year-old female, is active in va...   
2    \nAlex, a 22-year-old non-binary individual, p...   
3    \nMichael, a 40-year-old male, is an avid cycl...   
4    \nEmily, a 28-year-old female, enjoys hiking a...   
..                                                 ...   
425  \nWilliam, a 31-year-old wildlife tour guide, ...   
426  \nMia, a 28-year-old eco-fashion influencer, p...   
427  \nOlivia, a 38-year-old sustainability consult...   
428  \nBenjamin, a 46-year-old eco-architect, desig...   
429  \nGrace, a 61-year-old retiree, takes pleasure...   

                                                Tokens  
0    [﻿John, is, a, 25-year-old, male, who, enjoys,...  
1    [Sarah, ,, a, 30-year-old, female, ,, is, acti...  
2    [Alex, ,, a, 22-year-old, non-binary, individu...  
3    [Michael, ,, a, 40-year-old, male, ,, is, an, ...  
4    [Emily, ,, a,

In [9]:
import nltk
import re
from nltk.corpus import stopwords

nltk.download('stopwords')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['Cleaned Paragraphs'] = df['Paragraphs'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def remove_first_word(paragraph):
    # Split the paragraph into words
    words = paragraph.split()
    if len(words) > 1:
        # Remove the first word, which is assumed to be the name
        return ' '.join(words[1:])
    else:
        # If there's only one word, return an empty string
        return ''

df['Cleaned Paragraphs'] = df['Cleaned Paragraphs'].apply(remove_first_word)
print(df)

                                            Paragraphs  \
0    ﻿John is a 25-year-old male who enjoys playing...   
1    \nSarah, a 30-year-old female, is active in va...   
2    \nAlex, a 22-year-old non-binary individual, p...   
3    \nMichael, a 40-year-old male, is an avid cycl...   
4    \nEmily, a 28-year-old female, enjoys hiking a...   
..                                                 ...   
425  \nWilliam, a 31-year-old wildlife tour guide, ...   
426  \nMia, a 28-year-old eco-fashion influencer, p...   
427  \nOlivia, a 38-year-old sustainability consult...   
428  \nBenjamin, a 46-year-old eco-architect, desig...   
429  \nGrace, a 61-year-old retiree, takes pleasure...   

                                                Tokens  \
0    [﻿John, is, a, 25-year-old, male, who, enjoys,...   
1    [Sarah, ,, a, 30-year-old, female, ,, is, acti...   
2    [Alex, ,, a, 22-year-old, non-binary, individu...   
3    [Michael, ,, a, 40-year-old, male, ,, is, an, ...   
4    [Emily, 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned Paragraphs'])


In [13]:
def extract_top_keywords(tfidf_matrix, tfidf_vectorizer, paragraph, top_n=5):
    tfidf_scores = list(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix[paragraph].toarray()[0]))
    top_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:top_n]
    return [keyword[0] for keyword in top_keywords]

df['Top Keywords'] = [extract_top_keywords(tfidf_matrix, tfidf_vectorizer, i) for i in range(len(df))]
print(df)

                                            Paragraphs  \
0    ﻿John is a 25-year-old male who enjoys playing...   
1    \nSarah, a 30-year-old female, is active in va...   
2    \nAlex, a 22-year-old non-binary individual, p...   
3    \nMichael, a 40-year-old male, is an avid cycl...   
4    \nEmily, a 28-year-old female, enjoys hiking a...   
..                                                 ...   
425  \nWilliam, a 31-year-old wildlife tour guide, ...   
426  \nMia, a 28-year-old eco-fashion influencer, p...   
427  \nOlivia, a 38-year-old sustainability consult...   
428  \nBenjamin, a 46-year-old eco-architect, desig...   
429  \nGrace, a 61-year-old retiree, takes pleasure...   

                                                Tokens  \
0    [﻿John, is, a, 25-year-old, male, who, enjoys,...   
1    [Sarah, ,, a, 30-year-old, female, ,, is, acti...   
2    [Alex, ,, a, 22-year-old, non-binary, individu...   
3    [Michael, ,, a, 40-year-old, male, ,, is, an, ...   
4    [Emily, 

In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# ... (the rest of your code)

# Define a custom function to perform POS tagging
def pos_tag(text):
    tokens = word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    return [tag[0] for tag in tags if tag[1].startswith('VB')]
# Define a custom function to perform POS tagging
def pos_tag(text):
    tokens = word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    return [tag[0] for tag in tags if tag[1].startswith('VB')]  # Select words with verb POS tags

# Apply POS tagging to the cleaned paragraphs
df['Verbs'] = df['Cleaned Paragraphs'].apply(lambda text: ' '.join(pos_tag(text)))

# Create a custom TF-IDF vectorizer that considers verbs more important
custom_tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = custom_tfidf_vectorizer.fit_transform(df['Cleaned Paragraphs'])

# Extract top keywords with a focus on verbs
def extract_top_keywords_with_verbs(tfidf_matrix, tfidf_vectorizer, paragraph, top_n=5):
    tfidf_scores = list(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix[paragraph].toarray()[0]))
    top_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Filter the top keywords to include only those identified as verbs
    top_verbs_keywords = [keyword[0] for keyword in top_keywords if keyword[0] in df['Verbs'][paragraph].split()]

    return top_verbs_keywords

# Extract top keywords with a focus on verbs
df['Top Keywords with Verbs'] = [extract_top_keywords_with_verbs(tfidf_matrix, custom_tfidf_vectorizer, i) for i in range(len(df))]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [20]:
print(df)

                                            Paragraphs  \
0    ﻿John is a 25-year-old male who enjoys playing...   
1    \nSarah, a 30-year-old female, is active in va...   
2    \nAlex, a 22-year-old non-binary individual, p...   
3    \nMichael, a 40-year-old male, is an avid cycl...   
4    \nEmily, a 28-year-old female, enjoys hiking a...   
..                                                 ...   
425  \nWilliam, a 31-year-old wildlife tour guide, ...   
426  \nMia, a 28-year-old eco-fashion influencer, p...   
427  \nOlivia, a 38-year-old sustainability consult...   
428  \nBenjamin, a 46-year-old eco-architect, desig...   
429  \nGrace, a 61-year-old retiree, takes pleasure...   

                                                Tokens  \
0    [﻿John, is, a, 25-year-old, male, who, enjoys,...   
1    [Sarah, ,, a, 30-year-old, female, ,, is, acti...   
2    [Alex, ,, a, 22-year-old, non-binary, individu...   
3    [Michael, ,, a, 40-year-old, male, ,, is, an, ...   
4    [Emily, 