In [1]:
!pip install nltk scikit-learn bertopic sentence-transformers


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp311-cp311-win_amd64.whl.metadata (15 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
Downloading hdbscan-0.8.39-cp311-cp311-win_amd64.whl (728 kB)
   ---------------------------------------- 0.0/728.8 kB ? eta -:--:--
   ---------------------------------------- 728.8/728.8 kB 5.0 MB/s eta 0:00:00
Installing collected packages: hdbscan, bertopic
Successfully installed bertopic-0.16.4 hdbscan-0.8.39



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import nltk

# NLTK data packages used by the tokenizer, stopword list, lemmatizer,
# POS tagger and named-entity chunker in this notebook.
# Both the legacy names and their newer '_tab' / '_eng' counterparts are
# fetched so the notebook works across NLTK versions; 'punkt_tab' is the
# tokenizer resource that goes with the '_eng' / '_tab' tagger and chunker
# packages already listed here.
NLTK_PACKAGES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
    'wordnet',
)

for package in NLTK_PACKAGES:
    nltk.download(package)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ranah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-

True

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer


In [4]:
def clean_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stopwords are dropped; each surviving
    token is lemmatized with the WordNet lemmatizer.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or the
            float NaN that pandas produces for an empty CSV cell) is treated
            as empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when the
        input is not a string or no token survives the filtering.
    """
    # Missing values arrive as non-strings and have no .lower(); there is
    # nothing to clean, so return an empty document.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter first, then lemmatize, so stopword matching sees the raw token.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    )

In [5]:
def pos_tagging(text):
    """Tokenize ``text`` and tag every token with its part of speech.

    Returns:
        The result of ``pos_tag`` on the tokenized text: a list of
        ``(word, POS tag)`` tuples.
    """
    return pos_tag(word_tokenize(text))


In [6]:
def named_entity_recognition(text):
    """Run the tokenize -> POS-tag -> NE-chunk pipeline on ``text``.

    Returns:
        The tree produced by ``ne_chunk`` for the POS-tagged tokens, in which
        named entities appear as subtrees.
    """
    tagged_words = pos_tag(word_tokenize(text))
    entity_tree = ne_chunk(tagged_words)
    return entity_tree


In [7]:
def calculate_tfidf(corpus, max_features=500, stop_words='english'):
    """Fit a TF-IDF vectorizer on ``corpus`` and return the weighted matrix.

    Args:
        corpus: Iterable of documents (strings) to vectorize.
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``. Defaults to 500, the value this function
            previously hard-coded.
        stop_words: Stopword setting passed to ``TfidfVectorizer``. Defaults
            to ``'english'``, the value this function previously hard-coded.

    Returns:
        tuple: ``(tfidf_matrix, feature_names)`` where ``tfidf_matrix`` is the
        result of ``fit_transform`` (one row per document) and
        ``feature_names`` is the vocabulary reported by
        ``get_feature_names_out`` (one entry per matrix column).
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names


In [17]:
import pandas as pd
from pathlib import Path

# Build the path from components so it resolves on any OS; the previous
# literal used Windows-only backslash separators.
TWEETS_CSV = Path('tweets') / '2024-11-07_20-20-07_tweets_1-603.csv'
df = pd.read_csv(TWEETS_CSV)

# Empty CSV cells load as NaN (a float). Normalise them to '' so every
# downstream tokenizer call receives a string.
tweets = df['Content'].fillna('').astype(str)

# Cleaned tweets (lower-cased, stopwords removed, lemmatized) feed TF-IDF.
cleaned_tweets = [clean_text(tweet) for tweet in tweets]

# POS tagging runs on the raw text: the tagger relies on capitalization and
# on the function words that cleaning strips out.
pos_tagged_tweets = [pos_tagging(tweet) for tweet in tweets]

# Named entity recognition also runs on the raw text, for the same reason:
# lower-casing removes the proper-noun cues the chunker depends on.
ner_tweets = [named_entity_recognition(tweet) for tweet in tweets]

# Calculate TF-IDF
tfidf_matrix, feature_names = calculate_tfidf(cleaned_tweets)

print("Cleaned Tweets:", cleaned_tweets)
print("POS Tagged:", pos_tagged_tweets)
print("Named Entities:", ner_tweets)
print("TF-IDF Matrix:", tfidf_matrix.toarray())


TF-IDF Matrix: [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.25821492 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
