## NLP Assignment

Dataset source: the texts in the SHAKESPEARE folder of https://www.kaggle.com/datasets/mylesoneill/classic-literature-in-ascii

In [None]:
import os
import re
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# Read every regular ".txt" file in the corpus folder and concatenate the
# contents into one string, with a newline appended after each file.
folder_path = "./SHAKESPEARE"
documents = []
# Sort the listing: os.listdir returns entries in arbitrary order, and a fixed
# order makes the combined text reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(".txt"):  # only .txt files
        continue
    file_path = os.path.join(folder_path, filename)
    # Skip anything that is not a regular file (e.g. a directory whose name
    # ends in ".txt") so that no stale or undefined content is appended.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as file:
        documents.append(file.read())

# Join once at the end instead of growing the string inside the loop.
text = "".join(document + "\n" for document in documents)

In [None]:
# stopword removal
nltk.download('stopwords')
from nltk.corpus import stopwords


def remove_stopwords(text, language):
    """Split *text* on whitespace and drop the stopwords of *language*.

    Args:
        text: The string to filter.
        language: Name of an NLTK stopword list, e.g. ``'english'``.

    Returns:
        A list of the remaining tokens, in their original order and with
        their original casing.

    The language name and the filtered tokens are also printed.
    """
    stop_words = set(stopwords.words(language))
    # The NLTK stopword lists are lowercase, so compare the lowercased token;
    # otherwise capitalised stopwords such as "The" would be kept.
    filtered_text = [
        word for word in text.split() if word.lower() not in stop_words
    ]
    print(f"Language: {language}")
    print("Filtered Text:", filtered_text)
    return filtered_text


In [None]:
def remove_urls(text):
    """Return *text* with every ``http(s)://...`` or ``www....`` run deleted.

    A URL is taken to extend up to the next whitespace character. Only the
    URL itself is removed; the whitespace around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


# Quick demonstration
ex_text = "I hope this bootcamp is useful for you. You can share it with your friends at https://example.com"
remove_urls(ex_text)

In [None]:
from nltk.stem.porter import PorterStemmer

# One Porter stemmer instance, shared by every call to stem_words
stemmer = PorterStemmer()


def stem_words(text):
    """Split *text* on whitespace and return the Porter stem of each token.

    The stems are returned as a list, in the same order as the tokens.
    """
    return list(map(stemmer.stem, text.split()))


# Quick demonstration
ex_text = 'text preprocessing section in course nlp - deep learning'
stem_words(ex_text)

In [None]:

# Download the NLTK resources needed by the rest of the notebook:
#   - 'punkt' / 'punkt_tab': data for the NLTK sentence and word tokenizers
#   - 'wordnet': the WordNet corpus used by WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


In [None]:
from nltk.stem import WordNetLemmatizer

# One WordNet lemmatizer instance, shared by every call to lemmatize_word
lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Split *text* on whitespace and lemmatize each token as a verb.

    Returns the lemmas as a list, in the same order as the tokens.
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in text.split()]


# Quick demonstration
ex_text = 'text preprocessing section in course nlp - deep learning'
print(lemmatize_word(ex_text))

# Text Preprocessing

In [None]:
# pre_process runs once per sentence over the whole corpus, so the patterns
# are compiled a single time here rather than looked up on every call.
_URL_RE = re.compile(r'http\S+|www\S+', re.IGNORECASE)
_HTML_TAG_RE = re.compile(r'<.*?>')
# Well-formed, upper-case Roman numerals only (e.g. "IV", "XII", "MCMXC").
# The lookahead stops the otherwise all-optional pattern from matching the
# empty string at every word boundary.
_ROMAN_NUMERAL_RE = re.compile(
    r'\b(?=[MDCLXVI])M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})\b'
)
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
# English stopword set, loaded on the first call to pre_process and reused.
_ENGLISH_STOP_WORDS = None


def pre_process(text):
    """Clean one piece of text and return it as a space-joined token string.

    Steps, in order: remove URLs and HTML tags, remove upper-case Roman
    numerals, lowercase, remove punctuation, digits and non-ASCII characters,
    collapse whitespace, tokenize with ``word_tokenize``, drop English
    stopwords, and lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining lemmatized tokens joined by single spaces. The result
        is an empty string when nothing is left after cleaning.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Reading the stopword list is file I/O; do it once, not per sentence.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Remove URLs (case-insensitive, since the text is not lowercased yet)
    text = _URL_RE.sub('', text)

    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)

    # Remove Roman numerals BEFORE lowercasing. Matching numeral letters
    # case-insensitively on lowercased text would also delete ordinary words
    # made only of those letters, such as "ill", "mild", "civil" or "dim".
    text = _ROMAN_NUMERAL_RE.sub('', text)

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = _PUNCTUATION_RE.sub('', text)

    # Remove numbers
    text = _DIGITS_RE.sub('', text)

    # Remove emojis / non-ASCII
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Collapse runs of whitespace left behind by the removals above
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in _ENGLISH_STOP_WORDS]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)


# Text Processing for Natural Language Processing (NLP)

In [None]:
# Break the raw text into sentences with NLTK's sentence tokenizer
sentences_list = nltk.sent_tokenize(text)

# Clean every sentence; both names below refer to the same list of strings
preprocessed_sentences = list(map(pre_process, sentences_list))
corpus = preprocessed_sentences

print(corpus)

In [None]:
# Creating a Bag of Words: count how often each vocabulary term occurs in
# each preprocessed sentence of `corpus`.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Learn the vocabulary from `corpus` and build the sparse count matrix
X = vectorizer.fit_transform(corpus)
# Vocabulary terms, used below as the column labels
feature_names = vectorizer.get_feature_names_out()
# Wrap the sparse matrix in a DataFrame: one column per term, and each row
# labelled with the sentence text it was built from.
X_array = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names, index=corpus)

print("Unique Word List: \n", feature_names)
print()
print("Bag of Words Matrix:")
# Last expression of the cell: displayed as the cell output in a notebook
X_array

In [None]:
# Calculating the product of Term Frequency and Inverse Document Frequency
# (TF-IDF) for every term in every preprocessed sentence of `corpus`.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
# Learn the vocabulary from `corpus` and build the sparse TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
# Vocabulary terms, used below as the column labels
terms = tfidf_vectorizer.get_feature_names_out()
# Wrap the sparse matrix in a DataFrame with one column per term; rows keep
# the default integer index.
df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=terms)

# Last expression of the cell: displayed as the cell output in a notebook
df

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Total count of each term over the whole corpus: sum the bag-of-words matrix
# `X` down its rows and flatten the result to a 1-D array.
word_counts = np.asarray(X.sum(axis=0)).ravel()
# Map each term to its total count
frequencies = dict(zip(vectorizer.get_feature_names_out(), word_counts))

wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(frequencies)
# NOTE: do not pass `vectorizer.vocabulary_` here instead of `frequencies`;
# it maps each term to its column index, not to how often the term occurs.

# Render the cloud as an image with the axes hidden
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Split every preprocessed sentence into lower-cased word tokens with NLTK
tokenized_corpus = [nltk.word_tokenize(doc.lower()) for doc in corpus]
print("Tokenized Corpus:", tokenized_corpus)

# Concatenate the per-sentence token lists into one flat list of words
all_words = [token for tokens in tokenized_corpus for token in tokens]

# The vocabulary is the sorted collection of distinct words
vocab = sorted({*all_words})
print("Vocabulary:", vocab)

# OneHotEncoder needs 2-D input: one row per word occurrence, a single column
word_array = np.array(all_words)[:, np.newaxis]

# Fit the encoder and encode every word occurrence as a sparse one-hot row
one_hot_encoder = OneHotEncoder(sparse_output=True)
one_hot_encoded = one_hot_encoder.fit_transform(word_array)

# Show the encoded result
print("One-hot encoded matrix:\n", one_hot_encoded)


# Word2Vec

In [None]:
import gensim
from gensim.models import Word2Vec
# Word2Vec expects an iterable of token lists, one list per sentence. A raw
# string would be iterated character by character, giving a vocabulary of
# single characters in which 'king' and 'lord' do not exist. Use the
# preprocessed sentences in `corpus` (space-joined tokens) and split them
# back into tokens, skipping sentences that ended up empty.
sentences = [document.split() for document in corpus if document]

# Passing `sentences` to the constructor builds the vocabulary and trains the
# model for `epochs` passes, so no separate train() call is needed; calling
# train() again would only retrain the already-trained models.
# sg=0 selects CBOW, sg=1 selects Skip-Gram.
cbow_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0, alpha=0.03, min_alpha=0.0007, epochs=100)
skipgram_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, alpha=0.03, min_alpha=0.0007, epochs=100)

# Compare the two architectures on the same word pair
word_vectors_cbow = cbow_model.wv
similarity_cbow = word_vectors_cbow.similarity('king', 'lord')
print(f"Similarity between 'king' and 'lord': {similarity_cbow} with CBOW")


word_vectors_skipgram= skipgram_model.wv
similarity_skip = word_vectors_skipgram.similarity('king', 'lord')
print(f"Similarity between 'king' and 'lord': {similarity_skip} with Skip-Gram")