<a href="https://colab.research.google.com/github/JoeGoldberg09/eeee/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# One-hot encoding


In [None]:
# Load the three source documents into memory
filenames = ['tech1.txt', 'tech2.txt', 'tech3.txt']
texts = []
for path in filenames:
    with open(path, 'r', encoding='utf-8') as handle:
        texts.append(handle.read())

# Cut every document at '.' and keep only the trimmed, non-blank pieces
sentences = [
    piece.strip()
    for document in texts
    for piece in document.split('.')
    if piece.strip()
]

# Vocabulary: every distinct whitespace-separated token, sorted so that each
# word always maps to the same vector position
vocab = sorted({token for sentence in sentences for token in sentence.split()})
word_to_index = dict(zip(vocab, range(len(vocab))))

# One vector per sentence: position i is 1 when vocab[i] occurs in the
# sentence and 0 otherwise (repeated words still yield a single 1)
one_hot_encodings = []
for sentence in sentences:
    present = set(sentence.split())
    one_hot_encodings.append([1 if word in present else 0 for word in vocab])

# Display sample
print("Vocabulary:", vocab)
print("\nFirst sentence:", sentences[0])
print("One-hot encoding for first sentence:", one_hot_encodings[0])


# Bag of Words

In [None]:
# Load the three source documents into memory
filenames = ['movie1.txt', 'movie2.txt', 'movie3.txt']
texts = []
for path in filenames:
    with open(path, 'r', encoding='utf-8') as handle:
        texts.append(handle.read())

# Vocabulary: every distinct whitespace-separated token across all documents,
# sorted so that each word always maps to the same column
vocab = sorted({token for document in texts for token in document.split()})
word_to_index = dict(zip(vocab, range(len(vocab))))

# Count matrix: one row per document, one column per vocabulary word
bow_matrix = []
for document in texts:
    counts = [0] * len(vocab)
    for token in document.split():
        # Every token is in the vocabulary because it was built from these texts
        counts[word_to_index[token]] += 1
    bow_matrix.append(counts)

# Display sample
print("Vocabulary:", vocab)
print("\nBag of Words matrix:")
for row in bow_matrix:
    print(row)


# TF-IDF


In [None]:
import math
import os
import re

def read_file(filepath):
    """Return the entire content of the file at ``filepath``, decoded as UTF-8."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return content

def tokenize(text):
    """Lowercase ``text``, keep only a-z and whitespace, and split into words.

    Every other character (digits, punctuation, non-ASCII letters) is deleted
    outright rather than replaced by a space, so "it's" becomes "its".
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

def compute_tf(tokens):
    """Return each token's relative frequency within ``tokens``.

    The result maps token -> occurrences / len(tokens), in order of first
    appearance. An empty token list yields an empty dict.
    """
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    n_tokens = len(tokens)
    # No division happens when ``counts`` is empty, so n_tokens == 0 is safe
    return {token: count / n_tokens for token, count in counts.items()}

def compute_idf(documents):
    """Compute a smoothed inverse document frequency for every token.

    Args:
        documents: list of token lists, one list per document.

    Returns:
        dict mapping each distinct token to ``log(N / (1 + df)) + 1``, where
        ``N`` is the number of documents and ``df`` is the number of documents
        that contain the token. Empty when no document has any token.
    """
    total_documents = len(documents)

    # Count, for each token, how many documents contain it. Each document is
    # deduplicated once, so this is a single pass over the corpus instead of a
    # list scan per (token, document) pair.
    doc_freq = {}
    for tokens in documents:
        for word in set(tokens):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # The "1 + df" in the denominator smooths the ratio; the trailing "+ 1"
    # offsets the logarithm, which is negative for a token that appears in
    # every document (N / (1 + N) < 1).
    return {
        word: math.log(total_documents / (1 + df)) + 1
        for word, df in doc_freq.items()
    }

def compute_tf_idf(tf, idf):
    """Weight one document's term frequencies by the corpus IDF.

    Returns a dict with the same keys as ``tf``; a term missing from ``idf``
    is given an IDF of 0.
    """
    return {term: frequency * idf.get(term, 0) for term, frequency in tf.items()}

def print_top_terms(tf_idf, top_n=10):
    """Print the ``top_n`` highest-scoring terms, one ``term: score`` per line.

    Scores are shown with four decimals and a blank line is printed after the
    list. Terms with equal scores keep their order from ``tf_idf``.
    """
    ranked = list(tf_idf.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for term, weight in ranked[:top_n]:
        print(f"{term}: {weight:.4f}")
    print()

def main():
    """Run the TF-IDF demo on three text files and print each document's top terms."""
    # The three input files are looked up relative to the working directory
    filepaths = ['place1.txt', 'place2.txt', 'place3.txt']

    # One token list per file
    documents = []
    for filepath in filepaths:
        documents.append(tokenize(read_file(filepath)))

    # IDF is a corpus-wide quantity, so it is computed once from all documents
    idf = compute_idf(documents)

    # Per-document TF, weighted by the shared IDF
    tf_idfs = [compute_tf_idf(compute_tf(tokens), idf) for tokens in documents]

    # Report the ten highest-scoring terms of each document
    for i, scores in enumerate(tf_idfs):
        print(f"Top terms for Document {i+1}:")
        print_top_terms(scores)


if __name__ == "__main__":
    main()


- Clean the text by removing punctuation/special characters, numbers and extra white space, using regular expressions.
- Convert the text to lowercase.
- Tokenize the text.
- Remove stop words.
- Correct misspelled words.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob

# Fetch the NLTK data used below. Recent NLTK releases load the tokenizer
# models from 'punkt_tab' rather than 'punkt', so both are requested to keep
# nltk.word_tokenize working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# --- Functions ---

def read_file(filepath):
    """Return the full text of ``filepath``, decoded as UTF-8."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def clean_text(text):
    """Reduce ``text`` to lowercase ASCII letters separated by single spaces.

    Every character other than a-z, A-Z and whitespace is deleted (not
    replaced by a space), runs of whitespace are collapsed to one space,
    leading/trailing whitespace is dropped, and the result is lowercased.

    Args:
        text: raw input string.

    Returns:
        The cleaned string; empty when ``text`` contains no letters.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation, numbers
    # Collapsing alone leaves one space at either end when the input starts or
    # ends with whitespace (or with removed characters next to whitespace),
    # so strip as well.
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()
    return collapsed.lower()

def tokenize(text):
    """Split ``text`` into tokens using ``nltk.word_tokenize``."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

def remove_stopwords(tokens):
    """Return ``tokens`` without NLTK's English stop words, order preserved.

    Matching is exact (case-sensitive) against the stop-word list.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token not in english_stops:
            kept.append(token)
    return kept

def correct_spelling(tokens):
    """Return the tokens after passing each one through TextBlob's ``correct()``.

    Each token is corrected on its own, wrapped in a separate ``TextBlob``.
    """
    return [str(TextBlob(token).correct()) for token in tokens]

# --- Main Program ---

# Run the preprocessing steps in order on a single input file. Each step
# consumes the previous step's result, so the statement order matters.
# 'your_text_file.txt' is a placeholder path, resolved against the working
# directory.
text = read_file('your_text_file.txt')
# Letters and single spaces only, lowercased
cleaned_text = clean_text(text)
# String -> list of word tokens
tokens = tokenize(cleaned_text)
# Drop English stop words
filtered_tokens = remove_stopwords(tokens)
# Spell-correct each remaining token independently
final_tokens = correct_spelling(filtered_tokens)

# Show the fully preprocessed token list
print(final_tokens)


- Clean the text by removing punctuation/special characters, numbers and extra white space, using regular expressions.
- Convert the text to lowercase.
- Apply stemming and lemmatization.
- Create a list of 3 consecutive words (trigrams) after lemmatization.

In [None]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ngrams
from spellchecker import SpellChecker

# Download necessary NLTK resources. Recent NLTK releases load the tokenizer
# models from 'punkt_tab' rather than 'punkt', so both are requested to keep
# sent_tokenize / word_tokenize working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def read_file(file_path):
    """Return the whole content of ``file_path``, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as source:
        return source.read()

def clean_text(text):
    """Remove punctuation, special characters, numbers, and extra white spaces.

    Removed characters are deleted outright (not replaced by a space). Case
    is left untouched.

    Args:
        text: raw input string.

    Returns:
        The cleaned string with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Remove punctuation and special characters. '_' counts as a word
    # character for \w, so it has to be listed explicitly to be removed.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra white spaces
    return re.sub(r'\s+', ' ', text).strip()

def convert_to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

def tokenize_text(text):
    """Return ``(sentences, words)`` for ``text``.

    Sentences come from ``sent_tokenize`` and words from ``word_tokenize``,
    both applied to the same input string.
    """
    return sent_tokenize(text), word_tokenize(text)

def remove_stopwords(words):
    """Return ``words`` without NLTK's English stop words, order preserved.

    Matching is exact (case-sensitive) against the stop-word list.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in words:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

def correct_spelling(words):
    """Replace words unknown to the spell checker with its suggested correction.

    A word the checker already contains is kept unchanged. An unknown word is
    replaced by ``SpellChecker.correction(word)``; if that yields nothing
    (``None`` or an empty value) the original word is kept.
    """
    checker = SpellChecker()

    def fix(token):
        # Known words need no lookup of candidate corrections
        if token in checker:
            return token
        return checker.correction(token) or token

    return [fix(token) for token in words]

def perform_stemming(words):
    """Return the Porter stem of every word, in input order."""
    stem = PorterStemmer().stem
    return list(map(stem, words))

def perform_lemmatization(words):
    """Lemmatize each word with WordNet, treating every word as a verb.

    ``pos='v'`` is passed for all words, so no per-word part-of-speech
    detection takes place.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet_lemmatizer.lemmatize(token, pos='v'))
    return lemmas

def create_trigrams(words):
    """Return the trigrams of ``words`` (runs of 3 consecutive items) as a list.

    The trigrams are produced by ``nltk.ngrams(words, 3)``.
    """
    return [gram for gram in ngrams(words, 3)]

def main():
    """Run the full preprocessing demo on ``sample_text.txt``, printing each stage.

    Stages: read, clean, lowercase, tokenize, stop-word removal, spelling
    correction, stemming, lemmatization, trigrams. Stemming and lemmatization
    both start from the spell-corrected words; trigrams are built from the
    lemmatized words.
    """
    # File path - replace with your actual file path
    file_path = "sample_text.txt"

    # Read the file
    text = read_file(file_path)
    print("Original text (first 200 characters):")
    print(text[:200] + "...\n")

    # Task a: Text cleaning
    cleaned_text = clean_text(text)
    print("Text after cleaning (first 200 characters):")
    print(cleaned_text[:200] + "...\n")

    # Task b: Convert to lowercase
    lowercase_text = convert_to_lowercase(cleaned_text)
    print("Text after converting to lowercase (first 200 characters):")
    print(lowercase_text[:200] + "...\n")

    # Task c: Tokenization
    # clean_text has already deleted the punctuation that sentence splitting
    # relies on, so the cleaned text would come back as a single "sentence".
    # Sentences are therefore taken from the raw text; words still come from
    # the cleaned, lowercased text.
    sentences = sent_tokenize(text)
    _, words = tokenize_text(lowercase_text)
    print(f"Number of sentences: {len(sentences)}")
    print(f"Number of words: {len(words)}")
    print("First 10 words:")
    print(words[:10])
    print()

    # Task d: Remove stop words
    filtered_words = remove_stopwords(words)
    print(f"Number of words after removing stop words: {len(filtered_words)}")
    print("First 10 words after removing stop words:")
    print(filtered_words[:10])
    print()

    # Task e: Correct misspelled words
    corrected_words = correct_spelling(filtered_words)
    print("First 10 words after spell correction:")
    print(corrected_words[:10])
    print()

    # Stemming
    stemmed_words = perform_stemming(corrected_words)
    print("First 10 words after stemming:")
    print(stemmed_words[:10])
    print()

    # Lemmatization (applied to the corrected words, not to the stems)
    lemmatized_words = perform_lemmatization(corrected_words)
    print("First 10 words after lemmatization:")
    print(lemmatized_words[:10])
    print()

    # Create trigrams
    trigrams = create_trigrams(lemmatized_words)
    print("First 5 trigrams after lemmatization:")
    print(trigrams[:5])

if __name__ == "__main__":
    main()
