In [None]:
# text_prediction.py

import numpy as np
import random
from collections import defaultdict
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import gutenberg

# Make sure the NLTK Gutenberg corpus is available locally before the demo
# below reads from it. This call runs every time the file is executed.
nltk.download('gutenberg')

# ---------- Traditional ML: Markov Chain (n-gram) ----------
class MarkovChainPredictor:
    """Word-level n-gram (Markov chain) next-word predictor.

    The model maps every tuple of ``n`` consecutive whitespace-separated
    tokens to the list of tokens seen immediately after it during training.
    Followers are stored with repetition, so drawing uniformly from a list
    samples in proportion to the observed counts.
    """

    def __init__(self, n=2):
        """Create an empty model that conditions on the previous ``n`` tokens.

        Raises:
            ValueError: if ``n`` is smaller than 1 (``seq[-0:]`` would select
                the whole sequence instead of an empty context).
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        self.n = n
        self.model = defaultdict(list)

    def train(self, text):
        """Add the n-gram transitions found in ``text`` to the model.

        Args:
            text: training text; it is tokenised with ``str.split()``.

        Returns:
            The list of tokens, so callers can reuse it for evaluation.
        """
        tokens = text.split()
        for start in range(len(tokens) - self.n):
            context = tuple(tokens[start:start + self.n])
            self.model[context].append(tokens[start + self.n])
        return tokens

    def predict(self, seed_text, num_words=10):
        """Extend ``seed_text`` by up to ``num_words`` sampled tokens.

        Generation stops early when the current context was never seen in
        training (this includes seeds shorter than ``n`` tokens). Appending a
        placeholder instead would only add trailing spaces, and the chain
        could never recover from it.

        Returns:
            The seed tokens followed by the generated tokens, space-joined.
        """
        result = seed_text.split()
        for _ in range(num_words):
            # .get() avoids inserting empty entries into the defaultdict.
            followers = self.model.get(tuple(result[-self.n:]))
            if not followers:
                break
            result.append(random.choice(followers))
        return " ".join(result)

    def evaluate_accuracy(self, tokens, num_predictions=1000):
        """Return the fraction of sampled next-token predictions that match.

        For each of the first ``num_predictions`` positions in ``tokens`` a
        follower is drawn at random for the preceding ``n``-token context and
        compared with the actual next token. Contexts absent from the model
        count as misses. The result is stochastic because predictions are
        sampled, not arg-maxed.

        Returns:
            Accuracy in ``[0, 1]``, or ``0`` when there is nothing to score.
        """
        # Clamp at zero: len(tokens) - n is negative for very short inputs.
        total = max(0, min(num_predictions, len(tokens) - self.n))
        correct = 0
        for start in range(total):
            followers = self.model.get(tuple(tokens[start:start + self.n]))
            if followers and random.choice(followers) == tokens[start + self.n]:
                correct += 1
        return correct / total if total else 0

# ---------- Deep Learning: LSTM ----------
class LSTMTextPredictor:
    """Next-word predictor built from Embedding -> LSTM -> softmax layers.

    Text is converted to integer ids by a Keras ``Tokenizer`` limited to
    ``vocab_size`` words; the network sees the previous ``max_len - 1`` ids
    and outputs a distribution over ``vocab_size`` ids.
    """

    def __init__(self, vocab_size=10000, max_len=20):
        """Build and compile the (untrained) model.

        Args:
            vocab_size: tokenizer word limit and size of the output layer.
            max_len: length of each training window, target id included.
        """
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.max_len = max_len
        self.model = Sequential([
            Embedding(vocab_size, 128),
            LSTM(128),
            Dense(vocab_size, activation='softmax')
        ])
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    @staticmethod
    def _prefix_windows(tokens, max_len):
        """Return each prefix of ``tokens`` (length >= 2) clipped to its last ``max_len`` ids."""
        return [tokens[max(0, end - max_len):end] for end in range(2, len(tokens) + 1)]

    def train(self, text):
        """Fit the tokenizer and the network on ``text``.

        Each newline-separated line yields one training example per token
        after the first: the ids that precede it are the input and the token
        itself is the target.

        Returns:
            The training accuracy reported for the final epoch.

        Raises:
            ValueError: if no line contains at least two known tokens.
        """
        lines = text.split('\n')
        self.tokenizer.fit_on_texts(lines)
        sequences = []
        for token_ids in self.tokenizer.texts_to_sequences(lines):
            # Clip every prefix before storing it. pad_sequences would discard
            # everything but the last max_len ids anyway (it truncates from the
            # front by default), and keeping whole prefixes is quadratic in the
            # line length, which exhausts memory on a long single-line corpus.
            sequences.extend(self._prefix_windows(token_ids, self.max_len))
        if not sequences:
            raise ValueError("text must contain a line with at least two known tokens")
        sequences = pad_sequences(sequences, maxlen=self.max_len, padding='pre')
        X, y = sequences[:, :-1], sequences[:, -1]
        history = self.model.fit(X, y, epochs=5, verbose=0)
        return history.history['accuracy'][-1]  # Return final training accuracy

    def predict(self, seed_text, num_words=10):
        """Greedily append up to ``num_words`` predicted words to ``seed_text``.

        Generation stops early when the most likely id has no word attached
        (for example the padding id 0). The input would be unchanged on the
        next step, so continuing could only append more blanks.
        """
        for _ in range(num_words):
            token_seq = self.tokenizer.texts_to_sequences([seed_text])[0]
            token_seq = pad_sequences([token_seq], maxlen=self.max_len-1, padding='pre')
            pred_id = int(np.argmax(self.model.predict(token_seq, verbose=0)))
            word = self.tokenizer.index_word.get(pred_id, "")
            if not word:
                break
            seed_text += ' ' + word
        return seed_text

# ---------- Test ----------
if __name__ == "__main__":
    # Train both predictors on a single Gutenberg book. The corpus tokens are
    # joined with single spaces, so the join itself adds no line breaks.
    sample_text = ' '.join(gutenberg.words('austen-emma.txt'))

    print("=== Markov Chain Prediction ===")
    mc = MarkovChainPredictor(n=2)
    tokens = mc.train(sample_text)
    print("Sample prediction:", mc.predict("I like", 5))
    # Scored on the same tokens the model was trained on, not a held-out set.
    mc_accuracy = mc.evaluate_accuracy(tokens)
    print(f"Markov Chain Accuracy: {mc_accuracy:.2%}")

    print("\n=== LSTM Prediction ===")
    # Smaller vocabulary and window than the class defaults.
    lstm = LSTMTextPredictor(vocab_size=5000, max_len=10)
    lstm_accuracy = lstm.train(sample_text)
    print("Sample prediction:", lstm.predict("I like", 5))
    print(f"LSTM Training Accuracy: {lstm_accuracy:.2%}")


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\canet\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


=== Markov Chain Prediction ===
Sample prediction: I like to talk to matters of
Markov Chain Accuracy: 30.60%

=== LSTM Prediction ===


MemoryError: 

In [None]:
# sentiment_classification.py

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np
import nltk
# Make sure the NLTK movie_reviews corpus is available locally before the
# corpus reader imported on the next line is used.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Corpus: each movie review becomes one space-joined string, labelled 1 when
# its file id starts with 'pos' and 0 otherwise.
_review_ids = movie_reviews.fileids()
texts = [' '.join(movie_reviews.words(review_id)) for review_id in _review_ids]
labels = [int(review_id.startswith('pos')) for review_id in _review_ids]

# Hand-written sentences that both classifiers score for illustration.
test_samples = ["I really liked this movie", "This was the worst experience"]

# ---------- Traditional ML: Naive Bayes ----------
def traditional_sentiment_classification():
    """Train a bag-of-words Naive Bayes classifier and print its results.

    Reads the module-level ``texts``/``labels`` corpus, holds out 30% of it
    for testing, prints the held-out accuracy and then the predicted label
    for each sentence in ``test_samples``. Returns nothing.
    """
    # Split the raw texts first so the vocabulary is learned from the
    # training portion only; fitting the vectorizer on the whole corpus
    # would let information from the test reviews leak into the features.
    train_texts, test_texts, y_train, y_test = train_test_split(
        texts, labels, test_size=0.3, random_state=42
    )

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    model = MultinomialNB()
    model.fit(X_train, y_train)

    acc = accuracy_score(y_test, model.predict(X_test))

    print("=== Naive Bayes Classification ===")
    print(f"Accuracy: {acc:.2%}")

    sample_preds = model.predict(vectorizer.transform(test_samples))
    for text, label in zip(test_samples, sample_preds):
        print(f"'{text}' -> {'Positive' if label == 1 else 'Negative'}")

# ---------- Deep Learning: LSTM ----------
def deep_sentiment_classification(vocab_size=1000, max_len=10, epochs=10):
    """Train an Embedding -> LSTM -> sigmoid classifier and print its results.

    Reads the module-level ``texts``/``labels`` corpus, holds out 30% of it,
    trains on the remainder, prints the predicted label for each sentence in
    ``test_samples`` and then the accuracy on the held-out reviews. Returns
    nothing.

    Args:
        vocab_size: tokenizer word limit and embedding input dimension.
        max_len: number of token ids kept per review after padding/truncation.
        epochs: number of training epochs.
    """
    # Split before fitting anything. Training on the full corpus and then
    # evaluating on a slice of it reports training accuracy, not test accuracy.
    train_texts, test_texts, y_train, y_test = train_test_split(
        texts, labels, test_size=0.3, random_state=42
    )
    y_train, y_test = np.array(y_train), np.array(y_test)

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(train_texts)

    def encode(docs):
        # One fixed-length row of token ids per document.
        return pad_sequences(tokenizer.texts_to_sequences(docs), maxlen=max_len)

    X_train, X_test = encode(train_texts), encode(test_texts)

    # input_length is omitted: the layer does not need it and recent Keras
    # releases flag the argument as deprecated.
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=64),
        LSTM(64),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epochs, verbose=0)

    predictions = model.predict(encode(test_samples), verbose=0)

    print("\n=== LSTM Sentiment Predictions ===")
    for text, pred in zip(test_samples, predictions):
        print(f"'{text}' -> {'Positive' if pred[0] > 0.5 else 'Negative'}")

    # evaluate() returns [loss, accuracy] for the metrics compiled above.
    lstm_acc = model.evaluate(X_test, y_test, verbose=0)[1]
    print(f"LSTM Accuracy: {lstm_acc:.2%}")

# ---------- Run Both ----------
if __name__ == "__main__":
    # Run the classical baseline first, then the neural model.
    for run_demo in (traditional_sentiment_classification, deep_sentiment_classification):
        run_demo()


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\canet\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


=== Naive Bayes Classification ===
Accuracy: 81.50%
'I really liked this movie' -> Negative
'This was the worst experience' -> Negative


