# Out-of-core Learning

In real-world applications, it is not uncommon to have datasets bigger than the memory of the computer. So we need newer algorithms to train the model in batches of data that we have. 

In [46]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

# A tokenizer to clean and split text
def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


# Function to stream documents one at a time
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label


print(next(stream_docs('imdb_reviews.csv')))



('"I went and saw this movie last night after being coaxed to by a few friends of mine. I\'ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."', 1)


In [47]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Read all data once

df = pd.read_csv("imdb_reviews.csv")

X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.1, random_state=42
)
train_df = pd.DataFrame({"review": X_train, "sentiment": y_train})
test_df = pd.DataFrame({"review": X_test, "sentiment": y_test})

train_df.to_csv("imdb_reviews_train.csv", index=False, encoding="utf-8")
test_df.to_csv("imdb_reviews_test.csv", index=False, encoding="utf-8")


In [48]:
def get_minibatch(doc_stream, size):
    docs, y = [] , []
    try: 
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [49]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log_loss',random_state=42)
doc_stream = stream_docs(path='imdb_reviews_train.csv')

In [50]:
import pyprind 
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:28


In [51]:
doc_stream_test = stream_docs(path='imdb_reviews_test.csv')
X_test, y_test = get_minibatch(doc_stream_test, size=5000)
X_test = vect.transform(X_test)
print(f'Accuracy: {clf.score(X_test, y_test):.3f}') 

Accuracy: 0.874


# Topic modeling with latent Dirichlet allocation

## Decomposing text documents with LDA

In [54]:
import pandas as pd
df = pd.read_csv('imdb_reviews.csv' , encoding='utf-8')
df.rename(columns={"0": "review" ,  "1": "sentiment"})

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0


In [59]:
# max_df andd max_features are 2 hyperparameters to tune. 
# first one is a threshold for the data so if the word is occured too frequlently in the doc, it will be excluded
# Second one is to limit the max_features umberr of the words that are more frequent. 
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                        max_df=.1,
                        max_features=5000)
X = count.fit_transform(df['review'].values)

In [60]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')

X_topics = lda.fit_transform(X)

In [62]:
# Now in lda we have 10 cclsters of words ech with 5000 word frequencies that shows which words are more frequent in each cluster. 
lda.components_.shape

(10, 5000)

In [None]:
# number of top words to show per topic
n_top_words = 5

feature_names = count.get_feature_names_out()


for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx + 1}:")
    top_word_indices = topic.argsort()[-n_top_words:][::-1]
    top_words = [feature_names[i] for i in top_word_indices]
    print(" ".join(top_words))


Topic 1:
horror effects budget special gore
Topic 2:
guy money worst minutes girl
Topic 3:
version action english japanese match
Topic 4:
book read game documentary feel
Topic 5:
series tv episode shows episodes
Topic 6:
family woman father mother wife
Topic 7:
music role performance musical star
Topic 8:
war police men murder action
Topic 9:
script comedy role actor performance
Topic 10:
comedy action original watched wasn
