In [None]:
import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torchmetrics

import pandas as pd
from tqdm.notebook import trange, tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
import gensim.downloader
from sklearn.linear_model import LinearRegression

device = 'cpu'

In [3]:
df = pd.read_json('gutenberg-dataset-v2.json')
embedding_vectors = gensim.downloader.load('glove-twitter-25')

embedding_dim = len(embedding_vectors['test'])
chunk_size = 2000
input_size = embedding_dim * chunk_size

In [4]:
tqdm.pandas()

def embed_text(text):
    words = text.split()
    if len(words) != chunk_size:
        print("error wrong word count")
    embeddings = []

    for word in words: 
        if word not in embedding_vectors:
            embeddings.append(np.zeros(embedding_dim))
        else:
            e = embedding_vectors[word]
            assert(len(e) == embedding_dim)
            embeddings.append(embedding_vectors[word])

    return np.concatenate(embeddings)

        

# df['text_embed'] = df['text'].progress_apply(embed_texts)
# df['text_embed'] = df['text'].progress_apply(lambda text: np.array([embedding_vectors[word] if word in embedding_vectors else embedding_vectors['none'] for word in text.split() if word in embedding_vectors]).flatten())


In [8]:
vocabulary = set()
for row in tqdm(df.iterrows(), total=len(df)):
    text = row[1]['text']
    words = text.split()
    for word in words:
        vocabulary.add(word)

len(vocabulary)

  0%|          | 0/120162 [00:00<?, ?it/s]

1444188

In [9]:
frequencies = {word: 0 for word in vocabulary}
for row in tqdm(df.iterrows(), total=len(df)):
    text = row[1]['text']
    words = text.split()
    for word in words:
        frequencies[word] = frequencies[word] + 1


  0%|          | 0/120162 [00:00<?, ?it/s]

In [39]:
len(frequencies)

frequencies2 = {}
vocabulary2 = []

vocab2set = set()

for word in tqdm(vocabulary):
    f = frequencies[word]
    if f > 200:
        frequencies2[word] = f
        vocabulary2.append(word)
        vocab2set.add(word)


  0%|          | 0/1444188 [00:00<?, ?it/s]

In [51]:
len(frequencies2)

Unnamed: 0,title,author,date,text_ratio,text,text_len_characters,weights,word_frequencies
273901,"Historical Record of the Thirty-fourth, or the...",Richard Cannon,1702,0.944455,in numbers to three thousand five hundred menr...,12139,0.111111,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
273902,"Historical Record of the Thirty-fourth, or the...",Richard Cannon,1702,0.944455,joseph bonaparte titular king of spain at tala...,12133,0.111111,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
129008,The Memorable Thoughts of Socrates,Xenophon,1702,0.963115,in war only to secure their own quiet and choo...,10511,0.047619,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
273905,"Historical Record of the Thirty-fourth, or the...",Richard Cannon,1702,0.944455,the honorable henry seymour conway appointed t...,11779,0.111111,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
273899,"Historical Record of the Thirty-fourth, or the...",Richard Cannon,1702,0.944455,improvements were eventually introduced in the...,11914,0.111111,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
306351,How music grew,Marion Bauer,2023,0.926967,plain song and airs from the cloister and yet ...,10948,0.016667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
306335,How music grew,Marion Bauer,2023,0.926967,song and they brought over to britain all thei...,11399,0.016667,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
306365,How music grew,Marion Bauer,2023,0.926967,writing in that work which is typical of mende...,11409,0.016667,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
306333,How music grew,Marion Bauer,2023,0.926967,soon the composers made this melody in the med...,10598,0.016667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [50]:
def count_word_frequencies(text):
    frequencies = {word: 0 for word in vocabulary2}
    for word in text.split():
        if word in vocab2set:
            frequencies[word] = frequencies[word] + 1

    list = []
    for word in vocabulary2:
        list.append(frequencies[word])

    return np.array(list)


df['word_frequencies'] = df['text'].progress_apply(count_word_frequencies)

  0%|          | 0/120162 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
df['word_frequencies'] = df['word_frequencies'].apply(lambda x: np.array(x))

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df['word_frequencies'].values, df['date'].values, test_size=0.2, random_state=42, shuffle=True)

In [49]:
X_train[1]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [46]:
reg = LinearRegression(n_jobs=-1).fit(X_train, y_train)




ValueError: setting an array element with a sequence.

In [43]:
num_epochs = 20000
loading_steps = 10
loading_count = 0

learning_rate = 0.01

input_size = X_train.shape[1]
model = LinearRegressionModel(input_size)
model.to(device)

criterion = nn.MSELoss()  
#criterion = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

X_train_tensor = torch.FloatTensor(X_train).to(device)
y_train_tensor = torch.FloatTensor(y_train).to(device).view(-1, 1)

for epoch in range(num_epochs):
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % (num_epochs / loading_steps) == 0:
        print(f'{loading_count}0% done')
        print(f'Loss: {loss}')
        loading_count = loading_count + 1

print('finished')


273901    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
273902    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
129008    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
273905    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
273899    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
                                ...                        
306351    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
306335    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
306365    [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
306333    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
306381    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: word_frequencies, Length: 120162, dtype: object

In [6]:
class WordFreqModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__() 
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU() # probably dont need a 2nd relu
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# tensor_train = torch.tensor([df_train['text_embed'].values, df_train['date'].values])
# tensor_train.to(device)

# tensor_test = torch.tensor(df_test)
# tensor_test.to(device)

In [8]:
train_dataloader = DataLoader(df_train, batch_size=16)
test_dataloader = DataLoader(df_test, batch_size=16)

In [9]:
model = WordFreqModel(input_dim=input_size, hidden_dim=input_size * 2, output_dim=1).to(device)
optimizer = optim.Adagrad(model.parameters(), lr=0.025)
loss_function = nn.MSELoss()


In [None]:
train_dataloader

In [9]:
# training
#model.train()
total_loss = 0

for batch, data in enumerate(train_dataloader):
    text = data['text']
    label = data['date']

    print(text)
    print(label)
    break

    #optimizer.zero_grad()


    outputs = model(text)
            
            
    loss = loss_function(outputs, target)

    loss.backward()

    optimizer.step()

    total_loss = total_loss + loss.item()

    pass    

averaged_loss = total_loss / (batch + 1) #TODO
print(f"Loss: {averaged_loss}")

KeyError: 0