In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertModel
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [5]:
news_data = pd.read_csv('guardian_articles.csv')
news_data.head(5)


Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,us-news/2016/jan/31/iowa-caucus-underdog-candi...,US news,Iowa underdogs put on brave faces despite all ...,https://www.theguardian.com/us-news/2016/jan/3...,As polling day looms and the cameras turn only...,2016-01-31T23:53:37Z,1
1,us-news/2016/jan/31/iowa-caucus-worlds-most-pa...,US news,Iowa caucus: hologram eagle and Jesus star on ...,https://www.theguardian.com/us-news/2016/jan/3...,"In Des Moines on Sunday, the Guardian was give...",2016-01-31T23:46:28Z,2
2,world/2016/jan/31/tanzania-britsh-helicopter-p...,World news,British pilot in Tanzania 'manoeuvred ​to save...,https://www.theguardian.com/world/2016/jan/31/...,A British pilot who was shot dead by an elepha...,2016-01-31T23:43:48Z,3
3,football/2016/jan/31/late-winner-gets-usa-off-...,Football,USA 3-2 Iceland | International friendly match...,https://www.theguardian.com/football/2016/jan/...,USA took a step toward shaking off the ghosts ...,2016-01-31T23:30:49Z,4
4,football/2016/jan/31/blackburn-paul-lambert-ox...,Football,Reinvigorated Paul Lambert reflects after impr...,https://www.theguardian.com/football/2016/jan/...,"The clean-shaven, spectacle free and suspiciou...",2016-01-31T22:30:10Z,5


In [6]:
news_data.describe()

Unnamed: 0,id
count,149839.0
mean,74920.0
std,43254.93783
min,1.0
25%,37460.5
50%,74920.0
75%,112379.5
max,149839.0


In [7]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149839 entries, 0 to 149838
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   article_id          149839 non-null  object
 1   sectionName         149839 non-null  object
 2   webTitle            149839 non-null  object
 3   webUrl              149839 non-null  object
 4   bodyContent         148731 non-null  object
 5   webPublicationDate  149839 non-null  object
 6   id                  149839 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 8.0+ MB


In [10]:
news_data.drop(columns=["article_id","webUrl","webPublicationDate","id"],inplace=True)

In [22]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [29]:
def remove_single_letters(sentence):
    words = sentence.split()
    filtered_words = [word for word in words if len(word) > 1 or not word.isalpha()]
    return ' '.join(filtered_words)

def preprocess(text):
    tokens = word_tokenize(remove_single_letters(text.lower()))
    filtered_tokens = [stemmer.stem(token) for token in tokens if (token not in stop_words and token.isalpha())]
    return filtered_tokens

num_sections = news_data['sectionName'].nunique()
samples_per_group = 20000 // num_sections

sampled_news_data = news_data.groupby('sectionName').apply(lambda x: x.sample(min(samples_per_group, len(x)))).reset_index(drop=True)


sampled_news_data=sampled_news_data[sampled_news_data['bodyContent'].apply(lambda x: not isinstance(x, float))]
sampled_news_data['tokens'] = sampled_news_data['bodyContent'].apply(preprocess)

In [30]:
sampled_news_data.head(5)

Unnamed: 0,sectionName,webTitle,bodyContent,tokens
0,A time for Japan,From polo shirts to retro wristwatches: how to...,Remember when Mad Men started airing on televi...,"[rememb, mad, men, start, air, televis, indoor..."
1,A vision for better food,‘Cooking is at the centre of life’: food entre...,For a man at the head of a food empire whose a...,"[man, head, food, empir, whose, annual, sale, ..."
2,AMC+: Only the good stuff,What TV series should you watch next? Take our...,There’s nothing better than settling in with a...,"[noth, better, settl, excel, new, tv, seri, es..."
3,About,Guardian and Observer style guide: A,A B C D E F G H I J K L M N O P Q R S T U V W ...,"[h, use, silent, h, heir, hour, honest, politi..."
4,Amazon Prime Video: Truth Seekers,Scarily funny: Frost and Pegg return to comedy...,British writer-actors Nick Frost and Simon Peg...,"[british, nick, frost, simon, pegg, fear, dedi..."


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

class AutoencoderModule(nn.Module):
    def __init__(self, input_dim, latent_dim, activation):
        super(AutoencoderModule, self).__init__()
        if activation == 'relu':
            self.activation = nn.ReLU()
        # Extend for other activations if needed
        
        self.encoder = nn.Linear(input_dim, latent_dim)
        self.decoder = nn.Linear(latent_dim, input_dim)
        
    def forward(self, x):
        x = self.activation(self.encoder(x))
        x = self.activation(self.decoder(x))
        return x

class Autoencoder:
    def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
        self.latent_dim = latent_dim
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None
        self.loss_fn = nn.MSELoss()
        self.optimizer = None
        self.his = []

    def fit(self, X):
        if self.model is None:
            self.model = AutoencoderModule(X.shape[1], self.latent_dim, self.activation)
            self.optimizer = optim.Adam(self.model.parameters())
        
        dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32))
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        
        X_train, X_test = train_test_split(X, test_size=0.2)
        
        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        
        test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32))
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
        
        for epoch in range(self.epochs):
            for batch in train_loader:
                inputs = batch[0]
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, inputs)
                
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
            # Validation loss
            with torch.no_grad():
                val_loss = sum(self.loss_fn(self.model(inputs), inputs) for inputs, in test_loader)
                self.his.append(val_loss / len(test_loader))
