# Extract News

In [2]:
import requests
import pandas as pd


# news api-key:
# 1270f9ec3e2a40f3a58f67bba8b716f4


def call_news_api():
  url = "https://newsapi.org/v2/everything"
  params = {
      "q": "bitcoin",
      "apiKey": "1270f9ec3e2a40f3a58f67bba8b716f4"
  }
  response = requests.get(url, params=params)
  if response.status_code == 200:
      data = response.json()
      print(data)
      articles = data.get("articles", [])

      # Convert to DataFrame
      df = pd.DataFrame(articles)

      # Save to file
      df.to_excel("news_articles.xlsx", index=False)
      df.to_csv("news_articles.csv", index=False)
      print("Data saved to news_articles.xlsx")
  else:
      print(f"Error: {response.status_code}", response.text)

call_news_api()


Data saved to news_articles.xlsx


# Assignment - 1 (Skipgram)

In [1]:
import numpy as np
import requests
import re
import random
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity

class SkipGramModel:
    def __init__(self, vocab_size, embedding_dim):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W1 = np.random.randn(vocab_size, embedding_dim)
        self.W2 = np.random.randn(embedding_dim, vocab_size)

    def forward(self, one_hot_vector):
        hidden_layer = np.dot(one_hot_vector, self.W1)
        output_layer = np.dot(hidden_layer, self.W2)
        output_layer = self._softmax(output_layer)
        return hidden_layer, output_layer

    def backward(self, one_hot_vector, target_vector, learning_rate=0.01):
        hidden_layer, output_layer = self.forward(one_hot_vector)
        # error = target_vector - output_layer
        error = output_layer - target_vector

        # Compute Gradients
        output_layer_gradient = np.outer(hidden_layer, error)
        hidden_layer_gradient = np.outer(one_hot_vector, np.dot(self.W2, error))

        # Update Weights
        self.W1 -= learning_rate * hidden_layer_gradient
        self.W2 -= learning_rate * output_layer_gradient

    def _softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()



def generate_training_data(words, window_size=2):
    all_words = [word for sentence in words for word in sentence]
    word2idx = {word: idx for idx, word in enumerate(set(all_words))}
    training_pairs = []
    for sentence in words:
        for i, target_word in enumerate(sentence):
            context_range = range(max(0, i - window_size), min(len(sentence), i + window_size + 1))
            for j in context_range:
                if i != j:
                    training_pairs.append((target_word, sentence[j]))
    return training_pairs, word2idx


def word_similarity(word, word2idx, embedding_matrix, top_n=5):
    if word not in word2idx:
        print(f"Word '{word}' not in vocabulary.")
        return []
    word_vector = embedding_matrix[word2idx[word]].reshape(1, -1)
    similarities = cosine_similarity(word_vector, embedding_matrix)[0]
    similar_words = sorted(word2idx.keys(), key=lambda w: similarities[word2idx[w]], reverse=True)[1:top_n+1]
    return similar_words


def train_model(training_pairs, word2idx, vocab_size, embedding_dim=100, epochs=10):
    model = SkipGramModel(vocab_size, embedding_dim)
    loss_history = []
    for epoch in range(epochs):
        total_loss = 0
        for target_word, context_word in training_pairs:
            target_vector = np.zeros(vocab_size)
            target_vector[word2idx[target_word]] = 1

            context_vector = np.zeros(vocab_size)
            context_vector[word2idx[context_word]] = 1

            hidden, output = model.forward(target_vector)
            loss = -np.log(output[word2idx[context_word]])

            model.backward(target_vector, context_vector)
            total_loss += loss

        avg_loss = total_loss / len(training_pairs)
        loss_history.append(avg_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")
    return model, np.mean(loss_history)

def grid_search():
    df_read = pd.read_csv("news_articles.csv")
    news_data = df_read["description"].dropna().tolist()
    sample_text = [re.findall(r'\b\w+\b', desc.lower()) for desc in news_data]
    test_words = list(set(word for desc in sample_text[:10] for word in desc))

    print("\nsample-text: ", sample_text)
    print("\ntest-words: ", test_words)

    epochx = 10
    window_sizes = [2, 3, 4]
    embedding_dims = [50, 100, 200]

    best_model = None
    best_loss = float('inf')
    best_params = None

    for window_size, embedding_dim in product(window_sizes, embedding_dims):
        print(f"Training with window_size={window_size}, embedding_dim={embedding_dim}")
        training_pairs, word2idx = generate_training_data(sample_text, window_size=window_size)

        print("\ntraining-pair:S ", training_pairs)
        print("\nword2idx: ", word2idx)

        model, avg_loss = train_model(training_pairs, word2idx,
                                      vocab_size=len(word2idx), embedding_dim=embedding_dim,
                                      epochs=epochx)

        if avg_loss < best_loss:
          best_loss = avg_loss
          best_model = model
          best_params = (window_size, embedding_dim)

    print(f"\n\nBest Model: window_size={best_params[0]}, embedding_dim={best_params[1]}, AVG-Loss={best_loss}")

    # for word in word2idx:
    #     word_idx = word2idx[word]
    #     word_vector = best_model.W1[word_idx]
    #     # print(f"{word}: {word_vector}")

    # Similarity
    sample_word = random.choice(test_words)
    similar_words = word_similarity(sample_word, word2idx, best_model.W1)
    print(f"Words similar to '{sample_word}': {similar_words}")

if __name__ == "__main__":
    grid_search()


NameError: name 'pd' is not defined