# Laboratorium 4 - rekomendacje dla portali informacyjnych

## Przygotowanie

 * pobierz i wypakuj dataset: https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
   * więcej możesz poczytać tutaj: https://learn.microsoft.com/en-us/azure/open-datasets/dataset-microsoft-news
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab4`
 * zainstaluj potrzebne biblioteki:
 `pip install nltk sklearn`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import codecs
import string
from collections import defaultdict  # mozesz uzyc zamiast zwyklego slownika, rozwaz wplyw na czas obliczen
import math
import re
from string import punctuation
import nltk
import numpy as np
from numpy import dot
from numpy import dot
from numpy.linalg import norm

nltk.download('stopwords')
nltk.download('rslp')

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# mozesz uzyc do obliczania najbardziej podobnych tekstow zamiast liczenia "na piechote"
# ale pamietaj o dostosowaniu formatu danych
from sklearn.neighbors import NearestNeighbors

[nltk_data] Downloading package stopwords to /home/kuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/kuba/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# definiujemy potrzebne zmienne

PATH = './MINDsmall_train'
STOPWORDS = set(stopwords.words('english'))

In [3]:
# wczytujemy metadane artykułów

def parse_news_entry(entry):
    news_id, category, subcategory, title, abstract = entry.split('\t')[:5]
    return {
        'news_id': news_id,
        'category': category,
        'subcategory': subcategory,
        'title': title,
        'abstract': abstract
    }


def get_news_metadata():
    with codecs.open(f'{PATH}/news.tsv', 'r', 'UTF-8') as f:
        raw = [x for x in f.read().split('\n') if x]
        parsed_entries = [parse_news_entry(entry) for entry in raw]
        return {x['news_id']: x for x in parsed_entries}


news = get_news_metadata()
news_ids = sorted(list(news.keys()))
news_indices = {x[1]: x[0] for x in enumerate(news_ids)}
print(len(news))

51282


## Część 2. - TF-IDF

In [4]:
# normalizujemy teksty na potrzeby dalszego przetwarzania

def preprocess_text(text_in):
    text = text_in
    # zamieniamy wszystkie ciagi bialych znakow na pojedyncze spacje
    text = ' '.join(text.split())
    # usuwamy znaki interpunkcyjne
    text = text.translate(str.maketrans('', '', string.punctuation))
    # usuwamy wszystkie liczby
    text = ''.join([i for i in text if not i.isdigit()])
    # podmieniamy wszystkie wielkie litery
    text = text.lower()
    # dzielimy na tokeny
    tokens = text.split()
    # usuwamy stopwords
    tokens_without_stopwords = [word for word in tokens if word not in STOPWORDS]
    return tokens_without_stopwords


def stem_texts(corpus):
    stemmer = LancasterStemmer()  # przetestuj rozne stemmery

    return [[stemmer.stem(word) for word in preprocess_text(text)] for text in corpus]


texts = [news[news_id]['abstract'] for news_id in news_ids]
stemmed_texts = stem_texts(texts)

In [5]:
# porownajmy teksty przed i po przetworzeniu

print(texts[2] + '\n')
print(' '.join(stemmed_texts[2]))

"I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

think real good team team real spec good thing group clos bri schmetzer


In [105]:
# tworzymy liste wszystkich slow w korpusie

def get_all_words_sorted(corpus):
    # generujemy posortowana alfabetycznie liste wszystkich slow (tokenow)
    corpus = [token for text in corpus for token in text]
    return sorted(list(set(corpus)))


wordlist = get_all_words_sorted(stemmed_texts)
word_indices = {x[1]: x[0] for x in enumerate(wordlist)}
print(len(wordlist))

37021


In [106]:
# obliczamy liczbe tekstow, w ktorych wystapilo kazde ze slow
# pamietaj, ze jesli slowo wystapilo w danym tekscie wielokrotnie, to liczymy je tylko raz

def get_document_frequencies(corpus, wordlist):
    counter = {word:0 for word in wordlist}
    for text in corpus:
        for word in set(text):
            counter[word] = +1

    # return {word -> count}
    return counter


document_frequency = get_document_frequencies(stemmed_texts, wordlist)

In [107]:
# obliczamy liczbe wystapien kazdego slowa w kazdym tekscie

def get_term_frequencies(corpus, news_indices):
    # return {news_id -> {word -> count}}
    counter = {new_id:{} for new_id in news_indices}
    for new_idx, corpus_idx in news_indices.items():
        new = corpus[corpus_idx]
        for word in new:
            if not word in counter[new_idx]:
                counter[new_idx][word] = 0
            counter[new_idx][word] += 1

    return counter


term_frequency = get_term_frequencies(stemmed_texts, news_indices)

In [108]:
# sprawdzmy wyniki

term_frequency[news_ids[2]]

{'think': 1,
 'real': 2,
 'good': 2,
 'team': 2,
 'spec': 1,
 'thing': 1,
 'group': 1,
 'clos': 1,
 'bri': 1,
 'schmetzer': 1}

In [109]:
# obliczamy metryke tf_idf

def calculate_tf_idf(term_frequency, document_frequency, corpus_size):
    tf_idfs = {}
    n = corpus_size

    for new_id in news_indices.keys():
        tf_idfs[new_id] = {}
        for word in term_frequency[new_id].keys():
            df = document_frequency[word]
            tf = term_frequency[new_id][word]
            tf_idfs[new_id][word] = tf * math.log(n / df)

    # return {news_id -> {word -> tf_idf}}
    return tf_idfs


tf_idf = calculate_tf_idf(term_frequency, document_frequency, len(news_ids))

In [110]:
# sprawdzmy wyniki

tf_idf[news_ids[2]]

{'think': 10.845095092394073,
 'real': 21.690190184788147,
 'good': 21.690190184788147,
 'team': 21.690190184788147,
 'spec': 10.845095092394073,
 'thing': 10.845095092394073,
 'group': 10.845095092394073,
 'clos': 10.845095092394073,
 'bri': 10.845095092394073,
 'schmetzer': 10.845095092394073}

## Część 3. - Podobieństwo tekstów

In [111]:


# obliczmy odleglosc miedzy dwoma artykulami
# przetestuj rozne metryki odleglosci i wybierz najlepsza

def euclidean_metrics(new_1, new_2):
    words = list(new_1.keys())+list(new_2.keys())
    euclidean_sum = 0
    for word in words:
        x = new_1.get(word,0)
        y = new_2.get(word,0)
        euclidean_sum += (y - x) ** 2
    return math.sqrt(euclidean_sum)


def cosine_metrics(new_1, new_2):
    words = list(new_1.keys())+list(new_2.keys())
    x, y = [], []
    for word in words:
        x.append(new_1.get(word,0))
        y.append(new_2.get(word,0))
    res = dot(x, y) / (norm(x) * norm(y))
    if not res:
        return -1
    return res


def calculate_distance(tf_idf, id1, id2, metrics="cosine"):
    new_1, new_2 = tf_idf[id1], tf_idf[id2]
    if metrics == "cosine":
        return cosine_metrics(new_1, new_2)
    if metrics == "euclidean":
        return euclidean_metrics(new_1, new_2)


calculate_distance(tf_idf, news_ids[0], news_ids[1], "cosine")

-1

In [112]:
# wyznaczmy k najpodobniejszych tekstow do danego
# pamietaj o odpowiedniej kolejnosci sortowania w zaleznosci od wykorzystanej metryki
# pamietaj, zeby wsrod podobnych tekstow nie bylo danego

def get_k_most_similar_news(tf_idf, n_id, k, metrics):
    distances = []
    for new_id in filter(lambda id: id != n_id, news_indices.keys()):
        distance = calculate_distance(tf_idf, n_id, new_id, metrics)
        distances.append([new_id, distance])
    if metrics == "cosine":
        distances.sort(key=lambda x: abs(x[1] - 1))
    if metrics == "euclidean":
        distances.sort(key=lambda x: x[1])
    return list(map(lambda x: x[0], distances[:k]))


def print_k_most_similar_news(tf_idf, n_id, k, corpus, news_indices, metrics):
    similar = get_k_most_similar_news(tf_idf, n_id, k, metrics)
    print(f'id: {n_id}, text: {corpus[news_indices[n_id]]}')
    print(f'\n{k} most similar:')
    for s_id in similar:
        print(f'\nid: {s_id}, text: {corpus[news_indices[s_id]]}')


print_k_most_similar_news(tf_idf, news_ids[2], 5, texts, news_indices, "cosine")

  res = dot(x, y) / (norm(x) * norm(y))


id: N1000, text: "I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

5 most similar:

id: N10109, text: "There are an awful lot of really good journalists at Fox News Channel. It's just that they're vastly outnumbered by the opinion makers," Carl Cameron says. Julie Roginsky and Conor Powell also speak about Shep Smith's importance to the network and what his exit signifies.

id: N100, text: WASHINGTON   Somewhere over the eastern skies, the Astros assembled again as one. On an airplane to the nation's capital, the team conferred about their conundrum. A players-only meeting after Wednesday's wretched loss in Game 2 was run by two veterans who calmed any concern that cropped up. Justin Verlander and Jose Altuve emphasized encouragement. They reminded their teammates of a remarkable regular season and the prestige within the...

id: N10023, text: SCOTTSDALE, Ariz.   The Astros can fin

In [113]:

print_k_most_similar_news(tf_idf, news_ids[2], 5, texts, news_indices, "euclidean")

id: N1000, text: "I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

5 most similar:

id: N12893, text: Might be a good thing?

id: N34755, text: It really hurts to watch this team.

id: N47189, text: Not good!

id: N11497, text: Experts get real about these specs.

id: N16082, text: she had a good night
