<a href="https://colab.research.google.com/github/MichalMSlusarski/ML-semestr-letni/blob/main/lda_test_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the small English and Polish spaCy pipelines.
# Only the Polish one (pl_core_news_sm) is loaded by the code visible in this notebook.
!python -m spacy download en_core_web_sm
!python -m spacy download pl_core_news_sm

In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora
import string
import spacy
# import en_core_web_sm #??

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
import re 


# Constants

In [3]:
# Article corpus; later cells read its 'content' and 'title' columns.
data = pd.read_csv('articles_reduced.csv')
# Polish spaCy pipeline; clean() uses it to tokenise and lemmatise sentences.
nlp = spacy.load("pl_core_news_sm")

# Characters treated as punctuation when filtering lemmas in clean().
# Written as a raw string: the value is unchanged (backslash followed by a comma),
# but it no longer relies on the invalid "\," escape sequence, which newer
# Python versions warn about.
PUNCTUATION = r'''!()-[]{};:'"\,<>./?@#$%^&*_~“”’–„'''
# Polish stop words (mostly lemmas and function words) dropped by clean().
# Kept as a list because the cell below extends it in place with an optional external file.
STOP_WORDS = ['moment', 'chodzić', 'robić', 'wrażenie', 'faktycznie', 'myśleć', 'chcieć', 'czas', 'zrobić', 'wiedzieć', 'trochę', 'swój', 'super', 'naprawdę','prosty', 'drugi', 'przykład', 'troszeczkę', 'raczej', 'siebie', 'wydawać','rzecz', 'chyba', 'różny', 'a',	'aby',	'ach',	'acz',	'aczkolwiek',	'aj',	'albo',	'ale',	'ależ',	'ani',	'aż',	'bardziej',	'bardzo',	'bez',	'bo',	'bowiem',	'by',	'byli',	'bym',	'bynajmniej',	'być',	'był',	'była',	'było',	'były',	'będzie',	'będą',	'cali',	'cała',	'cały',	'chce',	'choć',	'ci',	'ciebie',	'cię',	'co',	'cokolwiek',	'coraz',	'coś',	'czasami',	'czasem',	'czemu',	'czy',	'czyli',	'często',	'daleko',	'dla',	'dlaczego',	'dlatego',	'do',	'dobrze',	'dokąd',	'dość',	'dr',	'dużo',	'dwa',	'dwaj',	'dwie',	'dwoje',	'dzisiaj',	'dziś',	'gdy',	'gdyby',	'gdyż',	'gdzie',	'gdziekolwiek',	'gdzieś',	'go',	'godz',	'hab',	'i',	'ich',	'ii',	'iii',	'ile',	'im',	'inna',	'inne',	'inny',	'innych',	'inż',	'iv',	'ix',	'iż',	'ja',	'jak',	'jakaś',	'jakby',	'jaki',	'jakichś',	'jakie',	'jakiś',	'jakiż',	'jakkolwiek',	'jako',	'jakoś',	'je',	'jeden',	'jedna',	'jednak',	'jednakże',	'jedno',	'jednym',	'jedynie',	'jego',	'jej',	'jemu',	'jest',	'jestem',	'jeszcze',	'jeśli',	'jeżeli',	'już',	'ją',	'każdy',	'kiedy',	'kierunku',	'kilka',	'kilku',	'kimś',	'kto',	'ktokolwiek',	'ktoś',	'która',	'które',	'którego',	'której',	'który',	'których',	'którym',	'którzy',	'ku',	'lat',	'lecz',	'lub',	'ma',	'mają',	'mam',	'mamy',	'mało',	'mgr',	'mi',	'miał',	'mimo',	'między',	'mnie',	'mną',	'mogą',	'moi',	'moim',	'moja',	'moje','móc', 'mieć',	'może',	'możliwe',	'można',	'mu',	'musi',	'my',	'mój',	'na',	'nad',	'nam',	'nami',	'nas',	'nasi',	'nasz',	'nasza',	'nasze',	'naszego',	'naszych',	'natomiast',	'natychmiast',	'nawet',	'nic',	'nich',	'nie',	'niech',	'niego',	'niej',	'niemu',	'nigdy',	'nim',	'nimi',	'nią',	'niż',	'no',	'nowe',	'np',	'nr',	'o',	'o.o.',	'obok',	'od',	'ok',	'około',	'on',	'ona',	'one',	'oni',	'ono',	'oraz',	'oto',	'owszem',	'pan',	'pana',	'pani',	'pl',	'po',	'pod',	
'podczas',	'pomimo',	'ponad',	'ponieważ',	'powinien',	'powinna',	'powinni',	'powinno',	'poza',	'prawie',	'prof',	'przecież',	'przed',	'przede',	'przedtem',	'przez',	'przy',	'raz',	'razie',	'roku',	'również',	'sam',	'sama',	'się',	'skąd',	'sobie',	'sobą',	'sposób',	'swoje',	'są',	'ta',	'tak',	'taka',	'taki',	'takich',	'takie',	'także',	'tam',	'te',	'tego',	'tej',	'tel',	'temu',	'ten',	'teraz',	'też',	'to',	'tobie',	'tobą',	'toteż',	'totobą',	'trzeba',	'tu',	'tutaj',	'twoi',	'twoim',	'twoja',	'twoje',	'twym',	'twój',	'ty',	'tych',	'tylko',	'tym',	'tys',	'tzw',	'tę',	'u',	'ul',	'vi',	'vii',	'viii',	'vol',	'w',	'wam',	'wami',	'was',	'wasi',	'wasz',	'wasza',	'wasze',	'we',	'według',	'wie',	'wiele',	'wielu',	'więc',	'więcej',	'wszyscy',	'wszystkich',	'wszystkie',	'wszystkim',	'wszystko',	'wtedy',	'www',	'wy',	'właśnie',	'wśród',	'xi',	'xii',	'xiii',	'xiv',	'xv',	'z',	'za',	'zapewne',	'zawsze',	'zaś',	'ze',	'zeznowu',	'znowu',	'znów',	'został',	'zł',	'żaden',	'żadna',	'żadne',	'żadnych',	'że',	'żeby']

# Optionally extend STOP_WORDS with an external file (one stop word per line).
# This stays best-effort: a missing or unreadable file must not stop the notebook,
# but the reason is reported instead of being silently swallowed.
stopwords_file = '/content/polish.stopwords.txt'
try:
    # Explicit UTF-8 so Polish diacritics decode the same way on every platform.
    with open(stopwords_file, 'r', encoding='utf-8') as f:
        stop_words = f.read().splitlines()
except (OSError, UnicodeDecodeError) as exc:
    print(f"Extra stop words not loaded from {stopwords_file}: {exc}")
else:
    STOP_WORDS.extend(stop_words)

# NOTE(review): this WordNet lemmatizer is not used by any function visible in this
# notebook (clean() uses spaCy lemmas and shadows the name locally); kept so that
# any other code relying on the global keeps working.
lemma = WordNetLemmatizer()

# Functions

In [61]:
def prepare_sentences(text):
    """Split raw text into sentence-like fragments.

    Line breaks are replaced with a space (not removed) so that the last word of
    one line and the first word of the next are not fused into a single token.
    The text is then split on '.', and only fragments whose stripped length
    exceeds 10 characters are kept.

    Args:
        text: The article text.

    Returns:
        A list of stripped sentence strings (possibly empty).
    """
    fragments = (fragment.strip() for fragment in text.replace('\n', ' ').split('.'))
    # Measure the stripped fragment so padding whitespace cannot push a
    # near-empty fragment over the length threshold.
    return [fragment for fragment in fragments if len(fragment) > 10]

def clean(doc):
    """Normalise one sentence into a space-separated string of lemmas.

    The sentence is lower-cased, whitespace-normalised and run through the
    module-level spaCy pipeline `nlp`. A lemma is kept only if it is non-empty,
    contains no space, is not in STOP_WORDS and is not a substring of PUNCTUATION.

    Args:
        doc: A single sentence (str).

    Returns:
        The kept lemmas joined with single spaces ('' when nothing is kept).
    """
    text = " ".join(doc.lower().split())

    kept_lemmas = []
    for token in nlp(text):
        token_lemma = lemmatize_word(token)
        # lemmatize_word yields None for blank lemmas. Test for that first:
        # `None in PUNCTUATION` raises TypeError because PUNCTUATION is a str.
        if not token_lemma or ' ' in token_lemma:
            continue
        if token_lemma in STOP_WORDS or token_lemma in PUNCTUATION:
            continue
        kept_lemmas.append(token_lemma)

    return ' '.join(kept_lemmas)

def lemmatize_word(word):
    """Return the `lemma_` of a token, or None when that lemma is empty or whitespace-only.

    Args:
        word: An object exposing a string `lemma_` attribute (a spaCy token in this notebook).
    """
    candidate = word.lemma_
    if not candidate.strip().lower():
        return None
    return candidate
    
def get_lda(text, topics, words, passes=50, random_state=None):
    """Fit an LDA model on the sentences of one text and return its topics.

    Args:
        text: The article text; it is split into sentences, each treated as one document.
        topics: Number of topics to fit and to report.
        words: Number of words to report per topic.
        passes: Number of training passes over the corpus (default 50, as before).
        random_state: Optional seed forwarded to gensim so that runs are reproducible;
            None keeps the previous unseeded behaviour.

    Returns:
        The result of `LdaModel.print_topics`: a list of (topic_id, formatted_string) pairs.
    """
    doc_complete = prepare_sentences(text)

    # Each sentence becomes a list of cleaned lemmas.
    doc_clean = [clean(doc).split() for doc in doc_complete]

    # Term dictionary of the corpus: every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)

    # Bag-of-words document-term matrix (built once; it was previously computed twice).
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    # Train the LDA model on the document-term matrix.
    ldamodel = gensim.models.ldamodel.LdaModel(
        doc_term_matrix,
        num_topics=topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    return ldamodel.print_topics(num_topics=topics, num_words=words)

def topic_modeling(data, content_col, label_col, num_topics=3, num_words=10):
    """Run per-article LDA over a DataFrame column.

    Articles that are not strings, or whose text contains 'sport'
    (case-insensitive), are skipped. Articles for which LDA fails are skipped
    with a warning instead of being silently ignored.

    Args:
        data: DataFrame holding the articles.
        content_col: Name of the column with the article text.
        label_col: Name of the column with the label kept for each modelled article.
        num_topics: Topics fitted per article (default 3, as before).
        num_words: Words reported per topic (default 10, as before).

    Returns:
        [topic_list, labels]: two parallel lists, the `get_lda` output for each
        modelled article and the corresponding label.
    """
    topic_list = []
    labels = []

    # Materialise the label column once; rebuilding it per row was quadratic.
    all_labels = list(data[label_col])

    for n, article in enumerate(tqdm(data[content_col])):
        if not isinstance(article, str) or 'sport' in article.lower():
            continue
        try:
            topics = get_lda(article, num_topics, num_words)
        except Exception as exc:
            # Best effort per article, but surface the reason; a bare `except`
            # would also have swallowed KeyboardInterrupt.
            warnings.warn(f"Skipping article at position {n}: {exc!r}")
            continue
        topic_list.append(topics)
        labels.append(all_labels[n])

    return [topic_list, labels]

def generate_keywords(topic_list):
    """Pull the double-quoted terms out of formatted topic strings.

    Args:
        topic_list: One entry per document; each entry is a sequence of items
            whose element at index 1 is a string containing "quoted" terms.

    Returns:
        One flat list of terms per document, in order of appearance, with the
        weights discarded.
    """
    quoted_term = r'"([^"]*)"'
    keywords = []

    for topics in topic_list:
        terms = []
        for topic in topics:
            terms.extend(re.findall(quoted_term, topic[1]))
        keywords.append(terms)

    return keywords


def generate_similarity_matrix(document_list):
    """Return the pairwise cosine-similarity matrix of the documents' TF-IDF vectors.

    Side effect: (re)binds the module-level `vectorizer` to a new TfidfVectorizer
    fitted on `document_list`.

    Args:
        document_list: Iterable of document strings.
    """
    global vectorizer

    vectorizer = TfidfVectorizer()
    return cosine_similarity(vectorizer.fit_transform(document_list))
    
def generate_similarity_rank(cosine_similarities, labels, min_strength):
    """Rank pairs of differently-labelled documents by cosine similarity.

    Only pairs (i, j) with i < j, similarity strictly above `min_strength` and
    labels[i] != labels[j] are reported.

    Args:
        cosine_similarities: Square similarity matrix (array-like).
        labels: Sequence of labels aligned with the matrix rows.
        min_strength: Exclusive lower bound on the similarity.

    Returns:
        DataFrame with columns 'content_1', 'content_2', 'similarity', sorted by
        similarity in descending order. Empty (with the same columns) when no
        pair qualifies.
    """
    columns = ['content_1', 'content_2', 'similarity']
    sims = np.asarray(cosine_similarities, dtype=float)
    if sims.size == 0:
        return pd.DataFrame(columns=columns)

    # Strict upper triangle = each unordered pair once, no self-pairs.
    # np.nonzero walks the mask row by row, i.e. in the order of the nested i/j loops.
    rows, cols = np.nonzero(np.triu(sims > min_strength, k=1))

    # Collect all records first and build the frame once: DataFrame.append no
    # longer exists in pandas 2.x and growing a frame row by row is quadratic.
    records = [
        (labels[i], labels[j], sims[i, j])
        for i, j in zip(rows, cols)
        if labels[i] != labels[j]
    ]

    ranked = pd.DataFrame(records, columns=columns)
    return ranked.sort_values(by='similarity', ascending=False, ignore_index=True)

def find_similar_content(cosine_similarities, labels, content, min_strength):
    """List the documents most similar to the document(s) labelled `content`.

    Every other document is considered, regardless of whether it comes before
    or after the queried one in the matrix. Scanning only later indices would
    miss matches for documents near the end.

    Args:
        cosine_similarities: Square similarity matrix (array-like of rows).
        labels: Sequence of labels aligned with the matrix rows.
        content: Label of the document to look up.
        min_strength: Exclusive lower bound on the similarity.

    Returns:
        DataFrame with columns 'content_1' (the queried label), 'content_2' (the
        similar document's label) and 'similarity', sorted by similarity in
        descending order. Empty (with the same columns) when nothing matches.
    """
    columns = ['content_1', 'content_2', 'similarity']
    records = []

    for i, row in enumerate(cosine_similarities):
        if labels[i] != content:
            continue
        for j, similarity in enumerate(row):
            # Skip the trivial self-match on the diagonal.
            if j != i and similarity > min_strength:
                records.append((labels[i], labels[j], similarity))

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x,
    # and sorting a column-less empty frame raised KeyError.
    matches = pd.DataFrame(records, columns=columns)
    return matches.sort_values(by='similarity', ascending=False, ignore_index=True)

def cluster_data(document_list, epsilon, min):
    """Project documents to 2-D with TF-IDF + PCA and cluster them with DBSCAN.

    Args:
        document_list: Iterable of document strings.
        epsilon: DBSCAN `eps` (neighbourhood radius in the 2-D PCA space).
        min: DBSCAN `min_samples`. The name shadows the builtin inside this
            function only; it is kept so existing keyword callers keep working.

    Returns:
        [reduced_docs, cluster_labels]: the PCA coordinates (one row of two
        values per document) and the DBSCAN label of each document.
    """
    # Use a local vectorizer instead of the module-level `vectorizer`, which only
    # exists after generate_similarity_matrix() has been called. It is fitted
    # from scratch here either way, so the resulting matrix is the same.
    tfidf = TfidfVectorizer()
    vectorized_docs = tfidf.fit_transform(document_list)

    # Reduce the TF-IDF vectors to two dimensions for clustering and plotting.
    pca = PCA(n_components=2)
    reduced_docs = pca.fit_transform(vectorized_docs.toarray())

    dbscan = DBSCAN(eps=epsilon, min_samples=min)
    cluster_labels = dbscan.fit_predict(reduced_docs)

    return [reduced_docs, cluster_labels]

def generate_clustered_table(reduced_docs, labels, cluster_labels):
    """Tabulate each document's 2-D coordinates, label and cluster.

    Coordinates are read directly from `reduced_docs`, so 'x' and 'y' are numeric.
    Parsing them out of the array's printed form produced strings with truncated
    precision and broke whenever the printed padding was wider than expected.

    Args:
        reduced_docs: Sequence of points; element 0 is x and element 1 is y.
        labels: Sequence of document labels aligned with `reduced_docs`.
        cluster_labels: Sequence of cluster ids aligned with `reduced_docs`.

    Returns:
        DataFrame with columns 'x', 'y', 'content', 'topic', sorted by 'topic'.
        Empty (with the same columns) when there are no points.
    """
    columns = ['x', 'y', 'content', 'topic']

    # Build all rows first and create the frame once; DataFrame.append no longer
    # exists in pandas 2.x.
    records = [
        (point[0], point[1], labels[n], cluster_labels[n])
        for n, point in enumerate(reduced_docs)
    ]

    table = pd.DataFrame(records, columns=columns)
    return table.sort_values(by='topic', ignore_index=True)

def draw_viz(reduced_docs, cluster_labels, title):
    """Show a 10x10 scatter plot of the 2-D documents, coloured by cluster label.

    Switches matplotlib to the 'dark_background' style as a side effect.

    Args:
        reduced_docs: 2-D array; column 0 is plotted on x and column 1 on y.
        cluster_labels: Per-point values used for colouring ('Set3' colormap).
        title: Plot title.
    """
    plt.style.use('dark_background')

    fig, ax = plt.subplots(figsize=(10, 10))
    xs, ys = reduced_docs[:, 0], reduced_docs[:, 1]
    scatter = ax.scatter(xs, ys, c=cluster_labels, cmap='Set3')

    ax.set_title(title)
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    fig.colorbar(scatter, ax=ax)

    plt.show()

def draw_viz_with_topics(reduced_docs, cluster_labels, keywords):
    """Show a 70x70 scatter plot of the 2-D documents, annotating each point.

    Point i is annotated with `keywords[i][0:25]`: the first 25 characters when
    the entry is a string, or the first 25 elements when it is a list.
    Switches matplotlib to the 'dark_background' style as a side effect.

    Args:
        reduced_docs: 2-D array; column 0 is plotted on x and column 1 on y.
        cluster_labels: Per-point values used for colouring ('Set2' colormap).
        keywords: Sequence aligned with the rows of `reduced_docs`.
    """
    plt.style.use('dark_background')

    # Very large canvas so the small annotations stay legible.
    fig, ax = plt.subplots(figsize=(70, 70))

    xs, ys = reduced_docs[:, 0], reduced_docs[:, 1]
    scatter = ax.scatter(xs, ys, c=cluster_labels, cmap='Set2')

    for idx, txt in enumerate(keywords):
        position = (reduced_docs[idx, 0], reduced_docs[idx, 1])
        ax.annotate(txt[0:25], position, fontsize=8)

    ax.set_title('Clustered Documents with First Keyword')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    fig.colorbar(scatter, ax=ax)

    plt.show()

def clean_keywords(keywords):
    """Strip unwanted characters from every keyword and drop a few local stop words.

    Every character that is not an ASCII letter, a digit, a Polish diacritic
    letter or whitespace is removed. A keyword is dropped only when its cleaned
    form is one of 'the', 'rok', 'Rok', 'The'.

    Args:
        keywords: List of keyword lists (one per document).

    Returns:
        A new list of cleaned keyword lists, in the same order.
    """
    local_stopwords = ['the', 'rok', 'Rok', 'The']
    disallowed = r'[^a-zA-Z0-9ąćęłńóśźżĄĆĘŁŃÓŚŹŻ\s]'

    cleaned_keywords = []
    for keyword_list in keywords:
        stripped = (re.sub(disallowed, '', keyword) for keyword in keyword_list)
        cleaned_keywords.append([kw for kw in stripped if kw not in local_stopwords])
    return cleaned_keywords

# Methodology tests

Preparing data and variables

In [None]:
# Run per-article LDA; `labels` holds the titles of the articles that were modelled.
output = topic_modeling(data, content_col='content', label_col='title')
topic_list, labels = output

In [None]:
# Titles of the articles that went through topic modelling.
print(labels)

In [50]:
# Topic keywords per article, without the likelihoods.
keywords = generate_keywords(topic_list)
print(keywords)

# One space-joined keyword string per article, used as the TF-IDF input.
document_list = list(map(' '.join, keywords))
sim_matrix = generate_similarity_matrix(document_list)

# Initial clustering with eps=6 and min_samples=6.
output = cluster_data(document_list, epsilon=6, min=6)
reduced_docs, cluster_labels = output

[['rok', 'wizja', 'frank', 'warner', 'kino', 'życie', 'młody', 'świat', 'kult', 'mieszkać', 'dom', 'victory', 'miasteczko', 'ciekawy', 'czysty', 'Silberman', 'poważny', 'kochanie', 'martwić', 'amerykański', 'martwić', 'film', 'kochanie', 'alica', 'pugh', 'scena', 'drążyć', 'wiadomo', 'żona', 'dobry'], ['sierpień', 'festival', 'hop', 'hip', '13', 'quebonafide', 'grabowski', 'polish', 'weekend', 'cztery', 'trasa', 'wrzesień', 'quebo', 'metr', '4', '2', 'koncert', 'odbyć', 'zielonka', 'gig', 'Polska', 'grabowski', 'kuba', 'zaopatrzyć', 'wrześnie', 'samochodowy', 'musieć', 'mapa', 'imieniny', 'pic'], ['niejako', 'scena', 'pewien', 'płyta', 'soma', 'pokryć', 'tysiąc', 'późno', 'niebotyczny', 'osiągnąć', 'rok', 'taco', 'tour', 'quebo', 'trasa', 'kolejny', 'will', 'zostać', 'pierwszy', 'urokliwy', 'taconafide', 'skoro', 'temat', 'duet', 'psycho', 'relations', 'muzyczny', 'polski', 'dzień', 'trasa'], ['wilka', 'hemp', 'gru', 'molesta', 'rap', 'berson', 'styl', 'stać', 'zostać', 'tekst', 'rok',

Creating similarity rank

In [None]:
# Pairs of differently-titled articles whose similarity exceeds 0.2.
sim_rank = generate_similarity_rank(sim_matrix, labels, min_strength=0.2)
sim_rank

Find similar content

In [None]:
# Look up the articles similar to one chosen title (similarity above 0.1).
title = 'Nie okazuj emocji i nie proś o pomoc. O stereotypach dotyczących męskiego zdrowia psychicznego (ROZMOWA)'
similars = find_similar_content(sim_matrix, labels, content=title, min_strength=0.1)
similars

Clustered table

In [None]:
# One row per article: 2-D coordinates, title and assigned cluster.
clustered_table = generate_clustered_table(
    reduced_docs=reduced_docs,
    labels=labels,
    cluster_labels=cluster_labels,
)
clustered_table

Clustered viz

In [None]:
#draw_viz(reduced_docs, cluster_labels)
# Strip unwanted characters and drop the local stop words from every keyword list.
# Note: this rebinds `keywords`, so later cells see the cleaned lists.
keywords = clean_keywords(keywords)
#keywords = [''.join(words) for words in keywords]
print(keywords)

In [None]:
# Scatter plot of the clustered articles, each point annotated from its keyword list.
draw_viz_with_topics(reduced_docs, cluster_labels, keywords)

# Keywords exploratory analysis


In [None]:
#Top keywords for each cluster
import itertools
from collections import Counter

def aggregate_top_keywords(keywords, cluster_labels, top_n=10):
    """Count keyword occurrences per cluster and keep the most common ones.

    Args:
        keywords: Sequence of keyword iterables, one per document.
        cluster_labels: Sequence of cluster ids aligned with `keywords`.
        top_n: How many of the most frequent keywords to keep per cluster.

    Returns:
        Dict mapping each distinct cluster label to a list of (keyword, count)
        pairs as produced by `Counter.most_common(top_n)`.
    """
    top_by_cluster = {}

    for current_label in set(cluster_labels):
        counts = Counter()
        # Tally the keywords of every document assigned to this cluster.
        for doc_keywords, doc_label in zip(keywords, cluster_labels):
            if doc_label == current_label:
                counts.update(doc_keywords)
        top_by_cluster[current_label] = counts.most_common(top_n)

    return top_by_cluster

def print_top_keywords_for_each_cluster(cluster_labels, total_keywords=22962,
                                        output_path='keywords_from_6_clusters.csv'):
    """Print the 20 most common keywords of every cluster and export them to CSV.

    Reads the module-level `keywords` (one keyword list per document, aligned
    with `cluster_labels`).

    Args:
        cluster_labels: Sequence of cluster ids, one per document.
        total_keywords: Denominator for the 'of_all' share. Defaults to 22962,
            the previously hard-coded keyword total. Pass the real total when
            working with a different corpus.
        output_path: Destination CSV file; defaults to the previously hard-coded name.
    """
    keyword_data = []
    top_keywords_per_cluster = aggregate_top_keywords(keywords, cluster_labels, top_n=20)

    for cluster_label, top_keywords in top_keywords_per_cluster.items():
        print(f"Cluster {cluster_label}:")
        # Header row that separates the clusters inside the single CSV file.
        keyword_data.append([f'CLUSTER {cluster_label}', 0, 0])
        for keyword, count in top_keywords:
            print(f"- {keyword} ({count} occurrences)")
            keyword_data.append([keyword, count, count / total_keywords])
        print()

    df = pd.DataFrame(keyword_data, columns=['keyword', 'count', 'of_all'])
    df.to_csv(output_path, index=False)

# Report the number of distinct cluster labels actually present. The previous
# code printed the loop variable `i`, which is undefined on a fresh kernel.
n_clusters = len(set(cluster_labels))
print("TOP KEYWORDS FROM " + str(n_clusters) + " CLUSTERS:")
print_top_keywords_for_each_cluster(cluster_labels)


In [53]:
from collections import defaultdict

def get_cluster_keywords(cleaned_keywords, cluster_labels):
    """Count how often each keyword occurs within each cluster.

    Args:
        cleaned_keywords: Sequence of keyword iterables, one per document.
        cluster_labels: Sequence of cluster ids aligned with `cleaned_keywords`.

    Returns:
        defaultdict(dict) mapping cluster label -> {keyword: count}. A cluster
        whose documents have no keywords gets no entry.
    """
    counts_by_cluster = defaultdict(dict)

    for doc_words, cluster in zip(cleaned_keywords, cluster_labels):
        for word in doc_words:
            bucket = counts_by_cluster[cluster]
            bucket[word] = bucket.get(word, 0) + 1

    return counts_by_cluster


def print_cluster_keywords(cluster_keywords, i, total_keywords=22962):
    """Print every cluster's keyword counts and export one CSV file per cluster.

    Each cluster is written to 'cluster_<label>_keywords_from_<i>_clusters.csv'
    in the current working directory.

    Args:
        cluster_keywords: Mapping cluster label -> {keyword: count}.
        i: Value embedded in the output file names (the cluster-count setting of the run).
        total_keywords: Denominator for the 'of_all' share. Defaults to 22962,
            the previously hard-coded keyword total. Pass the real total when
            working with a different corpus.
    """
    for cluster_label, keywords in cluster_keywords.items():
        print(f"Cluster {cluster_label}:")
        # Fresh list per cluster instead of clearing a shared buffer.
        keyword_data = []
        for keyword, count in keywords.items():
            keyword_data.append([keyword, count, count / total_keywords])
            print(f"{keyword} (Count: {count})")
        df = pd.DataFrame(keyword_data, columns=['keyword', 'count', 'of_all'])
        df.to_csv(f'cluster_{cluster_label}_keywords_from_{i}_clusters.csv', index=False)

In [None]:
# Disabled experiment: re-cluster for several settings and export the keyword
# counts of every cluster. The body is commented out, so the loop needs an
# explicit `pass` to remain valid Python.
for i in range(0, 7):
  #output = cluster_data(document_list, i, i)
  #reduced_docs, cluster_labels = output[0], output[1]
  #cluster_keywords = get_cluster_keywords(keywords, cluster_labels)
  #print_cluster_keywords(cluster_keywords, i)
  pass

In [None]:
#loop through clusters with DBSCAN #do sprawdzenia jak wrócę
for i in range(1, 20):
    epsilon = i / 50
    min = 1
    title = f'DBSCAN doc. clusters. eps={epsilon} min={min}'
    output = cluster_data(document_list, epsilon, min)

    reduced_docs, cluster_labels = output[0], output[1]

    draw_viz(reduced_docs, cluster_labels, title)

In [None]:
# Clustering settings under inspection (original note: "epsilon 11 min 10").
epsilon = 0.011
# Named `min_samples` rather than `min` so the builtin min() is not shadowed.
min_samples = 10
# cluster_data() runs DBSCAN (its K-means variant is commented out), so the
# title describes DBSCAN rather than K-means.
title = f'DBSCAN doc. clusters. eps={epsilon} min={min_samples}'
output = cluster_data(document_list, epsilon, min_samples)

reduced_docs, cluster_labels = output[0], output[1]

print_top_keywords_for_each_cluster(cluster_labels)

#draw_viz(reduced_docs, cluster_labels, title)