In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your documents from JSON file
df = pd.read_json('updated_articles.json')

documents = list(df['content'])




In [2]:
%%capture
!pip install top2vec
!pip install top2vec[sentence_transformers]
!pip install top2vec[indexing]

In [3]:
from top2vec import Top2Vec

model = Top2Vec(documents)

2023-06-07 12:28:40,303 - top2vec - INFO - Pre-processing documents for training
2023-06-07 12:28:55,399 - top2vec - INFO - Creating joint document/word embedding
2023-06-07 12:41:12,075 - top2vec - INFO - Creating lower dimension embedding of documents
2023-06-07 12:41:40,164 - top2vec - INFO - Finding dense areas of documents
2023-06-07 12:41:40,317 - top2vec - INFO - Finding topics


In [None]:
model.get_num_topics()

documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=48, num_docs=5)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

topic_vectors = model.topic_vectors

# Get document vectors
document_vectors = model.document_vectors

# Compute cosine similarity between each document and each topic
similarity_scores = cosine_similarity(document_vectors, topic_vectors)

In [7]:
similarity_scores

array([[ 0.60097307,  0.11686211,  0.1529768 , ...,  0.18878374,
         0.09737724,  0.02042905],
       [ 0.08555493,  0.41281393,  0.10883187, ...,  0.04536572,
         0.08899521,  0.06402918],
       [ 0.2375367 ,  0.16252168,  0.1065956 , ...,  0.07200617,
        -0.00139157,  0.01200845],
       ...,
       [ 0.18150114,  0.08964967,  0.10746752, ...,  0.07415494,
         0.11433862,  0.01120989],
       [ 0.11846441,  0.02828089,  0.18154725, ...,  0.13294168,
         0.08349875,  0.08518001],
       [ 0.14913788,  0.09929604,  0.17272446, ...,  0.12060062,
         0.12759559,  0.15703005]], dtype=float32)

In [8]:
import numpy as np

# Add a new column for each topic
for i in range(model.get_num_topics()):
    df[f'topic_{i}_similarity'] = similarity_scores[:, i]

# The DataFrame now contains a similarity score for each topic, for each document


In [9]:
df.head()

Unnamed: 0,title,content,url,date,topic_0_similarity,topic_1_similarity,topic_2_similarity,topic_3_similarity,topic_4_similarity,topic_5_similarity,...,topic_89_similarity,topic_90_similarity,topic_91_similarity,topic_92_similarity,topic_93_similarity,topic_94_similarity,topic_95_similarity,topic_96_similarity,topic_97_similarity,topic_98_similarity
0,\n Sheffield waste site closed down by the ...,Sheffield waste site closed down by the ...,https://www.gov.uk/government/news/sheffield-w...,24 March 2023,0.600973,0.116862,0.152977,0.126532,0.065291,0.057118,...,0.094514,0.149445,0.048605,0.091044,0.060931,0.077867,0.060129,0.188784,0.097377,0.020429
1,\n Repairs underway as Environment Agency b...,Repairs underway as Environment Agency b...,https://www.gov.uk/government/news/repairs-und...,4 December 2019,0.085555,0.412814,0.108832,0.127525,0.079613,0.09698,...,0.080423,0.074399,0.130517,0.066417,-0.007342,0.087038,0.077386,0.045366,0.088995,0.064029
2,\n Environment Agency prosecutes Lancashire...,Environment Agency prosecutes Lancashire...,https://www.gov.uk/government/news/environment...,9 October 2019,0.237537,0.162522,0.106596,0.026424,0.087227,0.116541,...,0.075881,0.077766,0.174701,0.199341,0.066111,0.073539,0.080055,0.072006,-0.001392,0.012008
3,\n Defra environmental targets consultation...,Defra environmental targets consultation...,https://www.gov.uk/government/publications/def...,9 May 2022,0.008281,-0.070898,0.147877,0.16077,0.157935,0.118342,...,0.240078,0.168988,-0.005139,0.187777,0.054546,-0.049893,0.089565,0.043162,0.076417,0.05855
4,\n Environment Agency completes third phase...,Environment Agency completes third phase...,https://www.gov.uk/government/news/environment...,1 November 2019,0.048107,0.298188,0.154571,0.192587,0.10283,0.110981,...,0.050015,0.091827,0.102754,0.105185,-0.003325,0.147307,-0.049214,0.05361,0.092268,0.010821


In [10]:
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y', errors='coerce')
df.index = df['date']
df['year-month'] = df.index.to_period('M')
df['Date (by month)'] = df['year-month'].dt.to_timestamp()
df = df.set_index('Date (by month)') 
columns = [f'topic_{i}_similarity' for i in range(30)]
columns.append('year-month')
df = df[columns]

  df['year-month'] = df.index.to_period('M')
  df['Date (by month)'] = df['year-month'].dt.to_timestamp()


In [11]:
df

Unnamed: 0_level_0,topic_0_similarity,topic_1_similarity,topic_2_similarity,topic_3_similarity,topic_4_similarity,topic_5_similarity,topic_6_similarity,topic_7_similarity,topic_8_similarity,topic_9_similarity,...,topic_21_similarity,topic_22_similarity,topic_23_similarity,topic_24_similarity,topic_25_similarity,topic_26_similarity,topic_27_similarity,topic_28_similarity,topic_29_similarity,year-month
Date (by month),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-01,0.600973,0.116862,0.152977,0.126532,0.065291,0.057118,0.135699,0.061117,0.064079,0.144596,...,0.092566,0.052333,0.053400,0.160390,0.186194,0.213895,0.074957,0.157876,0.111489,2023-03
2019-12-01,0.085555,0.412814,0.108832,0.127525,0.079613,0.096980,0.128047,0.369517,0.138184,0.133855,...,0.155776,0.212945,0.025243,0.142756,0.108323,0.118673,0.028985,0.097481,0.093908,2019-12
2019-10-01,0.237537,0.162522,0.106596,0.026424,0.087227,0.116541,0.082645,0.098496,0.048983,0.093177,...,0.165720,-0.002969,0.037416,0.000263,0.013248,0.076760,-0.006454,0.103596,0.148852,2019-10
2022-05-01,0.008281,-0.070898,0.147877,0.160770,0.157935,0.118342,0.054254,-0.069619,0.119019,0.096035,...,0.037714,-0.119736,0.126655,0.108750,0.116068,0.083650,0.037453,0.048321,0.000309,2022-05
2019-11-01,0.048107,0.298188,0.154571,0.192587,0.102830,0.110981,0.140758,0.118302,0.174448,0.117888,...,0.094814,0.094853,0.058955,0.127516,0.051165,0.112200,0.036776,0.135322,0.023497,2019-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-04-01,0.020516,0.036847,0.521648,0.316311,0.151042,0.097944,0.255438,0.029151,0.280949,0.199152,...,0.022348,0.083780,0.225536,0.119354,0.142286,0.185334,0.116011,0.147450,0.016030,2019-04
2021-10-01,0.042681,0.156859,0.213683,0.225342,0.161340,0.133911,0.179224,0.019634,0.195036,0.204693,...,0.070786,0.024922,0.220774,0.366486,0.211140,0.134815,0.073847,0.187672,-0.032083,2021-10
2022-08-01,0.181501,0.089650,0.107468,0.104925,0.065287,0.143004,0.003181,0.058268,0.089805,0.240151,...,0.035332,0.051418,0.035767,0.143279,0.069771,0.164590,0.083749,0.057017,0.213693,2022-08
2017-09-01,0.118464,0.028281,0.181547,0.383438,0.172503,0.190236,0.137838,-0.029558,0.329641,0.150474,...,0.051907,-0.036328,0.027086,0.170900,0.228942,0.141577,0.154761,0.103971,0.013310,2017-09
