In [12]:
import pandas as pd
import nltk
from tqdm import tqdm

In [17]:
df = pd.read_csv("dblp-v10-titles_and_date-only.csv")
df = df.head(10000)

In [14]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\romaf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\romaf\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [15]:
def token_and_filter_pos_combos(text_corpus):
      
    corpus = []
    for sent in tqdm(text_corpus):

        sent_tokens = []
        
        words = []
        for word,tag in nltk.pos_tag(nltk.word_tokenize(sent)):
            if tag in {"JJ","JJR","JJS","NN","NNS","NNP","NNPS"}:
                words.append(word)
            else:
                if len(words) > 0:
                    sent_tokens.append(" ".join(words))
                words = []
        if len(words) > 0:
                    sent_tokens.append(" ".join(words))
        corpus.append(sent_tokens)
    return(corpus)

In [18]:
terms = token_and_filter_pos_combos(df['title'].to_list())

100%|██████████| 10000/10000 [00:06<00:00, 1496.49it/s]


In [19]:
import math
from collections import Counter

def compute_tf_icf(sentence_terms):
    # Flatten list of terms for TF computation
    flattened_terms = [term for sentence in sentence_terms for term in sentence]
    term_counts = Counter(flattened_terms)
    
    # Total number of terms
    total_terms = len(flattened_terms)
    
    # Compute TF for each term
    tf = {term: count / total_terms for term, count in term_counts.items()}
    
    # Compute ICF for each term
    total_sentences = len(sentence_terms)
    term_in_sentences = Counter(term for sentence in sentence_terms for term in set(sentence))
    icf = {term: math.log(total_sentences / (1 + count)) for term, count in term_in_sentences.items()}
    
    # Compute TF-ICF
    tf_icf = {term: tf[term] * icf[term] for term in tf.keys()}
    
    return tf_icf

In [20]:
tf_icf_scores = compute_tf_icf(terms)

In [24]:
from collections import Counter

yearly_terms = {year: [] for year in df['year'].unique()}

for terms, y in zip(terms,df['year']):
    for term in terms:
        if tf_icf_scores[term] > 0.001:
            yearly_terms[y].append(term)

In [39]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [50]:
year_terms_scores = {year: [] for year in df['year'].unique()}

for year, terms in yearly_terms.items():
    scores = MinMaxScaler((0.1,1)).fit_transform(np.array(list(Counter(yearly_terms[2008]).values())).reshape(-1,1))
    year_terms_scores[year] = list(zip(terms,scores))

In [51]:
year_terms_scores

{2008: [('new approach', array([0.1])),
  ('image segmentation', array([0.1])),
  ('Application', array([0.9])),
  ('Integration', array([0.2])),
  ('Wireless Ad Hoc Networks', array([0.2])),
  ('people', array([0.1])),
  ('mobile robot', array([0.2])),
  ('evolution', array([0.2])),
  ('Research', array([0.9])),
  ('IP', array([0.3])),
  ('control', array([0.4])),
  ('OFDM systems', array([0.1])),
  ('Application', array([0.2])),
  ('Research', array([0.2])),
  ('tracking', array([0.1])),
  ('Impact', array([0.3])),
  ('performance', array([0.8])),
  ('objects', array([0.1])),
  ('Application', array([0.2])),
  ('Analysis', array([0.3])),
  ('effects', array([0.2])),
  ('Number', array([0.1])),
  ('H.264/AVC', array([0.5])),
  ('Sensor Networks', array([0.3])),
  ('Distributed', array([0.2])),
  ('Design', array([0.1])),
  ('Efficient', array([0.5])),
  ('implications', array([0.2])),
  ('power', array([0.1])),
  ('Study', array([0.2])),
  ('Integration', array([0.2])),
  ('simulation