In [None]:
import pandas as pd

In [None]:
# get projects data: read the Excel workbook data/project.xlsx into a DataFrame
df_project = pd.read_excel("data/project.xlsx")

In [None]:
df_project.head()

In [None]:
# load the table that supplies the classification identifiers (data/euroSciVoc.xlsx);
# it is joined onto the projects further down in the notebook
df_euro = pd.read_excel("data/euroSciVoc.xlsx")

In [None]:
df_euro.head()

In [None]:
# expose `projectID` under the name `id` too, so both frames share a merge key
df_euro = df_euro.assign(id=df_euro["projectID"])

In [None]:
# join the projects with the classification rows on the shared `id` column
result = df_project.merge(df_euro, on="id")

In [None]:
result.head()

In [None]:
# look only at MSCA PF: keep rows whose `topics` string contains both "MSCA" and "PF".
# na=False counts a missing `topics` value as "no match"; without it the mask holds NaN
# and cannot be used for boolean indexing.
# .copy() makes `df` an independent frame, so columns added to it later do not write
# into a slice of `result` (avoids SettingWithCopyWarning).
df = result[
    result['topics'].str.contains("MSCA", na=False)
    & result['topics'].str.contains("PF", na=False)
].copy()

In [None]:
df['euroSciVocPath']

In [None]:
# let's extract the main topic: the element at index 1 after splitting
# `euroSciVocPath` on '/'.
# `.str.get(1)` gives NaN both for missing paths and for paths with fewer than two
# components; the earlier `len(x)` lambda raised TypeError when a path was NaN.
# `assign` builds a new frame rather than inserting a column into what may be a
# slice of `result`, and makes the cell safe to re-run.
df = df.assign(mainTopic=df['euroSciVocPath'].str.split('/').str.get(1))

In [None]:
# let's look at the main topics
df['mainTopic'].value_counts()

In [None]:
# economics: classification paths that mention "economics"
df.loc[df['euroSciVocPath'].str.contains("economics"), 'euroSciVocPath']

In [None]:
# physics: classification paths that mention "physical sciences"
df.loc[df['euroSciVocPath'].str.contains("physical sciences"), 'euroSciVocPath']

In [None]:
# chemistry: classification paths that mention "chemical sciences"
df.loc[df['euroSciVocPath'].str.contains("chemical sciences"), 'euroSciVocPath']

In [None]:
# engineering: classification paths that mention "engineering and technology"
df.loc[df['euroSciVocPath'].str.contains("engineering and technology"), 'euroSciVocPath']

In [None]:
# environmental and geosciences: classification paths that mention "environment"
df.loc[df['euroSciVocPath'].str.contains("environment"), 'euroSciVocPath']

In [None]:
# life sciences: selected via classification paths that mention "medical and health"
df.loc[df['euroSciVocPath'].str.contains("medical and health"), 'euroSciVocPath']

In [None]:
# math: classification paths that mention "math"
df.loc[df['euroSciVocPath'].str.contains("math"), 'euroSciVocPath']

In [None]:
# social sciences, leaving out every path that also mentions economics
df.loc[
    df['euroSciVocPath'].str.contains("social sciences")
    & ~df['euroSciVocPath'].str.contains("economics"),
    'euroSciVocPath',
]

### Only for social sciences

In [None]:
# social-science projects: path mentions "social sciences" but not "economics"
soc = df.loc[
    lambda frame: frame['euroSciVocPath'].str.contains("social sciences")
    & ~frame['euroSciVocPath'].str.contains("economics")
]

In [None]:
soc.head()

In [None]:
soc[soc['euroSciVocPath'].str.contains("sociology")]

### Topic Analysis

In [None]:
import spacy

print(spacy.__version__)
import numpy as np
np.random.seed(0)
from pprint import pprint
from time import time

import os,sys
import math
import csv

## custom packages
# Put the local `src` directory on the import path (presumably where the
# `filter_words` and `helper_functions` modules imported below live).
# The path is relative, so it is resolved against the current working directory.
src_dir = 'src'
# Guarded so that re-running this cell does not keep appending duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from filter_words import run_stopword_statistics
from filter_words import make_stopwords_filter
from filter_words import remove_stopwords_from_list_texts

import spacy
nlp = spacy.load("en_core_web_md")

from helper_functions import clean_stopwords, coherence_per_topic, find_best_n_topics, get_clean_output, get_list, get_top_n_words, get_topics_from_model, has_numbers, my_lemmatizer, plot_top_words, plot_top_words_colors, process_words

import pandas as pd
import gensim
import re

from time import time
import matplotlib.pyplot as plt

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection  import GridSearchCV
from gensim.models.coherencemodel import CoherenceModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from joblib import dump, load
import seaborn as sns

## Stopwords

In [None]:
## creating set of stopwords using Sarica and Luo 2021 paper
path_stopword_list = os.path.join('data', 'sarica_and_luo_2021.txt')

# Start from an empty set so `stopwords` is always defined, even when no
# stop-word file is configured (previously it only existed inside the `if`).
stopwords = set()
if path_stopword_list is not None:
    with open(path_stopword_list, 'r', encoding='utf-8') as f:
        # Each line may hold several whitespace-separated words; all are lower-cased.
        # `split()` (rather than `split(' ')`) skips blank lines and repeated spaces
        # instead of adding the empty string as a "stop word".
        stopwords = {word.lower() for line in f for word in line.split()}

## remove all acronyms
# Project acronyms are added as-is (not lower-cased); missing acronyms are dropped
# so that NaN does not end up in the stop-word set.
stopwords = stopwords.union(set(df['acronym'].dropna()))

In [None]:
# `objective` column of the social-science subset, as a plain Python list
texts = soc['objective'].tolist()

In [None]:
len(texts[2])

In [None]:
"" in texts

## Clean stopwords and uninformative words

In [None]:
# Run the project helper `get_list` on the texts (noun processing, custom stop words),
# then pass its result through `get_clean_output`.
# NOTE(review): both helpers come from helper_functions; their exact behaviour
# is not visible in this notebook.
lst = get_list(
    texts=texts,
    stop_words=stopwords,
    processing_choice='nouns',
    N_s=100,
    cutoff_val=0.5,
    path_to_file='data/obj_',
)
output = get_clean_output(lst)

## Models

In [None]:
def scorer_sklearn(estimator, X, y=None):
    """Return the gensim ``c_v`` coherence of a fitted topic model.

    The ``(estimator, X, y)`` signature is kept, but ``X`` and ``y`` are never
    read. The function depends on notebook-level globals that must exist when
    it is called (they are defined in later cells):

    * ``vectorizer`` and ``n_top_words`` -- passed to ``get_topics_from_model``;
    * ``output`` -- supplies ``list_texts_filter``, ``corpus_filter`` and
      ``dictionary_filter`` for the ``CoherenceModel``.

    Parameters
    ----------
    estimator : fitted model handed to ``get_topics_from_model``.
    X : ignored.
    y : ignored.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    # top `n_top_words` terms per topic, extracted by the project helper
    top_terms = get_topics_from_model(estimator, vectorizer, n_top_words)

    # single-process coherence computation over the filtered corpus
    return CoherenceModel(
        topics=top_terms,
        texts=output['list_texts_filter'],
        corpus=output['corpus_filter'],
        dictionary=output['dictionary_filter'],
        coherence='c_v',
        topn=n_top_words,
        processes=1,
    ).get_coherence()

In [None]:
# filtered documents returned by get_clean_output
texts_nouns = output['texts_filter']

# corpus size / feature count, followed by the model hyper-parameters
n_samples, n_features = len(texts_nouns), output['n_features']
n_components, n_top_words = 50, 20

# TF-IDF with default settings: no max_features and no stop_words are passed
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts_nouns)

In [None]:
tfidf.shape

In [None]:
# NMF with Kullback-Leibler loss and the multiplicative-update solver
model = NMF(
    n_components=n_components, random_state=82,
    beta_loss="kullback-leibler", solver="mu", init="nndsvda",
    max_iter=200, alpha_W=0.01, l1_ratio=0.2,
)
model.fit(tfidf)

# coherence of the fitted model, shown as the cell output
scorer_sklearn(model, texts_nouns)

In [None]:
# vocabulary terms learned by the fitted vectorizer
tf_feature_names = vectorizer.get_feature_names_out()
# Use a descriptive figure title instead of the literal placeholder "title",
# so this plot is distinguishable from the one in the physics section.
# NOTE(review): assumes the 4th argument of plot_top_words is the figure title — confirm.
plot_top_words(model, tf_feature_names, n_top_words, "Top words per NMF topic (social sciences)");

## Visualization

In [None]:
from __future__ import print_function

import pyLDAvis

import pyLDAvis.lda_model
# turn on pyLDAvis' notebook display mode
pyLDAvis.enable_notebook()

# NOTE(review): `model` is the NMF estimator fitted earlier, while `prepare` comes
# from pyLDAvis' `lda_model` module — confirm that passing an NMF model is intended.
viz = pyLDAvis.lda_model.prepare(model, tfidf, vectorizer)
# bare expression so the prepared visualization is rendered as the cell output
viz

### Physics

In [None]:
# physics projects: classification path mentions "physical sciences"
phys = df.loc[df['euroSciVocPath'].str.contains("physical sciences")]

In [None]:
# `objective` column of the physics subset, as a plain Python list
texts = phys['objective'].tolist()

In [None]:
len(texts)

In [None]:
# Same helper pipeline as for the social-science texts, now on the physics
# objectives and with the 'data/phys_' file prefix.
# NOTE(review): helper behaviour is defined in helper_functions, not visible here.
lst = get_list(
    texts=texts,
    stop_words=stopwords,
    processing_choice='nouns',
    N_s=100,
    cutoff_val=0.5,
    path_to_file='data/phys_',
)
output = get_clean_output(lst)

In [None]:
# filtered physics documents returned by get_clean_output
texts_nouns = output['texts_filter']

# corpus size / feature count, followed by the model hyper-parameters
n_samples, n_features = len(texts_nouns), output['n_features']
n_components, n_top_words = 50, 20

# TF-IDF with default settings: no max_features and no stop_words are passed
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts_nouns)

In [None]:
# NMF with Kullback-Leibler loss and the multiplicative-update solver (physics corpus)
model = NMF(
    n_components=n_components, random_state=82,
    beta_loss="kullback-leibler", solver="mu", init="nndsvda",
    max_iter=200, alpha_W=0.01, l1_ratio=0.2,
)
model.fit(tfidf)

# coherence of the fitted model, shown as the cell output
scorer_sklearn(model, texts_nouns)

In [None]:
# vocabulary terms learned by the fitted vectorizer
tf_feature_names = vectorizer.get_feature_names_out()
# Use a descriptive figure title instead of the literal placeholder "title",
# so this plot is distinguishable from the social-sciences one.
# NOTE(review): assumes the 4th argument of plot_top_words is the figure title — confirm.
plot_top_words(model, tf_feature_names, n_top_words, "Top words per NMF topic (physical sciences)");

In [None]:
from __future__ import print_function

import pyLDAvis

import pyLDAvis.lda_model
# turn on pyLDAvis' notebook display mode
pyLDAvis.enable_notebook()

# NOTE(review): `model` is the NMF estimator fitted on the physics texts, while
# `prepare` comes from pyLDAvis' `lda_model` module — confirm this use is intended.
viz = pyLDAvis.lda_model.prepare(model, tfidf, vectorizer)
# bare expression so the prepared visualization is rendered as the cell output
viz