### IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the train and test splits of the topic dataset (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../dataset/topic/train.csv", "../dataset/topic/test.csv")
)

In [4]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

(20972, 9)

In [5]:
# Preview the first rows to check which columns were loaded.
df_train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [7]:
# Flag every row whose ABSTRACT text already appeared in an earlier row;
# the first occurrence of each abstract stays unflagged.
dup = df_train.duplicated(subset=["ABSTRACT"], keep="first")

In [8]:
# Show the flagged rows; an empty frame means no abstract is repeated.
df_train[dup]

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance


In [11]:
# Drop the ID column.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" tolerates an already-missing column, so this cell can be
# re-run without raising KeyError on the second execution.
df_train = df_train.drop(columns="ID", errors="ignore")

In [12]:
# Build the (still unfitted) estimators used in the cells below.
# Bag-of-words counts: terms found in more than 90% of the documents or in
# fewer than 2 documents are discarded, as are English stop words.
cv = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.9,
)
# NMF with six components and a fixed random_state for repeatable runs.
nmf_model = NMF(random_state=0, n_components=6)

In [13]:
# Pull out the raw abstract text of each split.
# NOTE: despite its name, `y` holds the *test* abstracts, not target labels.
X, y = df_train["ABSTRACT"], df_test["ABSTRACT"]

In [14]:
# Number of training abstracts.
X.shape

(20972,)

In [15]:
# Number of test abstracts.
y.shape

(8989,)

### Vectorize X and y before training

In [16]:
# Learn the vocabulary from the training abstracts and convert them into a
# document-term count matrix in a single step.
X_train = cv.fit_transform(X)

In [17]:
# Encode the test abstracts with the already-fitted vectorizer (transform only,
# no refit), so the test matrix has the same columns as the training matrix.
y_test = cv.transform(y)

In [18]:
# (documents, vocabulary terms) of the training count matrix.
X_train.shape

(20972, 27580)

In [19]:
# (documents, vocabulary terms) of the test count matrix.
y_test.shape

(8989, 27580)

In [20]:
# Fit the NMF model on the training count matrix; the learned topic-term
# weights are then available as nmf_model.components_.
nmf_model.fit(X_train)

In [23]:
# Print the 15 highest-weighted vocabulary terms of every NMF topic.
# The vocabulary lookup is hoisted out of the loop: get_feature_names_out()
# rebuilds the full feature array on each call, and the previous version
# called it once per printed word (15 times per topic).
feature_names = cv.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    print(f"TOPIC NUM {index+1}", "-"*20, sep="\n")
    # argsort is ascending, so the last 15 positions carry the largest weights
    # and the strongest term of the topic is printed last.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print("*"*20, "\n")

TOPIC NUM 1
--------------------
['propose', 'tasks', 'using', 'models', 'task', 'neural', 'approach', 'machine', 'performance', 'method', 'training', 'methods', 'deep', 'based', 'learning']
******************** 

TOPIC NUM 2
--------------------
['optimization', 'function', 'linear', 'optimal', 'proposed', 'based', 'results', 'algorithms', 'problems', 'number', 'paper', 'method', 'time', 'algorithm', 'problem']
******************** 

TOPIC NUM 3
--------------------
['time', 'use', 'high', 'set', 'methods', 'based', 'large', 'real', 'information', 'used', 'method', 'approach', 'using', 'analysis', 'data']
******************** 

TOPIC NUM 4
--------------------
['prediction', 'used', 'using', 'dynamics', 'approach', 'time', 'process', 'bayesian', 'distribution', 'inference', 'proposed', 'parameters', 'based', 'models', 'model']
******************** 

TOPIC NUM 5
--------------------
['density', 'temperature', 'low', 'study', 'quantum', 'mass', 'systems', 'spin', 'results', 'magnetic', 

In [24]:
# Project the test documents onto the fitted topics, then keep, for each
# document, the index of the topic with the largest weight.
# NOTE: these indices are 0-based, whereas the topic printout labels topics
# starting from 1 (it prints index+1).
topic_weights = nmf_model.transform(y_test)
predicted = topic_weights.argmax(axis=1)

In [25]:
# Inspect the predicted topic index of each test document.
predicted

array([3, 4, 3, ..., 2, 1, 1])

In [26]:
# Attach the predicted topic index to each test row as a new "Topics" column
# (this modifies df_test in place).
df_test["Topics"] = predicted

In [27]:
# Preview the test frame with the newly added "Topics" column.
df_test.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Topics
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...,3
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...,4
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...,3
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...,4
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...,1
