In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/content/data.csv')

In [3]:
data.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
data.shape

(20972, 8)

In [5]:
data.isna().sum()

TITLE                   0
ABSTRACT                0
Computer Science        0
Physics                 0
Mathematics             0
Statistics              0
Quantitative Biology    0
Quantitative Finance    0
dtype: int64

In [6]:
data.duplicated().sum()

0

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20972 entries, 0 to 20971
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   TITLE                 20972 non-null  object
 1   ABSTRACT              20972 non-null  object
 2   Computer Science      20972 non-null  int64 
 3   Physics               20972 non-null  int64 
 4   Mathematics           20972 non-null  int64 
 5   Statistics            20972 non-null  int64 
 6   Quantitative Biology  20972 non-null  int64 
 7   Quantitative Finance  20972 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.3+ MB


In [8]:
data['text'] = data['TITLE'] + ' ' + data['ABSTRACT']

In [9]:
labels = data[['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf_vectorizer.fit_transform(data['text'])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

In [13]:
X_train.shape

(16777, 5000)

In [33]:
model = Sequential()
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(82, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(labels.shape[1], activation='sigmoid'))

In [34]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32,verbose=0)

In [42]:
pred_test = model.predict(X_test)
accuracy_test = accuracy_score(y_test, pred_test.round())
print(f'Test Accuracy: {accuracy_test * 100:.2f}%')

Test Accuracy: 63.27%
