In [None]:
# Install the Kaggle API token where the kaggle CLI looks for it.
# NOTE(review): `mv` removes kaggle.json from the working directory, so
# re-running this cell fails unless the token is uploaded again.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI (saved to /content per the output below).
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

Downloading multilabel-classification-dataset.zip to /content
 87% 10.0M/11.4M [00:01<00:00, 12.0MB/s]
100% 11.4M/11.4M [00:01<00:00, 7.52MB/s]


In [None]:
# Extract train.csv, test.csv and sample_submission.csv.
# -o overwrites existing files without prompting, so re-running the cell
# (e.g. Restart & Run All) does not stall on an interactive "replace?" prompt.
!unzip -o /content/multilabel-classification-dataset.zip

Archive:  /content/multilabel-classification-dataset.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
# Locations of the extracted CSV files (Colab working directory).
train_path = '/content/train.csv'
test_path = '/content/test.csv'

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [None]:
# Load both splits; the ABSTRACT column and the label columns are used below.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
def clean_text(text):
    """Normalise raw text before it is handed to the vectoriser.

    The text is lower-cased, a fixed set of English contractions is
    expanded, every non-word character is replaced by a space, runs of
    whitespace are collapsed to one space and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (the usual missing-value markers
        in a pandas column) are treated as empty text; any other
        non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # `text != text` is only true for NaN; avoids needing math/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    text = re.sub(r"what's", "what is ", text)
    # Must run before the generic 's rule below; otherwise the leading
    # "'s" of "'scuse" is stripped first and this pattern can never match.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" is handled before the generic n't rule so it becomes "cannot".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: '\W' and '\s' are invalid escapes in plain string literals.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

In [None]:
# Clean BOTH splits: the vectoriser is fitted on the training abstracts and
# then applied to the test abstracts, so they must be normalised identically.
train_df['ABSTRACT'] = train_df['ABSTRACT'].map(clean_text)
test_df['ABSTRACT'] = test_df['ABSTRACT'].map(clean_text)

In [None]:
# Abstract text of each split; these Series are what gets vectorised.
X = train_df['ABSTRACT']
test_X = test_df['ABSTRACT']

In [None]:
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser: drops English stop words and keeps at most 5000 terms.
vect = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
vect

In [None]:
# Fit the vectoriser on the training abstracts and encode them in one step;
# the result is a sparse matrix (see the repr below).
X_vec = vect.fit_transform(X)
X_vec

<20972x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1182453 stored elements in Compressed Sparse Row format>

In [None]:
# Encode the test abstracts with the already-fitted vectoriser (transform only, no refit).
test_X_vec = vect.transform(test_X)

In [None]:
# Re-display the training matrix; X_vec is not modified between the fit cell and here.
X_vec

<20972x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1182453 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Submission template; the training loop below overwrites one column per label in cols_target.
submission_chains = pd.read_csv('/content/sample_submission.csv')

# create a function to add features
def add_feature(X, feature_to_add):
    """Append one or more feature columns to the right of ``X``.

    ``feature_to_add`` is a single feature (one value per row of ``X``) or a
    list of such features; each feature becomes one new column.

    Returns the widened feature matrix as a sparse matrix in CSR format.
    """
    from scipy import sparse

    # Features arrive row-wise (one row per feature); transpose so that each
    # feature becomes a column aligned with the rows of X.
    new_columns = sparse.csr_matrix(feature_to_add).transpose()
    return sparse.hstack([X, new_columns], format='csr')

In [None]:
# Label columns, in the order the per-label models are trained.
cols_target = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]


In [None]:
import tensorflow as tf
from sklearn.metrics import accuracy_score



# Classifier chain: one single-layer sigmoid model per label, where every
# model after the first also sees the labels of the preceding targets as
# extra input columns (true labels for training rows, predicted labels for
# test rows).
#
# X_dtm / test_X_dtm hold the growing sparse feature matrices. Previously the
# extra column was appended to the base matrix and the result discarded, so
# every model was trained on the plain TF-IDF features only.
X_dtm = X_vec
test_X_dtm = test_X_vec

for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label].values.astype(float)

    # Keras needs dense input here; densify the current chain matrices.
    X_vec_dense = X_dtm.toarray()
    test_X_vec_dense = test_X_dtm.toarray()

    # Drop Keras global state left by the previous label's model so memory
    # does not accumulate while building several models in a loop.
    tf.keras.backend.clear_session()

    # Single dense unit with sigmoid activation on the current feature width.
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_vec_dense.shape[1],)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_vec_dense, y, epochs=20, batch_size=32)

    # predict() returns shape (n, 1); flatten so it lines up with the 1-D y.
    y_pred_X = (model.predict(X_vec_dense).ravel() > 0.5).astype(float)
    print('Training Accuracy is {}'.format(accuracy_score(y, y_pred_X)))

    # The submission stores probabilities; the chain uses thresholded labels.
    test_y_prob = model.predict(test_X_vec_dense).ravel()
    submission_chains[label] = test_y_prob
    test_y = (test_y_prob > 0.5).astype(float)

    # Extend the chain for the next label.
    X_dtm = add_feature(X_dtm, y)
    test_X_dtm = add_feature(test_X_dtm, test_y)
    print('Shape of X_dtm is now {}'.format(X_dtm.shape))


... Processing Computer Science
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy is 0.8938584779706275
Shape of X_dtm is now (20972, 5001)
... Processing Physics
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy is 0.9505054358191874
Shape of X_dtm is now (20972, 5001)
... Processing Mathematics
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy is 0.9250905969864581
Shape of X_dtm is now (20972, 500

In [None]:
"""
Epoch 20/20
656/656 [==============================] - 2s 3ms/step - loss: 0.0191 - accuracy: 0.9924
656/656 [==============================] - 2s 2ms/step
Training Accuracy is 0.9927522410833493
""