In [None]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d shivanandmn/multilabel-classification-dataset

Downloading multilabel-classification-dataset.zip to /content
  0% 0.00/11.4M [00:00<?, ?B/s] 70% 8.00M/11.4M [00:00<00:00, 80.5MB/s]
100% 11.4M/11.4M [00:00<00:00, 73.4MB/s]


In [None]:
!unzip /content/multilabel-classification-dataset.zip

Archive:  /content/multilabel-classification-dataset.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
train_path = '/content/train.csv'
test_path = '/content/test.csv'

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [None]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [None]:
train_df['ABSTRACT'] = train_df['ABSTRACT'].map(lambda com : clean_text(com))

In [None]:
X = train_df.ABSTRACT
test_X = test_df.ABSTRACT

In [None]:
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=5000,stop_words='english')
vect

In [None]:
X_vec = vect.fit_transform(X)
X_vec

<20972x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1182370 stored elements in Compressed Sparse Row format>

In [None]:
test_X_vec = vect.transform(test_X)

In [None]:
X_vec

<20972x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1182370 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

submission_chains = pd.read_csv('/content/sample_submission.csv')

# create a function to add features
def add_feature(X, feature_to_add):
    '''
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    '''
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [None]:
cols_target = ['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


weights = np.random.randn(X_vec.shape[1], 1)
biases = np.zeros((1, 1))

for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label].values.astype(float)


    X_vec_dense = X_vec.toarray()
    test_X_vec_dense = test_X_vec.toarray()


    for epoch in range(20):

        output = sigmoid(np.dot(X_vec_dense, weights) + biases)


        error = output - y.reshape(-1, 1)
        weights -= np.dot(X_vec_dense.T, error)
        biases -= np.sum(error)


    y_pred_X = (sigmoid(np.dot(X_vec_dense, weights) + biases) > 0.5).astype(float)
    print('Training Accuracy is {}'.format(accuracy_score(y, y_pred_X)))


    test_y_prob = sigmoid(np.dot(test_X_vec_dense, weights) + biases)
    submission_chains[label] = test_y_prob


    X_dtm = add_feature(X_vec_dense, y)
    print('Shape of X_dtm is now {}'.format(X_dtm.shape))


... Processing Computer Science


  return 1 / (1 + np.exp(-x))


Training Accuracy is 0.4097844745374785
Shape of X_dtm is now (20972, 5001)
... Processing Physics


  return 1 / (1 + np.exp(-x))


Training Accuracy is 0.7132843791722296


  return 1 / (1 + np.exp(-x))


Shape of X_dtm is now (20972, 5001)
... Processing Mathematics
Training Accuracy is 0.7321190158306313


  return 1 / (1 + np.exp(-x))


Shape of X_dtm is now (20972, 5001)
... Processing Statistics
Training Accuracy is 0.7517642571047111


  return 1 / (1 + np.exp(-x))


Shape of X_dtm is now (20972, 5001)
... Processing Quantitative Biology
Training Accuracy is 0.9720102994468816


  return 1 / (1 + np.exp(-x))


Shape of X_dtm is now (20972, 5001)
... Processing Quantitative Finance
Training Accuracy is 0.9881270265115392


  return 1 / (1 + np.exp(-x))


Shape of X_dtm is now (20972, 5001)


In [None]:
def binary_cross_entropy(y_true, y_pred):


    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)


    bce = - (y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))


    mean_bce = np.mean(bce)

    return mean_bce

In [None]:
print('Training Accuracy is {}'.format(accuracy_score(y, y_pred_X)))
print('Training LOSS is {}'.format(binary_cross_entropy(y, y_pred_X)))


Training Accuracy is 0.9881270265115392
Training LOSS is 0.4100779764605841
