### We start by importing our libraries and loading the dataset

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB

In [2]:
# Directory holding train.csv. Set the DATA_DIR environment variable to run the
# notebook on another machine; the fallback is the location originally used.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/Users/mohsenboughriou/Downloads/archive"))
df = pd.read_csv(DATA_DIR / "train.csv")

In [3]:
# Preview the first rows: ID, TITLE, ABSTRACT and six 0/1 topic indicator columns.
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [25]:
# Everything from the fourth column onwards is a topic indicator column.
c = df.loc[:, df.columns[3:]]
c

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,1,0,0,1,0,0
...,...,...,...,...,...,...
20967,1,1,0,0,0,0
20968,0,1,0,0,0,0
20969,1,0,0,0,0,0
20970,0,0,1,1,0,0


In [26]:
# Number of papers tagged with each topic, kept as a single-row 2-D array
# (one entry per topic column, in column order).
classes = c.to_numpy().sum(axis=0, keepdims=True)
classes

array([[8594, 6013, 5618, 5206,  587,  249]])

### We see that the last two topics have a very small representation in the dataset. We simply drop them!

In [27]:
# Keep only the first four topic columns (Computer Science, Physics,
# Mathematics, Statistics); the two sparsely represented topics are left out.
topics = pd.DataFrame(df[df.columns[3:7]])

In [29]:
# Per-topic paper counts for the four retained columns.
topics.sum(axis=0)

Computer Science    8594
Physics             6013
Mathematics         5618
Statistics          5206
dtype: int64

### We create one target column that represents the 4 classes together by values from 0 to 3.

In [7]:
# Collapse the four indicator columns into one integer label per paper
# (0 = Computer Science, 1 = Physics, 2 = Mathematics, 3 = Statistics).
# np.argmax already works row-wise over the whole array, so a single call is
# enough; the previous loop recomputed the same result once for every row.
# NOTE: argmax returns the first maximum, so a paper tagged with several topics
# keeps only the earliest-listed one, and a paper with none of these four tags
# (an all-zero row) is labelled 0.
t = np.argmax(topics.values, axis=1)
t

array([0, 0, 2, ..., 0, 2, 2])

In [8]:
def unique(list1):
    """Print every distinct element of ``list1``, one per line.

    Duplicates are removed by converting the input to a ``set``, so elements
    are printed in set-iteration order. Returns ``None``.
    """
    for value in set(list1):
        print(value)
# List the distinct integer labels that occur in the target vector `t`.
print('the numerical representation of our classes is as follows:')
unique(t)

the numerical representation of our classes is as follows:
0
1
2
3


In [9]:
# Target vector for the classifier: an independent copy of the integer labels.
y = np.array(t, copy=True)

### We combine the title and abstract columns together and then we clean our text.

In [10]:
# Use a separate name for the stop-word set so the imported `stopwords` corpus
# module is not overwritten; otherwise this cell fails when it is run twice.
english_stopwords = set(stopwords.words('english'))

# Build one cleaned document per paper:
#   1. join title and abstract with an explicit space so the last title word
#      and the first abstract word cannot fuse into a single token,
#   2. lower-case and split on whitespace,
#   3. drop English stop words and re-join the remaining tokens,
#   4. strip digits (regex=True is required: newer pandas treats the pattern
#      as a literal string by default, which would leave digits in place).
X = (
    (df['TITLE'] + ' ' + df['ABSTRACT'])
    .str.lower()
    .str.split()
    .apply(lambda words: ' '.join(word for word in words if word not in english_stopwords))
    .str.replace(r'[0-9]', '', regex=True)
)
X

  X= X.str.replace('[0-9]', '')


0        reconstructing subject-specific effect maps pr...
1        rotation invariance neural network rotation in...
2        spherical polyharmonics poisson kernels polyha...
3        finite element approximation stochastic maxwel...
4        comparative study discrete wavelet transforms ...
                               ...                        
20967    contemporary machine learning: guide practitio...
20968    uniform diamond coatings wc-co hard alloy cutt...
20969    analysing soccer games clustering conceptors p...
20970    efficient simulation left-tail sum correlated ...
20971    optional stopping problem bayesians recently, ...
Length: 20972, dtype: object

### We apply the TFIDF algorithm

In [11]:
# Fit a TF-IDF vocabulary on the cleaned documents and encode every document
# as a vector of term weights.
# NOTE(review): .toarray() converts the result to a dense array; at the shape
# reported below (20972 x 52264) that is roughly 1.1 billion cells, so this
# step needs a lot of memory.
# NOTE: X is rebound from text to a numeric matrix here, so this cell cannot be
# re-run without first re-running the text-cleaning cell.
vectorizer = TfidfVectorizer()
X= vectorizer.fit_transform(X).toarray()

In [12]:
# Sanity check: one row per document, one column per vocabulary term.
print(X.shape)

(20972, 52264)


In [13]:
# X is already a dense ndarray after .toarray(); np.asarray hands it back as-is,
# whereas np.array would allocate a second full copy of the matrix.
X = np.asarray(X)

### We split our data into training and test sets.

In [14]:
# Hold out 20% of the samples for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
# NOTE(review): the split is not stratified, so class proportions may differ
# between the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Because our classes are imbalanced, we use the Complement Naive Bayes algorithm for this classification task.

In [15]:
# Train a Complement Naive Bayes model on the TF-IDF training features.
# fit() returns the estimator itself, so creation and training are chained.
classifier = ComplementNB().fit(X_train, y_train)
classifier

In [16]:
# Predict on both splits so training and test performance can be compared.
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

In [17]:
# Report accuracy as a percentage with one decimal place. A fixed-point format
# (.1f) is used because a bare ".3" precision means three significant digits
# and renders a perfect score of 100.0 as "1e+02".
print(f" Accuracy of Training Set: {accuracy_score(y_train, y_pred_train) * 100:.1f} %\n")
print(f" Accuracy of Test Set: {accuracy_score(y_test, y_pred_test) * 100:.1f} % \n\n")

 Accuracy of Training Set: 83.8 %

 Accuracy of Test Set: 80.2 % 




In [18]:
# Per-class precision, recall, F1 and support on the held-out test set.
print (f" Classifier Report : \n\n {classification_report (y_test, y_pred_test)}")

 Classifier Report : 

               precision    recall  f1-score   support

           0       0.75      0.92      0.83      1824
           1       0.91      0.89      0.90      1130
           2       0.82      0.75      0.78       898
           3       0.17      0.00      0.01       343

    accuracy                           0.80      4195
   macro avg       0.66      0.64      0.63      4195
weighted avg       0.76      0.80      0.77      4195



### The classifier seems to be performing well on some classes but poorly on others.

### The precision and recall scores for class 0 and class 1 are high, indicating that the classifier is able to correctly identify these classes with high accuracy and low false positives. The precision and recall scores for class 2 are also good, indicating good performance for this class as well.

### However, the precision and recall scores for class 3 are very low, indicating poor performance for this class. In particular, the recall score for class 3 is extremely low, suggesting that the classifier identifies almost no instances of this class. This makes sense because statistics and mathematics share many common terms and notations, which makes it hard for the classifier to recognise the statistics class, so it almost always assigns such papers to the mathematics class. Note also that the target was built with `argmax`, which keeps only the first topic of a multi-label paper, so class 3 contains only papers whose sole topic among the four is Statistics. The macro and weighted average f1-scores are low, which is likely due to the poor performance on class 3.

### Overall, the accuracy of the classifier is good, but the macro and weighted average f1-scores are low, indicating that the classifier may need improvement on class 3.


## Thank you !