# Context
This Challenge concerns the problem of text analysis using automatic natural language processing (NLP) and machine learning (ML) techniques. The dataset used here is part of a questionnaire conducted during the first containment in France in 2020 to survey people over 75 years old about their perception of the Coronavirus crisis. To give you an idea of the types of questions asked, the first block of questions concerns the perception of the danger of the Coronavirus. It includes three questions:

* Do you consider CORONAVIRUS as: no danger or low danger, moderate danger, serious danger.
* On a scale of 0 to 10, how intense have you been in the last few days?
* Tell us what concerns you have?

The first two questions are closed questions and will not be covered by this challenge. However, the third question invites people to express themselves openly with their own sentences and the analysis of these is the subject of this challenge. More precisely, the objective here will be to categorise each response into one or two at a time from four categories (classes). This problem is known in the literature as MultiLabel Classification.

In [1]:
import string
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd

In [3]:
X_train=pd.read_csv('/kaggle/input/chal-data-nlp/X_train.csv',sep=';')
X_train.head()

Unnamed: 0,Id,Caption
0,2,de mourir avant l'heure\n de ne plus revoir me...
1,3,la maladie pour les autres et pour moi\n et le...
2,4,Comment vont s'en sortir ceux qui sont mal ou ...
3,6,"Inquiétude pour la santé de mes proches, pour ..."
4,7,"Bien entendu contracter la maladie,"


In [4]:
X_test=pd.read_csv('/kaggle/input/chal-data-nlp/X_test.csv',sep=';')
X_test.head()

Unnamed: 0,Id,Caption
0,599,quand tout cela va t il se terminer ?
1,600,Que le futur demeure incertain... qu'on ne pui...
2,602,De ne plus pouvoir vivre comme auparavant.
3,603,pour mes proches
4,604,la honte !!! d'être un des pays les plus rich...


In [5]:
y_train=pd.read_csv('/kaggle/input/chal-data-nlp/y_train.csv',sep=';')
y_train.drop(['Id'],axis=1,inplace=True)
y_train.head()

Unnamed: 0,category_1,category_2,category_3,category_4
0,1,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [6]:
n_label=pd.read_csv('/kaggle/input/chal-data-nlp/nonlabeled_data.csv',sep=';')
n_label.head()

Unnamed: 0,Id,Caption
0,804,"Attraper le virus\n Respiration, de l'astheme\..."
1,805,les dysfonctionnements (aggravation des inégal...
2,806,Un déconfinement moyennement observé et peu ef...
3,807,Contracter le virus et le transmettre a mon mari
4,808,attraper ce virus


In [7]:
# turn a doc into clean tokens
def clean_doc(doc, vocab=None, join=False):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('french'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    # filter out tokens not in vocab
    if vocab:
        tokens = [w for w in tokens if w in vocab]
    if join:
        tokens = ' '.join(tokens)
    return tokens

In [8]:
#clean our documents (Xtrain,X_test and unlabeled_data)
doc=X_train.Caption.to_string()
token=clean_doc(doc)

doc2=n_label.Caption.to_string()
token2=clean_doc(doc2)

doc3=X_test.Caption.to_string()
token3=clean_doc(doc3)

In [9]:
#add documents to our vocabulary
vocab = Counter()
vocab.update(token)
vocab.update(token2)
vocab.update(token3)

In [10]:
# save list to file
def save_list(tokens, filename):
    # convert lines to a single blob of text
    lines = '\n'.join(tokens)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(lines)
    # close file
    file.close()

In [11]:
#Create a vocabulary list with words that have at least 2 occurrences.
min_occurance = 2
tokens = [k for k, c in vocab.items() if c >= min_occurance]
print("token count", len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

token count 1355


In [12]:
#Use only words in the vocab
def doc_to_line(doc, vocab):
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [13]:
#We keep the vocabulary words only
doc=[]
for i in range(len(X_train)):
  text=X_train.Caption[i]
  doc.append(doc_to_line(text,tokens))

In [14]:
#We keep the vocabulary words only
test=[]
for i in range(len(X_test)):
  text=X_test.Caption[i]
  test.append(doc_to_line(text,tokens))

In [15]:
#check the max length for training captions
max_length = max([len(k) for k in doc])
max_length

487

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords=stopwords.words('french')
vect = TfidfVectorizer(max_features=5000,stop_words=stopwords)
vect

TfidfVectorizer(max_features=5000,
                stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de',
                            'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils',
                            'je', 'la', 'le', 'les', 'leur', 'lui', 'ma',
                            'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne',
                            'nos', ...])

In [17]:
# learn the vocabulary in the training data, then use it to create a document-term matrix
X_dtm = vect.fit_transform(doc)
# examine the document-term matrix created from X_train
X_dtm

<485x737 sparse matrix of type '<class 'numpy.float64'>'
	with 3853 stored elements in Compressed Sparse Row format>

In [18]:
# transform the test data using the earlier fitted vocabulary, into a document-term matrix
test_X_dtm = vect.transform(test)
# examine the document-term matrix from X_test
test_X_dtm

<157x737 sparse matrix of type '<class 'numpy.float64'>'
	with 1245 stored elements in Compressed Sparse Row format>

In [19]:
cols_target=['category_1','category_2','category_3','category_4']

In [20]:
# import and instantiate the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression(C=12.0)

for label in cols_target:
    print('... Processing {}'.format(label))
    y = y_train[label]
    # train the model using X_dtm & y
    logreg.fit(X_dtm, y)
    # compute the training accuracy
    y_pred_X = logreg.predict(X_dtm)
    print('Training accuracy is {}'.format(accuracy_score(y, y_pred_X)))
    

... Processing category_1
Training accuracy is 0.9917525773195877
... Processing category_2
Training accuracy is 0.9917525773195877
... Processing category_3
Training accuracy is 0.9938144329896907
... Processing category_4
Training accuracy is 0.9876288659793815


In [21]:
# create submission file
submission_binary = X_test.copy()
submission_binary.drop(['Caption'],axis=1,inplace=True)

for label in cols_target:
    print('... Processing {}'.format(label))
    y = y_train[label]
    # train the model using X_dtm & y
    logreg.fit(X_dtm, y)
    # predict test labels
    y_pred = logreg.predict(test_X_dtm)
    submission_binary[label] = y_pred

... Processing category_1
... Processing category_2
... Processing category_3
... Processing category_4


In [22]:
#check submissions
submission_binary.head()

Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,1,0
1,600,0,0,1,0
2,602,0,0,1,1
3,603,0,1,0,0
4,604,0,0,0,0


In [23]:
# if the first 3 categories are at 0, we put the 4th at 1
for i in range(len(submission_binary)):
  if submission_binary['category_1'].loc[i]==0 and submission_binary['category_2'].loc[i]==0 and submission_binary['category_3'].loc[i]==0:
    submission_binary['category_4'].loc[i]=1

submission_binary

Unnamed: 0,Id,category_1,category_2,category_3,category_4
0,599,0,0,1,0
1,600,0,0,1,0
2,602,0,0,1,1
3,603,0,1,0,0
4,604,0,0,0,1
...,...,...,...,...,...
152,798,0,0,1,0
153,799,0,0,1,0
154,800,1,0,0,0
155,801,0,1,0,0


In [24]:
# generate submission file
submission_binary.to_csv('Submissions.csv',index=False)