### target: either 1 (for insincere question) or 0

In [2]:
##! pip install textblob

In [3]:
! pip install -U sentence-transformers



In [4]:
# imports
import gensim
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from textblob import TextBlob
from sklearn.metrics import classification_report

In [5]:
df = pd.read_csv('Quora/quora_train.csv',on_bad_lines='skip')

In [6]:
## sample the dataset
df= resample(df, n_samples=15000, random_state=42)
df.shape

(15000, 3)

In [7]:
df.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [8]:
df.duplicated().sum() #86 rows
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [9]:
df.target.value_counts()
# imbalanced dataset

0    14002
1      912
Name: target, dtype: int64

## Utilisation de Word Embedding:
### on va procéder comme dans l'Exo-1 avec du rebalancing sur 1 avec du word embedding
### sur les tokens avec glove.6B.100.txt

# EDA

# PREPROCESSING

In [10]:
def preprocessing(document):
    # 1- tokenization
    tokens = word_tokenize(document)
    # 2- lower case of strings
    tokens = [t.lower() if t.isalpha() else t for t in tokens]
    # 3- remove stopwords
    stop_words = stopwords.words('english')
    tokens = [t for t in tokens if not t in stop_words]
    # 4- Stemming
    stemmer = PorterStemmer() #build root by removing some known suffix and prefix
    tokens = [stemmer.stem(w) for w in tokens]
    # 4- lemmatization
    #lemmatizer = WordNetLemmatizer()
    #tokens_lem = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

In [11]:
df['tokens'] = df.question_text.apply(preprocessing)
df.head()

Unnamed: 0,qid,question_text,target,tokens
121958,17df11bf9b1888d64add,I feel something missing in my relationship. I...,0,"[feel, someth, miss, relationship, ., dont, kn..."
671155,837181f0407fd112a05c,"What do Socrates, Thomas Kuhn and Karl Popper ...",0,"[socrat, ,, thoma, kuhn, karl, popper, common, ?]"
131932,19d09fe4bbabf16eb347,What song played in the movie of the gifted wh...,0,"[song, play, movi, gift, mr, bern, carson, lit..."
259178,32b9bf37edd829ffe658,What are the biggest myths about Adolf Hitler?,0,"[biggest, myth, adolf, hitler, ?]"
110268,1595cbbad9c20d3d791a,How can the ViewSonic PA503S 3600 lumens SVGA ...,0,"[viewson, pa503, 3600, lumen, svga, hdmi, proj..."


# SENTIMENT ANALYSIS

In [12]:
df["polarity"]     = df["question_text"].apply(lambda x: TextBlob(x).sentiment.polarity)
df["subjectivity"] = df["question_text"].apply(lambda x: TextBlob(x).sentiment.subjectivity)
df.head()

Unnamed: 0,qid,question_text,target,tokens,polarity,subjectivity
121958,17df11bf9b1888d64add,I feel something missing in my relationship. I...,0,"[feel, someth, miss, relationship, ., dont, kn...",-0.253125,0.440625
671155,837181f0407fd112a05c,"What do Socrates, Thomas Kuhn and Karl Popper ...",0,"[socrat, ,, thoma, kuhn, karl, popper, common, ?]",-0.3,0.5
131932,19d09fe4bbabf16eb347,What song played in the movie of the gifted wh...,0,"[song, play, movi, gift, mr, bern, carson, lit...",0.15625,0.75
259178,32b9bf37edd829ffe658,What are the biggest myths about Adolf Hitler?,0,"[biggest, myth, adolf, hitler, ?]",0.0,0.0
110268,1595cbbad9c20d3d791a,How can the ViewSonic PA503S 3600 lumens SVGA ...,0,"[viewson, pa503, 3600, lumen, svga, hdmi, proj...",0.0,0.0


In [13]:
df["sentiment_textblob"] = (df.polarity > 0).astype(int)
df.head()

Unnamed: 0,qid,question_text,target,tokens,polarity,subjectivity,sentiment_textblob
121958,17df11bf9b1888d64add,I feel something missing in my relationship. I...,0,"[feel, someth, miss, relationship, ., dont, kn...",-0.253125,0.440625,0
671155,837181f0407fd112a05c,"What do Socrates, Thomas Kuhn and Karl Popper ...",0,"[socrat, ,, thoma, kuhn, karl, popper, common, ?]",-0.3,0.5,0
131932,19d09fe4bbabf16eb347,What song played in the movie of the gifted wh...,0,"[song, play, movi, gift, mr, bern, carson, lit...",0.15625,0.75,1
259178,32b9bf37edd829ffe658,What are the biggest myths about Adolf Hitler?,0,"[biggest, myth, adolf, hitler, ?]",0.0,0.0,0
110268,1595cbbad9c20d3d791a,How can the ViewSonic PA503S 3600 lumens SVGA ...,0,"[viewson, pa503, 3600, lumen, svga, hdmi, proj...",0.0,0.0,0


In [14]:
# unsupervised predictions with textblob
# 12% on insincere
print(classification_report(df["target"], df["sentiment_textblob"]))

              precision    recall  f1-score   support

           0       0.94      0.64      0.77     14002
           1       0.07      0.39      0.12       912

    accuracy                           0.63     14914
   macro avg       0.50      0.52      0.44     14914
weighted avg       0.89      0.63      0.73     14914



In [15]:
## Build initial X and y
X = df.tokens
y = df.target.to_numpy()
X.shape

(14914,)

### Splitter avant sur cette étape pour ne pas introduire de Data Leakage !

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
print(f'X_train.shape: {X_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'y_test.shape: {y_test.shape}')

X_train.shape: (11931,)
X_test.shape: (2983,)
y_train.shape: (11931,)
y_test.shape: (2983,)


In [17]:
X_train[:10]

434368               [approach, beauti, women, street, ,, ?]
671088                        [long, induct, light, last, ?]
219073     [data, scientist, learn, write, front-end, cod...
647534                  [revers, irrevers, chang, differ, ?]
331999         [n't, germani, occupi, end, world, war, 1, ?]
66197      [respons, salari, cse, student, recruit, gate,...
1059578    ['s, best, thing, 've, ever, experienc, around...
104555                     [much, freight, forward, make, ?]
456610     [1984, version, ``, red, dawn, ,, '', scenario...
839665                                           [hernia, ?]
Name: tokens, dtype: object

In [18]:
# TODO : isolate the insincere reviews in a variable
##tokenized_insincere_reviews = df[df['target']==1].tokens
tokenized_insincere_reviews = X_train[y_train==1]
print(tokenized_insincere_reviews.shape)
tokenized_insincere_reviews

(730,)


104555                     [much, freight, forward, make, ?]
1126998    [malaysian, incompet, mani, job, like, restaur...
870793                                  [jew, scare, dog, ?]
464535     [liber, need, day, school, safe, space, color,...
1211970                    [beat, kany, west, 's, stupid, ?]
                                 ...                        
301454     [mani, notic, speci, speci, get, extinct, surr...
306024     [alway, clash, hindu, muslim, religion, like, ...
823622     [donald, trump, like, die, impeach, novemb, 20...
679706       [arab, ,, black, hispan, stereotyp, violent, ?]
1167498    [dumb, thought, doctor, told, get, extra, fri,...
Name: tokens, Length: 730, dtype: object

# Use a word embedding (Glove) to create your corpus and run your model

In [19]:
glove_filename = 'Quora/glove/glove.6B.50d.txt'
model = KeyedVectors.load_word2vec_format(glove_filename, binary=False, no_header=True)


In [20]:
model.get_vector('king')

array([ 0.50451 ,  0.68607 , -0.59517 , -0.022801,  0.60046 , -0.13498 ,
       -0.08813 ,  0.47377 , -0.61798 , -0.31012 , -0.076666,  1.493   ,
       -0.034189, -0.98173 ,  0.68229 ,  0.81722 , -0.51874 , -0.31503 ,
       -0.55809 ,  0.66421 ,  0.1961  , -0.13495 , -0.11476 , -0.30344 ,
        0.41177 , -2.223   , -1.0756  , -1.0783  , -0.34354 ,  0.33505 ,
        1.9927  , -0.04234 , -0.64319 ,  0.71125 ,  0.49159 ,  0.16754 ,
        0.34344 , -0.25663 , -0.8523  ,  0.1661  ,  0.40102 ,  1.1685  ,
       -1.0137  , -0.21585 , -0.15155 ,  0.78321 , -0.91241 , -1.6106  ,
       -0.64426 , -0.51042 ], dtype=float32)

### create new reviews for train corpus from insincere reviews on NN flagged tokens 

In [21]:
# TODO : add the POS-tag to all spam tokens
from nltk.tag import pos_tag

pos_tag_insincere = tokenized_insincere_reviews.apply(nltk.pos_tag)
pos_tag_insincere

104555     [(much, JJ), (freight, NN), (forward, RB), (ma...
1126998    [(malaysian, JJ), (incompet, NN), (mani, JJ), ...
870793           [(jew, NN), (scare, NN), (dog, NN), (?, .)]
464535     [(liber, NNS), (need, VBP), (day, NN), (school...
1211970    [(beat, NN), (kany, JJ), (west, NN), ('s, POS)...
                                 ...                        
301454     [(mani, NN), (notic, JJ), (speci, NN), (speci,...
306024     [(alway, RB), (clash, JJ), (hindu, NN), (musli...
823622     [(donald, JJ), (trump, NN), (like, IN), (die, ...
679706     [(arab, NN), (,, ,), (black, JJ), (hispan, NN)...
1167498    [(dumb, JJ), (thought, VBD), (doctor, NN), (to...
Name: tokens, Length: 730, dtype: object

### voir pourquoi les noms propres sont aussi transformés !!

In [22]:
## same as in Exo-1
# replace token with the top 1 most similar word if 2 conditions are met:
# the POS-tag == 'NN' and the token has an embedding.
## Pb avec l'error: AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
vocab = model.key_to_index
vocab
def replace_to_synonyme(tagged_tokens):
    ## issue with model.vocab
    tokens = [t_pos[0] if t_pos[1]!='NN' or t_pos[0] not in vocab
              else model.most_similar(t_pos[0], topn = 1)[0][0] for t_pos in tagged_tokens]
    return tokens

synonymes_tokens = pos_tag_insincere.apply(replace_to_synonyme)
synonymes_tokens

104555                        [much, rail, forward, make, ?]
1126998    [malaysian, incompet, mani, getting, like, res...
870793                                [jews, scares, cat, ?]
464535     [liber, need, days, college, safe, earth, colo...
1211970                 [beating, kany, east, 's, stupid, ?]
                                 ...                        
301454     [ratnam, notic, speci, speci, get, extinct, re...
306024     [alway, clash, hindus, muslims, beliefs, like,...
823622     [donald, casino, like, die, impeachment, novem...
679706     [muslim, ,, black, hispan, stereotyp, violence...
1167498    [dumb, thought, nurse, told, get, extra, tue, ...
Name: tokens, Length: 730, dtype: object

In [23]:
print(tokenized_insincere_reviews.iloc[150])
print(synonymes_tokens.iloc[150])

### 'virgin' -> remplacé par 'branson' !!?? (ex-patron de Virgin): PB à voir

['theist', 'explain', 'venu', 'boil', 'hellish', 'nightmar', 'mar', 'frozen', 'desert', '?', 'would', "n't", 'god', 'want', 'lot', 'worship', '?']
['technophile', 'understand', 'nedumudi', 'simmer', 'hellish', 'nightmar', 'zheye', 'dried', 'jungle', '?', 'would', "n't", 'god', 'want', 'really', 'sacred', '?']


In [24]:
# append to X new tokens from insincere labelled texts to Train set !!
# voir Exo-1
# add the newly generated insincere texts to the corpus
print(X_train.shape)
X_train = X_train.append(pd.Series(synonymes_tokens), ignore_index=True)
print(X_train.shape)
X_train

(11931,)
(12661,)


  X_train = X_train.append(pd.Series(synonymes_tokens), ignore_index=True)


0                  [approach, beauti, women, street, ,, ?]
1                           [long, induct, light, last, ?]
2        [data, scientist, learn, write, front-end, cod...
3                     [revers, irrevers, chang, differ, ?]
4            [n't, germani, occupi, end, world, war, 1, ?]
                               ...                        
12656    [ratnam, notic, speci, speci, get, extinct, re...
12657    [alway, clash, hindus, muslims, beliefs, like,...
12658    [donald, casino, like, die, impeachment, novem...
12659    [muslim, ,, black, hispan, stereotyp, violence...
12660    [dumb, thought, nurse, told, get, extra, tue, ...
Name: tokens, Length: 12661, dtype: object

In [25]:
# as in Exo-1 add new labels to your `y` variable
## all class==1 as they are new spams
y_train = pd.Series(y_train)
y_train = y_train.append(pd.Series(np.ones(len(synonymes_tokens))), ignore_index=True)
print(y_train.shape)
y_train


(12661,)


  y_train = y_train.append(pd.Series(np.ones(len(synonymes_tokens))), ignore_index=True)


0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
12656    1.0
12657    1.0
12658    1.0
12659    1.0
12660    1.0
Length: 12661, dtype: float64

In [26]:
print(len(y_train[y_train==0]))
print(len(y_train[y_train==1]))
#0.9-to-7 -> rebalancing of target 1 by a factor of 2 

11201
1460


### TF-IDF

In [27]:
# TFIDF trained on train subset
vectorizer = TfidfVectorizer(analyzer=lambda x: x)
tf_idf_train = vectorizer.fit_transform(X_train).toarray()
tf_idf_test = vectorizer.transform(X_test).toarray()

print(tf_idf_train.shape)
print(y_train.shape)

## dim features > nb rows: pas cool

(12661, 14047)
(12661,)


### Logistic Regression

In [28]:
lr = LogisticRegression(max_iter=100)
lr.fit(tf_idf_train, y_train)

# prédictions sur Train et Test
y_pred_train = lr.predict(tf_idf_train)
y_pred_test  = lr.predict(tf_idf_test)

In [29]:
# check the F1-score on the minority class
from sklearn.metrics import f1_score,recall_score,precision_score

# F1-score de Train
vect_f1_score_train = f1_score(y_train, y_pred_train, average=None)
print(f" F1-score sur Train - sincere: {vect_f1_score_train[0]}")
print(f" F1-score sur Train - insincere: {vect_f1_score_train[1]}")
# F1-score de Test
vect_f1_score_test = f1_score(y_test, y_pred_test, average=None)
print(f" F1-score sur Test - sincere: {vect_f1_score_test[0]}")
print(f" F1-score sur Test - insincere: {vect_f1_score_test[1]}")
# 10000 au départ:
# 0.079 sans ajouts des new insincere texts
# 0.21 avec ajouts des new insincere texts
# 15000 au départ:
# 0.235 avec ajouts des new insincere texts

 F1-score sur Train - sincere: 0.9584818170108625
 F1-score sur Train - insincere: 0.5238798621368784
 F1-score sur Test - sincere: 0.9703522846180677
 F1-score sur Test - insincere: 0.26724137931034486


### avec XGBoost

In [1]:
xgb_classifier = XGBClassifier(eta = 0.7)
xgb_classifier.fit(tf_idf_train, y_train)

# prédictions sur Train et Test
y_pred_train = xgb_classifier.predict(tf_idf_train)
y_pred_test  = xgb_classifier.predict(tf_idf_test)

# F-scores
vect_f1_score_train = f1_score(y_train, y_pred_train, average=None)
print(f" F1-score sur Train - sincere: {vect_f1_score_train[0]}")
print(f" F1-score sur Train - insincere: {vect_f1_score_train[1]}")
# F1-score de Test
vect_f1_score_test = f1_score(y_test, y_pred_test, average=None)
print(f" F1-score sur Test - sincere: {vect_f1_score_test[0]}")
print(f" F1-score sur Test - insincere: {vect_f1_score_test[1]}")
"""
# F1-score sur Train - sincere: 0.9812573725370266
# F1-score sur Train - insincere: 0.8337853545137544
# F1-score sur Test - sincere: 0.9672790901137358
# F1-score sur Test - insincere: 0.34385964912280703
"""

# F-scores weighted on both classes
f1_score_test = f1_score(y_test, y_pred_test, average='weighted')
print(f" F1-score sur Test: {f1_score_test}")
# F1-score sur Test: 0.9290426977329589 - mais faible sur target 1 ...



NameError: name 'XGBClassifier' is not defined

### avec sentence_transformer à la place de word_embedding

In [315]:
df.question_text

121958     I feel something missing in my relationship. I...
671155     What do Socrates, Thomas Kuhn and Karl Popper ...
131932     What song played in the movie of the gifted wh...
259178        What are the biggest myths about Adolf Hitler?
110268     How can the ViewSonic PA503S 3600 lumens SVGA ...
                                 ...                        
913107     How will I do after finish BA in political sci...
454318                   How intelligent is Marc Andreessen?
1187939    Did Kylo Ren castrate himself when he turned t...
278429     If secularists don't want religion to be invol...
1009926    Which topic on Quora has the most number of fo...
Name: question_text, Length: 14914, dtype: object

In [316]:
from sentence_transformers import SentenceTransformer

sentences = df.question_text.to_list()

model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d')
## l'arg doit être une liste
embeddings = model.encode(sentences)
embeddings

array([[-0.09074124,  0.03276177, -0.00510913, ...,  0.0103267 ,
        -0.01616477,  0.1584273 ],
       [-0.04947783,  0.10102484, -0.2364795 , ..., -0.03998633,
         0.06539483,  0.324195  ],
       [-0.1569635 ,  0.01024112, -0.02797146, ..., -0.04710345,
        -0.17310087,  0.03577788],
       ...,
       [ 0.05709595,  0.01944001,  0.1399895 , ...,  0.3851375 ,
        -0.2661875 ,  0.30452874],
       [ 0.17428964, -0.13716224,  0.1388662 , ...,  0.16437475,
        -0.08975447, -0.05332353],
       [-0.35225925,  0.011895  ,  0.234319  , ...,  0.093244  ,
        -0.0595547 , -0.36777502]], dtype=float32)

### que faire avec embeddings from sentence_transformer ??

In [317]:
embeddings.shape

(14914, 300)

### voir s'il faut aussi appliquer sentence_transformer après le split

In [None]:
type(embeddings)

numpy.ndarray

In [318]:
X = embeddings
y = df.target
X.shape

(14914, 300)

In [321]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
print(f'X_train.shape: {X_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'y_test.shape: {y_test.shape}')

X_train.shape: (11931, 300)
X_test.shape: (2983, 300)
y_train.shape: (11931,)
y_test.shape: (2983,)


In [323]:
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

# prédictions sur Train et Test
y_pred_train = lr.predict(X_train)
y_pred_test  = lr.predict(X_test)

In [324]:
# F-scores
vect_f1_score_train = f1_score(y_train, y_pred_train, average=None)
print(f" F1-score sur Train - sincere: {vect_f1_score_train[0]}")
print(f" F1-score sur Train - insincere: {vect_f1_score_train[1]}")
# F1-score de Test
vect_f1_score_test = f1_score(y_test, y_pred_test, average=None)
print(f" F1-score sur Test - sincere: {vect_f1_score_test[0]}")
print(f" F1-score sur Test - insincere: {vect_f1_score_test[1]}")
# c'est moins bien qu'avec word-embedding

 F1-score sur Train - sincere: 0.9702053298383574
 F1-score sur Train - insincere: 0.29835390946502055
 F1-score sur Test - sincere: 0.9674597620713786
 F1-score sur Test - insincere: 0.256
