In [1]:
import pandas as pd
import gensim

df = pd.read_csv("scrapped_test.csv", delimiter=",")
df.head()

Unnamed: 0,url_clean,title,description
0,https://cuisine.journaldesfemmes.fr/recette-de...,Recettes de desserts faciles et rapides,Les meilleures recettes desserts; classiques o...
1,https://droit-finances.commentcamarche.com/faq...,Salaire d'une assistante maternelle : ce qu'il...,La rémunération des assistantes maternelles (s...
2,http://premium.lefigaro.fr/,Le Figaro Premium - Actualité,Accédez à l’intégralité des articles du Figaro...
3,exclusives réservées aux abonnés pour tout sav...,,
4,https://www.commentcamarche.net/faq/8887-voir-...,Voir ses factures Free - Mon compte,Voir le tutoriel en vidéo : Télécharger l'appl...


### Analysing URLs with LDA

In [None]:
for i in range(0,len(df)):
    df.iloc[i,0] = str(df.iloc[i,0][8:]).replace("www.","").replace(".comment",". comment").replace(".com","").replace(".php","").replace(".fr","").replace(".html","").replace("."," ").replace("-"," ").replace("0","").replace("1","").replace("2","").replace("3","").replace("4","").replace("5","").replace("6","").replace("7","").replace("8","").replace("9","").strip().split("/")
    df.iloc[i,0] = str(" ".join(df.iloc[i,0]).strip().replace("     "," ").replace("  "," ").lower().replace(" artfig","")).split(" ")
df.to_csv("data2.csv")
df = pd.read_csv("data2.csv")

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

processed_docs = df['url_clean'].map(preprocess)
processed_docs

In [None]:
# Latent DIrichlet Allocation Model

from gensim import corpora
dictionary = corpora.Dictionary(processed_docs)

corpus = [dictionary.doc2bow(text) for text in processed_docs]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save(f"model{NUM_TOPICS}.gensim")

In [None]:
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
topics = ldamodel.print_topics(num_words=1)
for topic in topics:
    print(topic)

### Analysing titles and descriptions

In [None]:
# can first do tf-idf to remove unnecessary words and add to list
# then perform LDA again to find topic

In [15]:
df.iloc[1,2]

"La rémunération des assistantes maternelles (salaire horaire; indemnités d'entretien et de nourriture...) doit respecter certains montants minimum. Voici un point sur les droits de l'assistante maternelle et les obligations des parents en... "

In [17]:
df.iloc[1,1]

"Salaire d'une assistante maternelle : ce qu'il faut savoir"

In [3]:
## tokenizing data with tf-idf
from keras.preprocessing.text import Tokenizer

train_sentences = df["description"].dropna()

vocab_size = 15000

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train = tokenizer.texts_to_matrix(train_sentences, mode='tfidf')
#x_test = tokenizer.texts_to_matrix(test_sentences, mode='tfidf')

x_train

Using TensorFlow backend.


array([[0.        , 0.        , 0.91629073, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.3996885 , 1.55141507, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.92293899, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 1.97269842, 1.92293899, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.3996885 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.73487781, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [5]:
# Tokenizing and Removing stopwords
import nltk
from nltk import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords

df1 = df.dropna()

for i in range(0,len(df)):
    sentence = df1.iloc[i,2]
    tokens = word_tokenize(sentence)
    print("before: ", tokens)
    stop_words = set(stopwords.words('french'))
    new_tokens = [w for w in tokens if not w in stop_words]
    print("after removing stop words: ", new_tokens)

before:  ['Les', 'meilleures', 'recettes', 'desserts', ';', 'classiques', 'ou', 'originaux', ';', 'à', 'réaliser', 'rapidement', 'et', 'facilement', 'à', 'la', 'maison', '.']
after removing stop words:  ['Les', 'meilleures', 'recettes', 'desserts', ';', 'classiques', 'originaux', ';', 'réaliser', 'rapidement', 'facilement', 'maison', '.']
before:  ['La', 'rémunération', 'des', 'assistantes', 'maternelles', '(', 'salaire', 'horaire', ';', 'indemnités', "d'entretien", 'et', 'de', 'nourriture', '...', ')', 'doit', 'respecter', 'certains', 'montants', 'minimum', '.', 'Voici', 'un', 'point', 'sur', 'les', 'droits', 'de', "l'assistante", 'maternelle', 'et', 'les', 'obligations', 'des', 'parents', 'en', '...']
after removing stop words:  ['La', 'rémunération', 'assistantes', 'maternelles', '(', 'salaire', 'horaire', ';', 'indemnités', "d'entretien", 'nourriture', '...', ')', 'doit', 'respecter', 'certains', 'montants', 'minimum', '.', 'Voici', 'point', 'les', 'droits', "l'assistante", 'matern

IndexError: single positional indexer is out-of-bounds

In [6]:
#Counter Vectorization

from sklearn.feature_extraction.text import CountVectorizer

texts = list(df["description"].dropna())
cv = CountVectorizer()

# tokenize and build vocab
cv.fit(texts)

# encode document
vector = cv.transform(texts)

print(cv.vocabulary_)
print(vector.shape)
print(type(vector))
print(vector.toarray())

# both in the same time
cv1 = CountVectorizer()
cv_fit = cv1.fit_transform(texts)
print(cv1.get_feature_names())
print(len(cv1.get_feature_names()))
print(cv_fit.toarray())

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vc_tf_idf = TfidfVectorizer()
# tokenize and build vocab
vc_tf_idf.fit(texts)
# encode document
vector = vc_tf_idf.transform([texts[0]])

print(vc_tf_idf.vocabulary_)
print(vc_tf_idf.idf_)
print(vector.shape)
print(vector.toarray())

{'les': 85, 'meilleures': 93, 'recettes': 133, 'desserts': 49, 'classiques': 34, 'ou': 108, 'originaux': 107, 'réaliser': 142, 'rapidement': 132, 'et': 66, 'facilement': 68, 'la': 83, 'maison': 88, 'rémunération': 144, 'des': 48, 'assistantes': 19, 'maternelles': 90, 'salaire': 148, 'horaire': 73, 'indemnités': 76, 'entretien': 63, 'de': 44, 'nourriture': 101, 'doit': 51, 'respecter': 138, 'certains': 31, 'montants': 98, 'minimum': 95, 'voici': 173, 'un': 169, 'point': 118, 'sur': 158, 'droits': 53, 'assistante': 18, 'maternelle': 89, 'obligations': 104, 'parents': 110, 'en': 61, 'accédez': 7, 'intégralité': 78, 'articles': 17, 'du': 54, 'figaro': 69, 'illimité': 74, 'politique': 119, 'économie': 178, 'culture': 42, 'international': 77, 'retrouvez': 140, 'analyses': 13, 'investigations': 79, 'enquêtes': 62, 'voir': 174, 'le': 84, 'tutoriel': 165, 'vidéo': 172, 'télécharger': 167, 'application': 15, 'free': 70, 'mon': 96, 'compte': 36, 'pour': 121, 'iphone': 80, 'puis': 127, 'saisir': 1

In [None]:
## Word2vec Embedding

In [12]:
from gensim.models.word2vec import Word2Vec
import numpy as np

df2 = pd.read_csv("data2.csv")
X_train = df2["description"].dropna()

#Initialize model_w2v and build vocabularies
#should use the whole dataset, not just training set
emb_size = 128
model_w2v = Word2Vec(size=emb_size, min_count=5)
model_w2v.build_vocab(X_train)
model_w2v.train(X_train, total_examples=model_w2v.corpus_count, epochs=2000)
model_w2v_path = './model_w2v_urls.bin' # save the model_w2v
model_w2v.save(model_w2v_path)
print("training w2v: done")

training w2v: done


In [13]:
# reload the trained model_w2v
new_model = Word2Vec.load(model_w2v_path)
X = new_model[new_model.wv.vocab] # get words

#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def build_word2vec_from_text(model_w2v, sentence, emb_size):
    emb_vec = np.zeros(emb_size).reshape((1, emb_size))
    count = 0.
    for word in sentence:
        try:
            emb_vec += model_w2v[word].reshape((1, emb_size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        emb_vec /= count
    return emb_vec

X_train = np.concatenate([build_word2vec_from_text(model_w2v, d, emb_size) for d in X_train])

  This is separate from the ipykernel package so we can avoid doing imports until
  # This is added back by InteractiveShellApp.init_path()


In [14]:
X_train

array([[-0.13549828,  0.13148418, -0.09423224, ..., -0.03559573,
         0.28165428,  0.14442378],
       [-0.14830357,  0.11263042, -0.14316868, ..., -0.03900621,
         0.30740614,  0.12885911],
       [-0.16314488,  0.16973354, -0.10512566, ..., -0.05694924,
         0.31077937,  0.18561209],
       ...,
       [-0.13354129,  0.11853259, -0.07228647, ..., -0.0684135 ,
         0.25152667,  0.14634185],
       [-0.17379498,  0.12090287,  0.02751243, ..., -0.20319484,
         0.28882187,  0.20464262],
       [-0.14928744,  0.13425103, -0.10519732, ..., -0.06861092,
         0.23458761,  0.17386866]])

In [None]:
## Vectorizing with TF-IDF and Clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df2 = pd.read_csv("data2.csv")
texts = list(df2["description"])
NBR_CLUSTER_TEST = 50

vc_tf_idf = TfidfVectorizer(max_df=0.95, min_df=2,max_features=128)
vc_tf_idf.fit(texts)
vect_array = []
flat_array = []
wcss = []
for k in range(0,len(texts)):
    vector = vc_tf_idf.transform([texts[k]])
    vect_array.append(vector.toarray())
flat_array = [item for sublist in vect_array for item in sublist]

for i in range(1, NBR_CLUSTER_TEST):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(flat_array)
    wcss.append(kmeans.inertia_)
    
plt.plot(range(1, NBR_CLUSTER_TEST), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') 
plt.show()