In [2]:
# Import des librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data_T0 = pd.read_csv("QueryResults.csv")
print(data_T0.shape)

(30000, 8)


In [4]:
# Select our tags, represented as strings, and transform them into arrays of tags
tags = data_T0["Tags"]
clean_tags = tags.str.split("><").apply(
    lambda x: [a.strip("<").strip(">") for a in x])

# Use pandas' get_dummies to get dummy values 
# select only tags that appear over 500 times
tag_columns = pd.get_dummies(clean_tags.apply(pd.Series).stack()).groupby(level=0).sum()
all_tags = tag_columns.copy().astype(bool).sum(axis=0).sort_values(ascending=False)
top_tags = all_tags[all_tags > 50000//100]
top_tag_columns = tag_columns[top_tags.index]

data_T1 = pd.concat([data_T0, top_tag_columns], axis=1)
data_T1.head()

MemoryError: Unable to allocate 2.56 GiB for an array with shape (30000, 11434) and data type uint64

In [None]:
data_T1.describe()

In [None]:
# Tokenizer
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

def bracket_exterminator(sentence):
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in sentence:
        if i == '<':
            skip1c += 1
        elif i == '>' and skip1c > 0:
            skip1c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
    return ret

def code_exterminator(sentence):
    idx1 = sentence.find("<code>")
    idx2 = sentence.find("<\code>")
    res = sentence[:idx1] +  sentence[:idx2 + len("<\code>") + 1]
    return res

def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').replace('#', ' ').replace('. ', ' ').replace('=', ' ').replace('(', ' ').replace(')', ' ').replace('{', ' ').replace('}', ' ').replace(',', ' ').replace(';', ' ').replace('.', ' ')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', ':', '?', '(', ')','{','}']

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w2

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@")) 
    #                                   and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
from nltk.stem import WordNetLemmatizer

def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w


# Fonction de préparation du texte pour le bag of words avec lemmatization
def transform_bow_lem_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lem_w)
    return transf_desc_text

data_T1['Body'] = data_T1['Body'].apply(lambda x : code_exterminator(x))
data_T1['Text'] = data_T1['Title'] + " " + data_T1['Body']
data_T1['Text'] = data_T1['Text'].apply(lambda x : bracket_exterminator(x))
data_T1['sentence_bow_lem'] = data_T1['Text'].apply(lambda x : transform_bow_lem_fct(x))
data_T1.shape

In [None]:
import time

# Calcul Tsne, détermination des clusters et calcul ARI entre vrais catégorie et n° de clusters
def ARI_fct(features) :
    time1 = time.time()
    num_labels=len(l_cat)
    tsne = manifold.TSNE(n_components=2, perplexity=50, max_iter=2000, 
                                 init='random', learning_rate=100, random_state=42)
    X_tsne = tsne.fit_transform(features)
    
    # Détermination des clusters à partir des données après Tsne 
    cls = cluster.KMeans(n_clusters=num_labels, n_init=100, random_state=42)
    cls.fit(X_tsne)
    ARI = np.round(metrics.adjusted_rand_score(y_cat_num, cls.labels_),4)
    time2 = np.round(time.time() - time1,0)
    print("ARI : ", ARI, "time : ", time2)
    
    return ARI, X_tsne, cls.labels_


# visualisation du Tsne selon les vraies catégories et selon les clusters
def TSNE_visu_fct(X_tsne, y_cat_num, labels, ARI) :
    fig = plt.figure(figsize=(15,6))
    
    ax = fig.add_subplot(121)
    scatter = ax.scatter(X_tsne[:,0],X_tsne[:,1], c=y_cat_num, cmap='Set1')
    ax.legend(handles=scatter.legend_elements()[0], labels=l_cat, loc="best", title="Categorie")
    plt.title('Représentation des tweets par catégories réelles')
    
    ax = fig.add_subplot(122)
    scatter = ax.scatter(X_tsne[:,0],X_tsne[:,1], c=labels, cmap='Set1')
    ax.legend(handles=scatter.legend_elements()[0], labels=set(labels), loc="best", title="Clusters")
    plt.title('Représentation des tweets par clusters')
    
    plt.show()
    print("ARI : ", ARI)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2



cvect = CountVectorizer(stop_words='english', max_df=0.95, min_df=1)
ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=1)

feat = 'sentence_bow_lem'
cv_fit = cvect.fit(data_T1[feat])
ctf_fit = ctf.fit(data_T1[feat])

cv_transform = cvect.transform(data_T1[feat])  
ctf_transform = ctf.transform(data_T1[feat])  


In [None]:
from sklearn import manifold

tsne = manifold.TSNE(n_components=2, perplexity=50, max_iter=2000, init='random', learning_rate=100, random_state=42)
X_tsne = tsne.fit_transform(cv_transform)

fig = plt.figure(figsize=(15,6))
    
ax = fig.add_subplot(121)
scatter = ax.scatter(X_tsne[:,0],X_tsne[:,1], cmap='Set1')
ax.legend(handles=scatter.legend_elements()[0], loc="best", title="Categorie")
plt.title('Représentation des tweets par catégories réelles')

In [None]:

print("CountVectorizer : ")
print("-----------------")
ARI, X_tsne, labels = ARI_fct(cv_transform)
print()
print("Tf-idf : ")
print("--------")
ARI, X_tsne, labels = ARI_fct(ctf_transform)


In [None]:
 TSNE_visu_fct(X_tsne, y_cat_num, labels, ARI)