# I. PREPARATION DES DONNEES

In [65]:
import pandas as pd

# Lire le fichier CSV original
#data=pd.read_excel("repertoire_entreprise_2022.xlsx", dtype = {'SYSCOA1': str, 'NAEMAS': str}, sheet_name=0)
#data['SYSCOA1'] = data['SYSCOA1'].astype('category')
#data['NAEMAS'] = data['NAEMAS'].astype('category')

In [66]:
#data.head(10)  # Affiche les 10 premiers lignes du DataFrame

In [67]:
#data.info()   # Résumé du DataFrame, incluant le nombre de lignes, de colonnes, et le type de données de chaque colonne

In [68]:
#data.shape # Affiche la taille du DataFrame en fonction du nombre de ligne et de colonne

In [69]:
#print(data.columns) # Affiche la liste des colonnes dans le jeu de données

In [70]:
import matplotlib.pyplot as plt
import seaborn as sns

#plt.figure(figsize=(6, 4))
#sns.countplot(x='NAEMAS', data=data)
#plt.title('Distribution des classes')
#plt.show()

In [91]:
# Selection des variables en jeu
#df=data[["DESIGNATION_PRECISE_ACTIVITE","RAISON_SOCIALE","SYSCOA1", "NAEMAS"]]
#df.head(10)

In [92]:
# Détection des lignes manquantes 
#df.isnull().sum()

In [90]:
# Suppression des lignes manquantes
#df= df.dropna()

# Vérification 
#df.isnull().sum()

In [89]:
#df.shape

In [19]:
# On melange le jeu de donnée

from sklearn.utils import shuffle

# Utiliser la fonction shuffle() pour mélanger les lignes du DataFrame
df = shuffle(df, random_state=42)

# Afficher le DataFrame mélangé
df.head(10)

Unnamed: 0,DESIGNATION_PRECISE_ACTIVITE,RAISON_SOCIALE,SYSCOA1,NAEMAS
3300,BAR - RESTAURANT,LE TACOMA - SARL,33002,P00020
4433,ACTIVITES POUR LA SANTE DES HOMMES,CRE (CABINET DE RADIOLOGIE ET D'ECHOGRAPHIE),41001,X00010
13808,COMMERCE,ETS AMADOU DIONGUE FALL,31003,N00030
3256,BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR,IGUANE CAFE - CUBAN BAR (CHRISTIAN J. E. BRIAN...,33002,P00020
7879,NETTOYAGE GENERAL PROFESSIONNEL,ENTREPRISE SERIGNE FALLOU SUARL,38003,U00040
7004,PEINTRE - ETANCHEITE,EPE (ENTREPRISE DE PEINTURE ET D'ETACHEITE) JE...,30002,M00030
24394,CONSEIL EN INFORMATIQUE,3TI RESEARCH SENEGAL - SARL,38002,Q00050
10614,MENUISERIE METALLIQUE,ENTREPRISE GENERAL DIALLO ET FRERES,24002,N00030
7066,VENTES DE VEHICULES,TATA AFRICA SENEGAL SARL,31001,N00010
23788,RESTAURATION,LE GLACIER DE LA PETITE COETE SUARL,33002,P00020


# II. TECHNIQUES CLASSIQUES

## II.1 Jeu de données avec "DESIGNATION_PRECISE_ACTIVITE" comme variable explicative

In [88]:
# Selection des variables en jeu
#df1=df[["DESIGNATION_PRECISE_ACTIVITE","NAEMAS"]]
#df1.head(10)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# vectorisation des descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df1['DESIGNATION_PRECISE_ACTIVITE'])

y = df['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
          }
# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.680654   0.690754  0.680654  0.650115
Support Vector Machine  0.682066   0.689243  0.682066  0.656035
Random Forest           0.676215   0.664568  0.676215  0.657116
Naive Bayes             0.623563   0.669032  0.623563  0.558086


## II.2 Jeu de données avec "RAISON_SOCIALE" comme variable explicative

In [87]:
# Selection des variables en jeu
#df2=df[["RAISON_SOCIALE","NAEMAS"]]
#df2.head(10)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Convertir les valeurs numériques en chaînes de caractères
#df2['SYSCOA1'] = df2['SYSCOA1'].astype(str)

# vectorisation des descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df2['RAISON_SOCIALE'])

y = df2['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
          }
# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.458947   0.580415  0.458947  0.393517
Support Vector Machine  0.456123   0.585043  0.456123  0.388387
Random Forest           0.449869   0.450949  0.449869  0.401761
Naive Bayes             0.321565   0.712694  0.321565  0.202442


## II.3 Jeu de données avec "SYSCOA1" comme variable explicative

In [86]:
# Selection des variables en jeu
#df3=df[["SYSCOA1","NAEMAS"]]
#df3.head(10)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# vectorisation des descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df3['SYSCOA1'])

y = df3['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
          }
# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.680855   0.708136  0.680855  0.622434
Support Vector Machine  0.681461   0.722483  0.681461  0.623712
Random Forest           0.682066   0.722538  0.682066  0.624798
Naive Bayes             0.671979   0.649923  0.671979  0.607651


## II.5 Jeu de données avec DESIGNATION_PRECISE_ACTIVITE et RAISON_SOCIALE comme variables explicatives

In [85]:
# Selection des variables en jeu
#df5=df[["DESIGNATION_PRECISE_ACTIVITE","RAISON_SOCIALE","NAEMAS"]]
#df5.head(10)

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prétraitement des colonnes de texte en utilisant TF-IDF
vectorizer1 = TfidfVectorizer()
X_texte1 = vectorizer1.fit_transform(df5['DESIGNATION_PRECISE_ACTIVITE'])

vectorizer2 = TfidfVectorizer()
X_texte2 = vectorizer2.fit_transform(df5['RAISON_SOCIALE'])

# Prétraitement de la colonne numérique
#scaler = StandardScaler()
#X_numerique = df1['SYSCOA1']

# Combinaison des caractéristiques traitées
from scipy.sparse import hstack
X = hstack([X_texte1, X_texte2])
# Séparation des caractéristiques et de la cible
y = df5['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
    }

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.687513   0.693294  0.687513  0.659401
Support Vector Machine  0.674803   0.693514  0.674803  0.639782
Random Forest           0.688723   0.678948  0.688723  0.658652
Naive Bayes             0.588662   0.710284  0.588662  0.516820


## II.6 Jeu de données avec RAISON_SOCIALE et SYSCOA1 comme variables explicatives

In [84]:
# Selection des variables en jeu
#df6=df[["RAISON_SOCIALE","SYSCOA1","NAEMAS"]]
#df6.head(10)

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Appliquer le TfidfVectorizer
#vectorizer2 = TfidfVectorizer()
#X_texte2 = vectorizer2.fit_transform(df6['SYSCOA1'])

#print(X_texte2)

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prétraitement des colonnes de texte en utilisant TF-IDF

vectorizer1 = TfidfVectorizer()
X_texte1 = vectorizer1.fit_transform(df6['RAISON_SOCIALE'])

vectorizer2 = TfidfVectorizer()
X_texte2 = vectorizer2.fit_transform(df6['SYSCOA1'])

# Prétraitement de la colonne numérique
#scaler = StandardScaler()
#X_numerique = df1['SYSCOA1']

# Combinaison des caractéristiques traitées
from scipy.sparse import hstack
X = hstack([X_texte1, X_texte2])
# Séparation des caractéristiques et de la cible
y = df6['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
    }

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.718378   0.711067  0.718378  0.694181
Support Vector Machine  0.717168   0.743711  0.717168  0.688591
Random Forest           0.702643   0.691289  0.702643  0.683751
Naive Bayes             0.610652   0.665394  0.610652  0.539211


# II.7 Jeu de données avec DESIGNATION_PRECISE_ACTIVITE, SYSCOA1 comme variables explicatives

In [83]:
# Selection des variables en jeu
#df7=df[["DESIGNATION_PRECISE_ACTIVITE","SYSCOA1","NAEMAS"]]
#df7.head(10)

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Prétraitement des colonnes de texte en utilisant TF-IDF

vectorizer1 = TfidfVectorizer()
X_texte1 = vectorizer1.fit_transform(df7['DESIGNATION_PRECISE_ACTIVITE'])


vectorizer2 = TfidfVectorizer()
X_texte2 = vectorizer2.fit_transform(df7['SYSCOA1'])

# Prétraitement de la colonne numérique
#scaler = StandardScaler()
#X_numerique = df1['SYSCOA1']

# Combinaison des caractéristiques traitées
from scipy.sparse import hstack
X = hstack([X_texte1, X_texte2])
# Séparation des caractéristiques et de la cible
y = df7['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
    }

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.736131   0.732563  0.736131  0.717525
Support Vector Machine  0.736534   0.735122  0.736534  0.717603
Random Forest           0.723623   0.715775  0.723623  0.708298
Naive Bayes             0.705669   0.707184  0.705669  0.667053


## II.8 Jeu de données avec DESIGNATION_PRECISE_ACTIVITE, RAISON_SOCIALE et SYSCOA1 comme variables explicatives

In [82]:
#df8=df[["DESIGNATION_PRECISE_ACTIVITE","RAISON_SOCIALE","SYSCOA1","NAEMAS"]]
#df8.head(10)

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prétraitement des colonnes de texte en utilisant TF-IDF
vectorizer1 = TfidfVectorizer()
X_texte1 = vectorizer1.fit_transform(df8['DESIGNATION_PRECISE_ACTIVITE'])

vectorizer2 = TfidfVectorizer()
X_texte2 = vectorizer2.fit_transform(df8['RAISON_SOCIALE'])

# Conversion de "SYSCOA1" en chaîne de caractères et transformation TF-IDF
vectorizer3 = TfidfVectorizer()
X_texte3 = vectorizer3.fit_transform(df8['SYSCOA1'].astype('str'))

# Combinaison des caractéristiques traitées

X = hstack([X_texte1, X_texte2,X_texte3])

# Séparation des caractéristiques et de la cible
y = df8['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs'),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB()
    }

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.741981   0.735427  0.741981  0.723665
Support Vector Machine  0.737543   0.737745  0.737543  0.714522
Random Forest           0.733710   0.722745  0.733710  0.710667
Naive Bayes             0.680654   0.706115  0.680654  0.626532


# II. METHODE DE COMPARAISON DES VECTEURS DE MOTS

In [35]:
import nltk
from nltk.corpus import stopwords

# Télécharger les stop words de NLTK (à faire une seule fois)
nltk.download('stopwords')

# Récupérer la liste des stop words en français
stop_words = set(stopwords.words('french'))

# Fonction pour filtrer les stop words dans une chaîne de caractères

def remove_stopwords(text, stop_words):
    
    words = text.split()  # Tokenisation
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)  # Reconstituer la chaîne de caractères

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Vérification de la fonction remove_stopwords(text, stop_words)

text1="Je suis un Sénégalais gentil, humble et avec beaucoup de caractère."

remove_stopwords(text1, stop_words)

'Sénégalais gentil, humble beaucoup caractère.'

In [81]:
# installation de wordcloud dans l'environnement
#!pip install wordcloud

In [38]:
df4=df[["DESIGNATION_PRECISE_ACTIVITE","SYSCOA1","NAEMAS"]]
df4[df4["NAEMAS"] == 'Q00050']

Unnamed: 0,DESIGNATION_PRECISE_ACTIVITE,SYSCOA1,NAEMAS
24394,CONSEIL EN INFORMATIQUE,038002,Q00050
15706,SERVICES INFORMATIQUES,038002,Q00050
17052,PROGRAMMATION INFORMATIQUE,038002,Q00050
8374,SERVICES INFORMATIQUES ET DE TRAITEMENT DE DON...,038002,Q00050
11729,SECURITE ELECTRONIQUE,038002,Q00050
...,...,...,...
20821,COMMERCE ALIMENTS VOLAIL ET BETAIL,031003,Q00050
20397,CONSEILS ET AUTRES ACTIVITES INFORMATIQUES,038002,Q00050
25092,PROGRAMMATION INFORMATIQUE,038002,Q00050
15094,NETTOYAGE INDUSTRIEL,038003,Q00050


In [39]:

# Récupération du code 'NAEMAS' sous forme de liste
Code = list(df4['NAEMAS'].unique())

# Initialiser un dictionnaire vide pour stocker les résultats
Dicto_ = {}

# Regroupement des activités ayant le même code NAEMAS
for k in Code:
     desc = df4[df4['NAEMAS'] == k]['DESIGNATION_PRECISE_ACTIVITE'].tolist()

    # Appliquer la fonction remove_stopwords() à chaque chaîne de caractères pour chaque liste du dictionnaire

     filtered_list_of_strings = [remove_stopwords(sentence, stop_words) for sentence in desc]
    
    # Associer la liste filtered_list_of_strings au clé correspondant dans le dictionnaire
     Dicto_[k] = filtered_list_of_strings 


In [80]:
# Afficher le résultat
#print(Dicto_)

## II.1. Vecteurs redondants

In [41]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Liste des scores

Scores = []

# Liste des vecteurs
vecteurs = []

# Remplir la liste des vecteurs avec les activités combinées
for k, activities in Dicto_.items():
    vect_k = ' '.join(activities)
    vecteurs.append(vect_k)

# Utiliser le TfidfVectorizer une seule fois pour tous les textes
vectorizer = TfidfVectorizer()

# Pour chaque désignation dans le DataFrame
for row in df4['DESIGNATION_PRECISE_ACTIVITE']:
    # Convertir les vecteurs en une matrice TF-IDF
    vecteurs_matrice = vectorizer.fit_transform(vecteurs + [row])
    
    # Calculer la similarité cosinus
    similarites = cosine_similarity(vecteurs_matrice)
    
    # Extraire les similarités du vecteur à comparer
    similarites_vecteur_a_comparer = similarites[-1][:-1]
    
    # Stocker les scores dans la liste
    Scores.append(similarites_vecteur_a_comparer)

# Convertir la liste des scores en DataFrame
similarity_df = pd.DataFrame(Scores, index=df4['DESIGNATION_PRECISE_ACTIVITE'])

# Utiliser les codes NAEMAS comme noms de colonnes
similarity_df.columns = [f'score_{code}' for code in Dicto_.keys()]

# Afficher le DataFrame de similarité
similarity_df.head(10)


Unnamed: 0_level_0,score_P00020,score_X00010,score_N00030,score_U00040,score_M00030,score_Q00050,score_N00010,score_T00030,score_P00010,score_O00050,...,score_E00010,score_Z00000,score_F00010,score_A00020,score_k00010,score_J00180,score_A00050,score_C00020,score_J00010,score_B00040
DESIGNATION_PRECISE_ACTIVITE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BAR - RESTAURANT,0.724888,0.0,0.001326,0.0,0.0,0.0,0.0,0.0,0.234588,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTIVITES POUR LA SANTE DES HOMMES,0.000279,0.409609,0.00358,0.022827,0.011426,0.035151,0.001093,0.024169,0.005305,0.008176,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
COMMERCE,0.013131,0.005,0.631783,0.06802,0.036956,0.043159,0.302889,0.042475,0.006929,0.023295,...,0.0,0.130177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR,0.237453,0.0,0.0007,0.001495,0.0,0.0,0.0,0.0,0.113922,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NETTOYAGE GENERAL PROFESSIONNEL,0.003625,0.002128,0.102825,0.060735,0.011158,0.005104,0.024844,0.024177,0.000861,0.00217,...,0.0,0.256252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PEINTRE - ETANCHEITE,0.0,0.0,0.0,0.0,0.017344,0.0,0.006135,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONSEIL EN INFORMATIQUE,0.001202,0.000856,0.009745,0.02246,0.0,0.236728,0.002443,0.121046,0.0,0.003738,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MENUISERIE METALLIQUE,0.0,0.0,0.001822,0.002214,0.028089,0.0,0.00453,0.001248,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VENTES DE VEHICULES,0.0,0.000608,0.008128,0.002333,0.0,0.004198,0.230084,0.000478,0.002106,0.005694,...,0.0,0.0,0.042061,0.0,0.0,0.162927,0.0,0.0,0.0,0.0
RESTAURATION,0.55594,0.0,0.003507,0.005581,0.0,0.0,0.0,0.001662,0.153819,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Ajouter les colonnes SYSCOA1 et NAEMAS
#similarity_df['SYSCOA1'] = df4['SYSCOA1'].values
#similarity_df['NAEMAS'] = df4['NAEMAS'].values
# Transformer la colonne SYSCOA1 en catégorie
#similarity_df['SYSCOA1'] = similarity_df['SYSCOA1'].astype('category')
# Afficher le DataFrame de similarité avec les colonnes ajoutées
#similarity_df

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Séparation des caractéristiques et de la cible
X = similarity_df.iloc[:, :-1]
y = similarity_df['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation des données (optionnelle pour Naive Bayes, mais nécessaire pour d'autres modèles)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    #'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.697398,0.692819,0.697398,0.677977
Support Vector Machine,0.685092,0.675754,0.685092,0.660046
Random Forest,0.703248,0.691237,0.703248,0.686028


## II.2. Vecteurs purs

In [44]:
from collections import Counter

# Fusionner toutes les listes pour compter les occurrences des mots
tous_les_mots = [mot for mots in  Dicto_.values() for mot in mots]
compte_mots = Counter(tous_les_mots)

# Identifier le mot le plus fréquent dans chaque liste
mots_a_conserver = {}
for key, liste in  Dicto_.items():
    compteur = Counter(liste)
    mot_plus_frequent = max(compteur, key=lambda mot:(compteur[mot], compte_mots[mot]))
    mots_a_conserver[mot_plus_frequent] = key

# Supprimer les mots des autres listes s'ils sont plus fréquents ailleurs
for mot, liste_a_conserver_key in mots_a_conserver.items():
    for key, liste in  Dicto_.items():
        if key != liste_a_conserver_key and mot in liste:
            liste.remove(mot)
    

In [77]:
#print("Dictionnaire modifié :")
#for key, liste in  Dicto_.items():
    #print(f"{key} :", liste)

## II.3. NUAGE DES MOTS

In [76]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Parcourir chaque clé et générer un nuage de mots pour chaque groupe

for key, activities in Dicto_.items():
    
    # Combiner les activités en une seule chaîne de texte
    text = ' '.join(activities)
    
    # Créer un objet WordCloud
    
    #wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    
    # Afficher le nuage de mots
    
    #plt.figure(figsize=(10, 5))
    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.title(f'Nuage de mots pour la clé {key}', pad=30)
    #plt.axis('off')
    #plt.show()

In [48]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dictionnaire des scores
# Dictionnaire des scores
Scores = []

# Liste des vecteurs
vecteurs = []

# Remplir la liste des vecteurs avec les activités combinées
for k, activities in Dicto_.items():
    vect_k = ' '.join(activities)
    vecteurs.append(vect_k)

# Utiliser le TfidfVectorizer une seule fois pour tous les textes
vectorizer = TfidfVectorizer()

# Pour chaque désignation dans le DataFrame
for row in df8['DESIGNATION_PRECISE_ACTIVITE']:
    # Convertir les vecteurs en une matrice TF-IDF
    vecteurs_matrice = vectorizer.fit_transform(vecteurs + [row])
    
    # Calculer la similarité cosinus
    similarites = cosine_similarity(vecteurs_matrice)
    
    # Extraire les similarités du vecteur à comparer
    similarites_vecteur_a_comparer = similarites[-1][:-1]
    
    # Stocker les scores dans la liste
    Scores.append(similarites_vecteur_a_comparer)

# Convertir la liste des scores en DataFrame
similarity_df = pd.DataFrame(Scores, index=df8['DESIGNATION_PRECISE_ACTIVITE'])

# Utiliser les codes NAEMAS comme noms de colonnes
similarity_df.columns = [f'score_{code}' for code in Dicto_.keys()]

# Afficher le DataFrame de similarité
similarity_df.head(10)


Unnamed: 0_level_0,score_P00020,score_X00010,score_N00030,score_U00040,score_M00030,score_Q00050,score_N00010,score_T00030,score_P00010,score_O00050,...,score_E00010,score_Z00000,score_F00010,score_A00020,score_k00010,score_J00180,score_A00050,score_C00020,score_J00010,score_B00040
DESIGNATION_PRECISE_ACTIVITE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BAR - RESTAURANT,0.718934,0.0,0.001294,0.0,0.0,0.0,0.0,0.0,0.225142,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTIVITES POUR LA SANTE DES HOMMES,0.000273,0.436109,0.003315,0.022157,0.010956,0.034579,0.001073,0.02432,0.00502,0.007891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
COMMERCE,0.012215,0.004485,0.650414,0.069462,0.034999,0.043196,0.315275,0.043712,0.005262,0.021794,...,0.0,0.136509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR,0.235565,0.0,0.000684,0.001462,0.0,0.0,0.0,0.0,0.109343,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NETTOYAGE GENERAL PROFESSIONNEL,0.003596,0.00201,0.10034,0.059373,0.010852,0.005044,0.024746,0.024032,0.000826,0.002053,...,0.0,0.254725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PEINTRE - ETANCHEITE,0.0,0.0,0.0,0.0,0.016869,0.0,0.006111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONSEIL EN INFORMATIQUE,0.001193,0.000809,0.00951,0.021956,0.0,0.233945,0.002433,0.120319,0.0,0.003536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MENUISERIE METALLIQUE,0.0,0.0,0.001175,0.002165,0.027319,0.0,0.004512,0.001241,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VENTES DE VEHICULES,0.0,0.000574,0.007932,0.002281,0.0,0.004149,0.229166,0.000475,0.002022,0.005387,...,0.0,0.0,0.042058,0.0,0.0,0.162927,0.0,0.0,0.0,0.0
RESTAURATION,0.565044,0.0,0.003115,0.00419,0.0,0.0,0.0,0.001691,0.147563,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Ajouter les colonnes SYSCOA1 et NAEMAS
#similarity_df['SYSCOA1'] = df8['SYSCOA1'].values
#similarity_df['NAEMAS'] = df8['NAEMAS'].values
# Transformer la colonne SYSCOA1 en catégorie
#similarity_df['SYSCOA1'] = similarity_df['SYSCOA1'].astype('category')
# Afficher le DataFrame de similarité avec les colonnes ajoutées
#similarity_df

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Séparation des caractéristiques et de la cible
X = similarity_df.iloc[:, :-1]
y = similarity_df['NAEMAS']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation des données (optionnelle pour Naive Bayes, mais nécessaire pour d'autres modèles)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    #'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     0.695784   0.690310  0.695784  0.676185
Support Vector Machine  0.683881   0.675245  0.683881  0.658997
Random Forest           0.702441   0.693765  0.702441  0.685327


- Se limiter au 3 premier chiffre du code NAMEAS pour faire un regroupement
- voir la possibilité de regrouper certaines classes minoritaires
- Optimisation des modéles avec des hyparamétres
- Utiliser les labels du sycoa à la place des codes
- Transformer les codes syscoa en catégories
- utliser les transformateurs en deep learning

# Utilisation des 3 premiers caractères du code NAMEAS pour faire un regroupement

In [74]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Supposons que df est votre DataFrame et que 'NAEMAS' et 'DESIGNATION_PRECISE_ACTIVITE' sont les colonnes pertinentes

# Extraire les 3 premiers chiffres du code NAEMAS
#df['NAEMAS_3'] = df['NAEMAS'].astype(str).str[:3]

# Regrouper les descriptions par les 3 premiers chiffres du code NAEMAS
#grouped_df = df.groupby('NAEMAS_3')['DESIGNATION_PRECISE_ACTIVITE'].apply(lambda x: ' '.join(x)).reset_index()

# Le grouped_df['NAEMAS_3'] contient les étiquettes correspondantes

# Vous pouvez accéder à ces vecteurs et leurs étiquettes comme ceci :
#naemas_labels = grouped_df['NAEMAS_3']
#print(naemas_labels)


In [52]:
# Transformer les descriptions combinées en vecteurs de mots
#vectorizer = TfidfVectorizer()
#X = vectorizer.fit_transform(grouped_df['DESIGNATION_PRECISE_ACTIVITE'])
# Maintenant X contient les vecteurs de mots
#print(X)

  (0, 2005)	0.01450903554265104
  (0, 125)	0.010890647702453314
  (0, 1294)	0.006297421338689276
  (0, 2678)	0.009034626073343198
  (0, 2604)	0.010890647702453314
  (0, 1875)	0.012907873416082093
  (0, 1373)	0.01450903554265104
  (0, 1051)	0.0077770676715594724
  (0, 3261)	0.006824730358140976
  (0, 1660)	0.0077770676715594724
  (0, 1582)	0.01450903554265104
  (0, 3437)	0.012907873416082093
  (0, 2299)	0.01450903554265104
  (0, 1371)	0.012907873416082093
  (0, 1988)	0.012907873416082093
  (0, 2413)	0.0077770676715594724
  (0, 3126)	0.01450903554265104
  (0, 577)	0.010170668681428173
  (0, 2227)	0.006297421338689276
  (0, 421)	0.01450903554265104
  (0, 1171)	0.005039862936905551
  (0, 332)	0.005039862936905551
  (0, 123)	0.01450903554265104
  (0, 2402)	0.009561935092794899
  (0, 444)	0.007117379160507743
  :	:
  (26, 3350)	0.055654800405548446
  (26, 2621)	0.05560819311885204
  (26, 1230)	0.014174900800305146
  (26, 2619)	0.03844213555849526
  (26, 2586)	0.010328245379998567
  (26, 2032

In [53]:
df9=df[["DESIGNATION_PRECISE_ACTIVITE", "SYSCOA1", "NAEMAS", "NAEMAS_3"]]
df9[df9["NAEMAS_3"] == 'Q00']

Unnamed: 0,DESIGNATION_PRECISE_ACTIVITE,SYSCOA1,NAEMAS,NAEMAS_3
24394,CONSEIL EN INFORMATIQUE,038002,Q00050,Q00
15706,SERVICES INFORMATIQUES,038002,Q00050,Q00
18094,SERVICE AUX ENTREPRISES COMMUNICATION & MEDIAS,034006,Q00060,Q00
6165,MARKETING ON LINE,038002,Q00060,Q00
17052,PROGRAMMATION INFORMATIQUE,038002,Q00050,Q00
...,...,...,...,...
17015,TELECOMMUNICATION,035002,Q00040,Q00
19302,PRESTATIONS DE SERVICES,032001,Q00040,Q00
25092,PROGRAMMATION INFORMATIQUE,038002,Q00050,Q00
15094,NETTOYAGE INDUSTRIEL,038003,Q00050,Q00


In [54]:
# Récupération du code 'NAEMAS' sous forme de liste
Code_3 = list(df9['NAEMAS_3'].unique())

# Initialiser un dictionnaire vide pour stocker les résultats
Dicto_ = {}

# Regroupement des activités ayant le même code NAEMAS
for k in Code_3:
     desc = df9[df9['NAEMAS_3'] == k]['DESIGNATION_PRECISE_ACTIVITE'].tolist()

    # Appliquer la fonction remove_stopwords() à chaque chaîne de caractères pour chaque liste du dictionnaire

     filtered_list_of_strings = [remove_stopwords(sentence, stop_words) for sentence in desc]
    
    # Associer la liste filtered_list_of_strings au clé correspondant dans le dictionnaire
     Dicto_[k] = filtered_list_of_strings 


In [55]:
# Afficher le résultat
#print(Dicto_)

{'P00': ['BAR - RESTAURANT', 'BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR', 'RESTAURATION', 'HÉBERGEMENT', 'AUBERGE - BAR - DISCOTHEQUE', 'RESTAURATION', 'TOURISTIQUE', 'BAR RESTAURANT', 'HOTELS ACTIVITES ANALOGUES', 'HOTEL ETABLISSEMENTS ANALOGUES', 'BAR - RESTAURANT', 'BAR - RESTAURANT', 'RESTAURANT PATISSERIE', 'HEBERGEMENT', 'BAR', 'RESTAURANT', 'RESTAURATION', 'HOTELLERIE', 'RESTAURANT', 'BAR - RESTAURANT', 'CENTRE PECHE SPORTIVE - BAR - RESTAURANT', 'BTP', 'HOTELELLERIE', 'RESTAURANT & BARS', 'HOTELLERIE & RESTAURATION', 'HOTELLERIE', 'HOTELLERIE', 'RESTAURANT', 'HOTELLERIE', 'RESTAURATION', 'HOTELRIE', 'HOTELLERIE RESTAURATION', 'INDUSTRIE HOTELLERIE', 'REPRESENTATION ARTISTIQUE CULTURELLE COMMERCIALE', 'HOTEL', 'ACTIVITES AGENCES IMMOBILIERES', 'RESTAURATION', 'HEBERGEMENT', 'HÔTELLERIE', 'RESTAURATION', 'HOTELLERIE', 'TOURISME HOTELLERIE', 'CAMPEMENT TOURISTIQUE', 'RESTAURANT', 'ACTIVITE RESTAURATION', 'BTP', 'CAMPEMENT TOURISTIQUE', 'AUBERGE', 'HOTELLERIE', 'HOTELLERIE - BAR - 

## III.1. Vecteurs redondants

In [56]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Liste des scores

Scores = []

# Liste des vecteurs
vecteurs = []

# Remplir la liste des vecteurs avec les activités combinées
for k, activities in Dicto_.items():
    vect_k = ' '.join(activities)
    vecteurs.append(vect_k)

# Utiliser le TfidfVectorizer une seule fois pour tous les textes
vectorizer = TfidfVectorizer()

# Pour chaque désignation dans le DataFrame
for row in df9['DESIGNATION_PRECISE_ACTIVITE']:
    # Convertir les vecteurs en une matrice TF-IDF
    vecteurs_matrice = vectorizer.fit_transform(vecteurs + [row])
    
    # Calculer la similarité cosinus
    similarites = cosine_similarity(vecteurs_matrice)
    
    # Extraire les similarités du vecteur à comparer
    similarites_vecteur_a_comparer = similarites[-1][:-1]
    
    # Stocker les scores dans la liste
    Scores.append(similarites_vecteur_a_comparer)

# Convertir la liste des scores en DataFrame
similarity_df = pd.DataFrame(Scores, index=df9['DESIGNATION_PRECISE_ACTIVITE'])

# Utiliser les codes NAEMAS comme noms de colonnes
similarity_df.columns = [f'score_{code_3}' for code_3 in Dicto_.keys()]

# Afficher le DataFrame de similarité
similarity_df.head(10)


Unnamed: 0_level_0,score_P00,score_X00,score_N00,score_U00,score_M00,score_Q00,score_T00,score_O00,score_W00,score_J00,...,score_ZA0,score_D00,score_Y00,score_E00,score_C00,score_G00,score_H00,score_L00,score_Z00,score_k00
DESIGNATION_PRECISE_ACTIVITE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BAR - RESTAURANT,0.60065,0.0,0.000983,0.000811,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.242023,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTIVITES POUR LA SANTE DES HOMMES,0.003545,0.37429,0.003107,0.025273,0.00567,0.035333,0.027335,0.006283,0.008664,0.008201,...,0.012143,0.001229,0.061334,0.00624,0.0,0.0,0.0,0.00224,0.0,0.0
COMMERCE,0.015126,0.006032,0.691883,0.071751,0.03894,0.073747,0.051826,0.030377,0.009321,0.102728,...,0.029784,0.02743,0.044831,0.017405,0.0,0.0,0.070681,0.037485,0.151906,0.0
BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR,0.226487,0.0,0.000528,0.001199,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.165763,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NETTOYAGE GENERAL PROFESSIONNEL,0.003754,0.002446,0.11791,0.049007,0.004559,0.015437,0.023351,0.003844,0.027889,0.015341,...,0.185822,0.0,0.009643,0.0,0.0,0.0,0.0,0.027893,0.252401,0.0
PEINTRE - ETANCHEITE,0.0,0.0,0.000568,0.0,0.004138,0.0,0.0,0.0,0.0,0.006568,...,0.0,0.0,0.015484,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CONSEIL EN INFORMATIQUE,0.000769,0.000736,0.008008,0.017749,0.001127,0.176516,0.098877,0.002227,0.004335,0.004933,...,0.023059,0.0,0.010946,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MENUISERIE METALLIQUE,0.0,0.0,0.001973,0.002036,0.012594,0.0,0.001247,0.0,0.001858,0.27634,...,0.0,0.0,0.0,0.008096,0.0,0.0,0.0,0.0,0.0,0.0
VENTES DE VEHICULES,0.001172,0.000608,0.019731,0.011052,0.000302,0.005289,0.0009,0.009344,0.001326,0.012851,...,0.005002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RESTAURATION,0.434841,0.0,0.002482,0.004373,0.001004,0.005045,0.001498,0.0,0.004465,0.0,...,0.0,0.0,0.01503,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Ajouter les colonnes SYSCOA1 et NAEMAS
#similarity_df['SYSCOA1'] = df9['SYSCOA1'].values
#similarity_df['NAEMAS_3'] = df9['NAEMAS_3'].values
# Transformer la colonne SYSCOA1 en catégorie
#similarity_df['SYSCOA1'] = similarity_df['SYSCOA1'].astype('category')
# Afficher le DataFrame de similarité avec les colonnes ajoutées
#similarity_df

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Séparation des caractéristiques et de la cible
X = similarity_df.iloc[:, :-1]
y = similarity_df['NAEMAS_3']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation des données (optionnelle pour Naive Bayes, mais nécessaire pour d'autres modèles)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    #'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.772241,0.781299,0.772241,0.768613
Support Vector Machine,0.789994,0.796519,0.789994,0.788824
Random Forest,0.802703,0.806522,0.802703,0.801305


## III.2. VECTEURS PURS

In [69]:
from collections import Counter

# Fusionner toutes les listes pour compter les occurrences des mots
tous_les_mots = [mot for mots in  Dicto_.values() for mot in mots]
compte_mots = Counter(tous_les_mots)

# Identifier le mot le plus fréquent dans chaque liste
mots_a_conserver = {}
#for key, liste in  Dicto_.items():
  #  compteur = Counter(liste)
  #  mot_plus_frequent = max(compteur, key=lambda mot:(compteur[mot], compte_mots[mot]))
   # mots_a_conserver[mot_plus_frequent] = key

# Supprimer les mots des autres listes s'ils sont plus fréquents ailleurs
#for mot, liste_a_conserver_key in mots_a_conserver.items():
    #for key, liste in  Dicto_.items():
    #    if key != liste_a_conserver_key and mot in liste:
     #       liste.remove(mot)
    

In [70]:
#print("Dictionnaire modifié :")
#for key, liste in  Dicto_.items():
 #   print(f"{key} :", liste)

Dictionnaire modifié :
P00 : ['BAR - RESTAURANT', 'BAR - DISCOTHEQUE - ANIMATION - CUBAN BAR', 'RESTAURATION', 'HÉBERGEMENT', 'AUBERGE - BAR - DISCOTHEQUE', 'RESTAURATION', 'TOURISTIQUE', 'BAR RESTAURANT', 'HOTELS ACTIVITES ANALOGUES', 'HOTEL ETABLISSEMENTS ANALOGUES', 'BAR - RESTAURANT', 'BAR - RESTAURANT', 'RESTAURANT PATISSERIE', 'HEBERGEMENT', 'BAR', 'RESTAURANT', 'RESTAURATION', 'HOTELLERIE', 'RESTAURANT', 'BAR - RESTAURANT', 'CENTRE PECHE SPORTIVE - BAR - RESTAURANT', 'HOTELELLERIE', 'RESTAURANT & BARS', 'HOTELLERIE & RESTAURATION', 'HOTELLERIE', 'HOTELLERIE', 'RESTAURANT', 'HOTELLERIE', 'RESTAURATION', 'HOTELRIE', 'HOTELLERIE RESTAURATION', 'INDUSTRIE HOTELLERIE', 'REPRESENTATION ARTISTIQUE CULTURELLE COMMERCIALE', 'HOTEL', 'ACTIVITES AGENCES IMMOBILIERES', 'RESTAURATION', 'HEBERGEMENT', 'HÔTELLERIE', 'RESTAURATION', 'HOTELLERIE', 'TOURISME HOTELLERIE', 'CAMPEMENT TOURISTIQUE', 'RESTAURANT', 'ACTIVITE RESTAURATION', 'CAMPEMENT TOURISTIQUE', 'AUBERGE', 'HOTELLERIE', 'HOTELLERIE -

## III.3. NUAGE DES MOTS

In [72]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Parcourir chaque clé et générer un nuage de mots pour chaque groupe

for key, activities in Dicto_.items():
    
    # Combiner les activités en une seule chaîne de texte
    text = ' '.join(activities)
    
    # Créer un objet WordCloud
    
    #wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    
    # Afficher le nuage de mots
    
    #plt.figure(figsize=(10, 5))
    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.title(f'Nuage de mots pour la clé {key}', pad=30)
    #plt.axis('off')
    #plt.show()

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Séparation des caractéristiques et de la cible
X = similarity_df.iloc[:, :-1]
y = similarity_df['NAEMAS_3']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardisation des données (optionnelle pour Naive Bayes, mais nécessaire pour d'autres modèles)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Liste des modèles
models = {
    'Logistic Regression': LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    #'Naive Bayes': MultinomialNB()
}

# Dictionnaire pour stocker les résultats
results = {}

for model_name, model in models.items():
    # Entraînement du modèle
    model.fit(X_train, y_train)
    # Prédiction sur l'ensemble de test
    y_pred = model.predict(X_test)
    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    # Stockage des résultats
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Affichage des résultats
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.772241,0.781299,0.772241,0.768613
Support Vector Machine,0.789994,0.796519,0.789994,0.788824
Random Forest,0.804519,0.808906,0.804519,0.802929


# IV. Possibilité de regrouper certaines classes minoritaires