# Utilisation de l'API Azure Text Analytics

#### 1-Lecture de la clé d'API et instantiation du client virtuel : 

In [1]:
k=open("macle.txt","r")
key = k.readline()
endpoint = "https://apita.cognitiveservices.azure.com/"

In [3]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

def authenticate_client():
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, 
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

In [4]:
# cd C:\\Users\\Wince\\Downloads\\OC\\Projet_7\\data

#### 2-Importation du fichier contenant les twits à tester : 

Ouverture du fichier échantillon de test et preprocessing : 

In [6]:
import pandas as pd
import contractions
import re
import string
import nltk
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers

print("imports_done")

def string_cleanup(text):
    output = contractions.fix(text)
    output = re.sub(r'http?://\S+', '', output, flags=re.MULTILINE)
    output = re.sub(r'@\w+', '', output, flags=re.MULTILINE)
    output = [char.lower() if char not in string.punctuation else ' ' for char in output ]
    output = ''.join(output)
    return output

def lemmatize(tokenlist): 
    wnl = WordNetLemmatizer()
    out = [wnl.lemmatize(word) for word in tokenlist]
    return out    

def text_preprocessing(df):
    print('Text preprocessing initiated')
    print('string cleanup in progress...')
    df.loc[:,'cleaned_text'] = df.text.apply(string_cleanup)
    print('string cleanup OK, now tokenizing')
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    df.loc[:, 'tokens'] = df.loc[
        :, 'cleaned_text'].apply(lambda x: tokenizer.tokenize(x))
    print('Tokenizing OK - now lemmatizing...')
    df.loc[:, 'lems'] = df.loc[:, 'tokens'].apply(lemmatize)
    df.loc[:, 'lems'] = df.lems.apply(' '.join)
    print('Lemmatizing OK')
    print('Text preprocessing finished')
    return df


test_sample = pd.read_csv(r'C:\\Users\\Lewin\\Downloads\\OC\\Projet_7\\upload\\test_file.csv')
test_sample = text_preprocessing(test_sample)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lewin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


imports_done
Text preprocessing initiated
string cleanup in progress...
string cleanup OK, now tokenizing
Tokenizing OK - now lemmatizing...
Lemmatizing OK
Text preprocessing finished


In [7]:
test_sample.head()#[1290:1300].cleaned_text.to_numpy()

Unnamed: 0.1,Unnamed: 0,text,sentiment,cleaned_text,tokens,lems
0,194030,In bed. Not wanting to do a 10 hour shift toda...,0,in bed not wanting to do a 10 hour shift toda...,"[in, bed, not, wanting, to, do, a, 10, hour, s...",in bed not wanting to do a 10 hour shift today...
1,427946,@Shontelle_Layne Wish i could go!,0,wish i could go,"[wish, i, could, go]",wish i could go
2,761589,Really needs to start tanning the RIGHT arm ON...,0,really needs to start tanning the right arm on...,"[really, needs, to, start, tanning, the, right...",really need to start tanning the right arm onl...
3,701510,Hey @officialTila we lost you on trending topics,0,hey we lost you on trending topics,"[hey, we, lost, you, on, trending, topics]",hey we lost you on trending topic
4,166150,Venus Williams is having a horrible day at the...,0,venus williams is having a horrible day at the...,"[venus, williams, is, having, a, horrible, day...",venus williams is having a horrible day at the...


In [8]:
len(test_sample)

1600

#### 3-Envoi de requêtes au service : 
On fait d'abord un essai sur la ou les premières lignes du jeu de données : 

In [8]:

document = []
for n in range(1):
    document.append({"id": str(n), "language": "en", "text":test_sample.loc[n,'cleaned_text']})
response = client.analyze_sentiment(document)
successful_responses = [doc for doc in response if not doc.is_error ]
successful_responses



##### Envoi de la requête à l'API, récupération des résultats puis mise en forme pour l'évaluation de la performance prédictive du modèle sur étagère : <br>

Suite à des plantages répétés, un code de gestion des erreurs a été mis en place afin d'en identifier les causes ( caractères exotique et texte absent). 

In [8]:
import numpy as np
import time
from IPython.display import clear_output
successful_responses = []
for n in range(0,len(test_sample),5):   
    clear_output(wait=True)
    print(f'run {n} in progress')
    document = []
    for i in range(n,n+5):
        if len(test_sample.cleaned_text[test_sample.index[i]]) > 1:
            document.append({"id": str(i), "language": "en", "text":test_sample.cleaned_text[test_sample.index[i]]})
        else: 
            print(f'Index {i} has no content and can not be processed') 
    response = client.analyze_sentiment(document)
    successful_responses = successful_responses + [doc for doc in response if not doc.is_error ]
    time.sleep(0.2)
successful_indices = np.unique([int(doc.id) for doc in successful_responses])
if len(successful_indices) < test_sample.shape[0]: 
    full = [i for i in range(1600)]
    list_difference = []
    for item in full:
        if item not in successful_indices:
            list_difference.append(item)
    print(f'retrying indices : {list_difference}')
    for i in list_difference:
        if len(test_sample.cleaned_text[test_sample.index[i]]) > 1:
            document.append({"id": str(i), "language": "en", "text":test_sample.cleaned_text[test_sample.index[i]]})
            successful_responses = successful_responses + [doc for doc in response if not doc.is_error ]
        else: 
            print(f'Index {i} has no content and can not be processed') 
    successful_indices = np.unique([int(doc.id) for doc in successful_responses])
if len(successful_indices) < test_sample.shape[0]: 
    full = [i for i in range(1600)]
    list_difference = []
    for item in full:
        if item not in successful_indices:
            list_difference.append(item)
    print(f'following indices could not be analyzed : {list_difference}')    

run 1595 in progress
retrying indices : [272, 1534]
Index 272 has no content and can not be processed
Index 1534 has no content and can not be processed
following indices could not be analyzed : [272, 1534]


#### 4- Evaluation des réponses : 

Il nous manque certaines lignes dont le nettoyage a supprimé l'intégralité du texte : 

In [12]:
len(successful_responses)

1598

On va donc construire notre vecteur de référence en éliminant les lignes non-analysées : 

In [9]:
y_true = test_sample.loc[[int(doc.id) for doc in successful_responses],'sentiment'].to_numpy()
len(y_true)

1598

Puis le vecteur de prédictions, binarisé en considérant uniquement le score de confiance sur la classe négative : 

In [10]:
indices=[]
y_proba = []
y_pred = []
for doc in successful_responses:
    y_proba.append(doc.confidence_scores.negative)
    indices.append(doc.id)
    y_pred.append(0 if doc.confidence_scores.negative >= 0.5 else 1)

In [12]:
len(y_pred)

1598

In [13]:
import numpy as np
y_pred=np.array(y_pred)

Comparaison des prédictions aux étiquettes du dataset : 

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[449, 369],
       [ 95, 685]], dtype=int64)

In [16]:
pd.DataFrame(confusion_matrix(y_true, y_pred).ravel().reshape(1,-1), columns=["tn", "fp", "fn", "tp"])

Unnamed: 0,tn,fp,fn,tp
0,449,369,95,685


In [17]:
from sklearn.metrics import roc_auc_score
print(f'ROC_AUC du modèle API sur étagère : {roc_auc_score(y_true, y_pred)}')

ROC_AUC du modèle API sur étagère : 0.7135524418531753


In [19]:
from sklearn.metrics import accuracy_score
print(f'Exactitude du modèle API sur étagère : {accuracy_score(y_true, y_pred)}')

ROC_AUC du modèle API sur étagère : 0.7096370463078848
