In [1]:
# Importation des librairie necessaire

import re
import pandas as pd
import csv
import os
import copy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *



nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:


def ouvrirFichier(non_fichier):
    '''
    Objectif : Ouvre un fichier dont le chemin est passé en paramètre et renvoie son contenu
    
    Entrée :
    - chemin du fichier
    
    Sortie :
    - le contenue du fichier
    '''
    
    fichier = open(non_fichier, "r")
    contenu = fichier.read()
    return contenu

In [3]:

def concatDataset(path_dossier,nomFichier, extension = ".txt"):
    '''
    Objectif : Trouve tout les fichier txt d'un dossier donnéselon l'extension passer en paramètre et les 
    concatènent dans un seul et meme fichier 
    
    Entrée :
    - chemin du dossier
    - nom du fichier de sortie
    - extension du fichier de sortie [default = ".txt"]
    '''
    
    listeFichierTrouve = [path_dossier + "/" + file for file in os.listdir(path_dossier) if file.endswith(extension)]
    
    fileComplet = open(nomFichier + extension,'a')
    contenu= ""

    for element in listeFichierTrouve :
        fichier = open(element,"r")
        contenu += fichier.read()
        fichier.close()

    fileComplet.write(contenu)
  


In [4]:
def transformeFileData (contenu, nomDataset) :
    '''
    Objectif : Compose un dataset au format csv a partir d'un fichier txt non nettoyé
    
    Entrée :
    - contenu du fichier txt
    - nom du fichier csv de sortie
    
    Sortie :
    - le dataFrame enregistré dans le fichier csv
    '''
    
    element = contenu.split("</pair>")

    list_label =[]
    list_attribut =[]

    for ligne in (element):
        liste = (re.findall("entailment=\".*\"",ligne))
        tex = (re.findall("<t>\n.*\n</t>",ligne))

        for r in tex:
            phrase= re.sub("<.*?>","",r)
            phrase= phrase.replace("\n","")
            list_attribut.append(phrase)
            #print(phrase)

        for mot in liste:
            label=(re.findall("\".*?\"" , mot)[0])
            label=label.replace('"','')
            list_label.append(label)
            #print(label)


    data_frame = pd.DataFrame()

    for label, text in zip(list_attribut,list_label):
        line = pd.DataFrame([[label, text]])
        data_frame = pd.concat([data_frame, line])

    data_frame.columns=["Text", "Label"]
    data_frame = data_frame.reset_index(drop=True)

    data_frame.to_csv(nomDataset + ".csv")
    
    return data_frame



In [10]:
def lemmatisation(listeMot):
    
    
    lemmatizer = WordNetLemmatizer()
        
    for element in listeMot :
        element = lemmatizer.lemmatize(element)

    return listeMot

In [6]:
def racinisation(listeMot):
    stemmer = PorterStemmer()

    return [stemmer.stem(element) for element in listeMot]

In [7]:
       
def tokenization(dataframe, colFeature = ["Text"]) :
    '''
    Objectif : Tokenize les features textuel dun dataFrame
    
    Entrée :
    - le dataFrame dorigine
    - la liste des colonnes à tokenizer [default = ["Text"]]
    
    Sortie :
    - le contenue du fichier
    '''
    dataCopie = copy.deepcopy(dataframe)
    
    for col in colFeature:
        for indexLigne in range (0,len(dataCopie[col])):

            # remplace le texte par une liste de tokens
            featureTokenization = nltk.word_tokenize(dataCopie[col][indexLigne])

            # nettoyage de la ponctuation
            listePonctuation =  [",","'",".","?","!","''","``",")","]","(","["]
            featureTokenization_clean = [element for element in featureTokenization if element not in listePonctuation]

            # lemmatisation
            featureLemmatisation = lemmatisation(featureTokenization_clean)

            # racinisation
            featureRacine = racinisation(featureLemmatisation)

            dataCopie.set_value(indexLigne,col,featureRacine)
                
                
    return dataCopie




In [8]:
# test Part of speech

# ouverture du dataset complet
# nettoyage des balise html
# tokenisation
# tagging
# count frequency

import nltk
import re
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

text = "he is a boy"
file = open("dataset_complet.txt")
contenu = file.read()
contenu = re.sub("<.*?>","",contenu)

text = nltk.word_tokenize(contenu)
tagged_text=nltk.pos_tag(text)


tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_text)
most = tag_fd.most_common()
print(most)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'dataset_complet.txt'

In [None]:

'''from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
nltk.download('words')



def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk


my_sent = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
get_continuous_chunks(my_sent)
['WASHINGTON', 'New York', 'Loretta E. Lynch', 'Brooklyn']

'''

In [9]:
# Execution

if __name__ == "__main__":
    nomFichier = "dataset_complet"
    extension = ".csv"
    path = "data"


    if not os.path.isfile(nomFichier + extension):
        concatDataset(path,nomFichier)
        contenu = ouvrirFichier(nomFichier + ".txt")
        dataFrame = transformeFileData(contenu,nomFichier)

    else :
        dataFrame = pd.read_csv(nomFichier + extension) 

    dataFrameTokenise = tokenization(dataFrame)
    print(dataFrameTokenise)



                                                   Text    Label
0     [crude, oil, for, april, deliveri, trade, at, ...       NO
1     [oracl, had, fought, to, keep, the, form, from...       NO
2     [all, genet, modifi, food, includ, soya, or, m...      YES
3     [research, at, the, harvard, school, of, publi...       NO
4     [eat, lot, of, food, that, are, a, good, sourc...      YES
5     [the, yanke, split, hollywood, with, someth, t...      YES
6     [scientist, at, the, genom, institut, of, sing...  UNKNOWN
7     [phish, disband, after, a, final, concert, in,...  UNKNOWN
8     [euro-scandinavian, media, cheer, denmark, v, ...      YES
9     [iraqi, milit, said, sunday, they, would, behe...      YES
10    [two, turkish, engin, and, an, afghan, transla...      YES
11    [If, a, mexican, approach, the, border, he, 's...  UNKNOWN
12    [iran, will, soon, releas, eight, british, ser...  UNKNOWN
13    [the, wait, time, for, a, green, card, ha, ris...      YES
14    [coal, compani, sto