In [1]:
#imports
import csv
import pandas as pd
import nltk #import the natural language
from nltk.stem.snowball import FrenchStemmer 
from nltk.corpus import stopwords 
import re 
from collections import Counter 

In [5]:
def read_raw_file(path):
    '''reads in raw text from a text file using the argument (path), which represents the path/to/file'''
    f = open(path,"r") #open the file located at "path" as a file object (f) that is readonly
    raw = f.read()#.decode('utf8') # read raw text into a variable (raw) after decoding it from utf8
    f.close() #close the file now that it isn;t being used any longer
    return raw

def get_tokens(raw,encoding='utf8'):
    '''get the nltk tokens from a text'''
    tokens = nltk.word_tokenize(raw) #tokenize the raw UTF-8 text
    return tokens

def get_nltk_text(raw,encoding='utf8'):
    '''create an nltk text using the passed argument (raw) after filtering out the commas'''
    #turn the raw text into an nltk text object
    no_commas = re.sub(r'[.|,|\']',' ', raw) #filter out all the commas, periods, and appostrophes using regex
    tokens = nltk.word_tokenize(no_commas) #generate a list of tokens from the raw text
    text=nltk.Text(tokens,encoding) #create a nltk text from those tokens
    return text

def filter_stopwords(text,stopword_list):
    '''normalizes the words by turning them all lowercase and then filters out the stopwords'''
    words=[w.lower() for w in text] #normalize the words in the text, making them all lowercase
    #filtering stopwords
    filtered_words = [] #declare an empty list to hold our filtered words
    for word in words: #iterate over all words from the text
        if word not in stopword_list and word.isalpha() and len(word) > 1: #only add words that are not in the French stopwords list, are alphabetic, and are more than 1 character
            filtered_words.append(word) #add word to filter_words list if it meets the above conditions
    filtered_words#.sort() #sort filtered_words list
    return filtered_words

def get_stopswords(type="veronis"):
    '''returns the veronis stopwords in unicode, or if any other value is passed,
    it returns the default nltk french stopwords'''
    if type=="veronis":
        #VERONIS STOPWORDS
        raw_stopword_list = ["Ap.", "Apr.", "GHz", "MHz", "USD", "a", "afin", "ah", "ai", "aie", "aient","aies", "ait", "alors", "après", "as", "attendu", "au", "au-delà", "au-devant","aucun", "aucune", "audit", "auprès", "auquel", "aura", "aurai", "auraient",
                             "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autour", "autre", "autres", "autrui", "aux", "auxdites", "auxdits",    "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez",
                             "aviez", "avions", "avons", "ayant", "ayez", "ayons", "b", "bah", "banco", "ben","bien", "bé", "c", "c'", "c'est", "c'était", "car", "ce", "ceci", "cela", "celle",
                             "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci","celui-là", "celà", "cent", "cents", "cependant", "certain", "certaine",
                             "certaines", "certains", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là","cf.", "cg", "cgr", "chacun", "chacune", "chaque", "chez", "ci", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", 
                             "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "cl", "cm", "cm²", "comme", "contre",
                             "d", "d'", "d'après", "d'un", "d'une", "dans", "de", "depuis", "derrière", "des", "desdites", "desdits", "desquelles", "desquels", "deux", "devant", "devers", "dg", "différentes", "différents", "divers", "diverses", "dix", 
                             "dix-huit", "dix-neuf", "dix-sept", "dl", "dm", "donc", "dont", "douze", "du", "dudit", "duquel", "durant", "dès", "déjà", "e", "eh", "elle", "elles",
                             "en", "en-dehors", "encore", "enfin", "entre", "envers", "es", "est", "et", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses",
                             "eussiez", "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "f", "fait", "fi", "flac", "fors", "furent", "fus", "fusse", "fussent", "fusses", "fussiez",                             "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gr", "h", "ha", "han", "hein", "hem", "heu", "hg", "hl", "hm", "hm³", "holà", "hop", "hormis", "hors", "huit", 
                             "hum", "hé", "i", "ici", "il", "ils", "j", "j'", "j'ai", "j'avais", "j'étais",   "jamais", "je", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'à", "jusque", "k","kg", "km", "km²", "l", "l'", "l'autre", "l'on", "l'un", "l'une", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lez", "lors", "lorsqu'", "lorsque", "lui", "lès", "m", "m'", "ma", "maint", "mainte", "maintes", "maints", "mais", "malgré", "me", "mes", "mg", "mgr", "mil", "mille", "milliards", "millions", "ml", "mm", "mm²", "moi", "moins", "mon", "moyennant", "mt", "m²", "m³", "même", "mêmes", "n", "n'avait", "n'y", "ne", "neuf", "ni", "non", "nonante", "nonobstant", "nos", "notre", "nous", "nul", "nulle", "nº", "néanmoins", "o", "octante", "oh", "on", "ont", "onze", "or", "ou", "outre", "où", "p", "par", "par-delà", "parbleu", "parce", "parmi", "pas", "passé", "pendant", "personne", "peu", "plus", "plus_d'un", "plus_d'une", "plusieurs", "pour", "pourquoi", "pourtant", "pourvu", "près", "puisqu'", "puisque", "q", "qu", "qu'", "qu'elle", "qu'elles", "qu'il", "qu'ils", "qu'on", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", "quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "que", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelques", "quelques-unes", "quelques-uns", "quels", "qui", "quiconque", "quinze", "quoi", "quoiqu'", "quoique", "r", "revoici", "revoilà", "rien", "s", "s'", "sa", "sans", "sauf", "se", "seize", "selon", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "si", "sinon", "six", "soi", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "son", "sont", "sous", "soyez", "soyons", "suis", "suite", "sur", "sus", "t", "t'", "ta", "tacatac", "tandis", "te", "tel", "telle", "telles", "tels", "tes", "toi", "ton", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", "trente-six", "trente-trois", "trois", "très", "tu", "u", "un", "une", "unes", "uns", "v", "vers", "via", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-à-vis", "voici", "voilà", "vos", "votre", "vous", "w", "x", "y", "z", "zéro", "à", "ç'", "ça", "ès", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"]
    else:
        #get French stopwords from the nltk kit
        raw_stopword_list = stopwords.words('french') #create a list of all French stopwords
    stopword_list = [word for word in raw_stopword_list] #make to decode the French stopwords as unicode objects rather than ascii
    return stopword_list

def create_csv_file(filter_word, PATH, querry):
    liste  = filtered_word
    final = " "
    i = 0
    lista = []
    with open(PATH +querry+'.csv', mode='w') as employee_file:
        employee_writer = csv.writer(employee_file, delimiter = " ")

        for i in range(0, len(liste)):
            if ((i % 130) == 0):
                employee_writer.writerow(lista)
                lista  =  []
                lista.append(liste[i])
            else :
                lista.append(liste[i])


In [6]:
if __name__=="__main__":
    PATH_corpus_csv  ="/home/nchet/Documents/M2_BIG_DATA/ANNA_PAPPA/Projet_finale/corpus_csv/"
    PATH_Data    ="/home/nchet/Documents/M2_BIG_DATA/ANNA_PAPPA/Projet_finale/Data/"
    PATH_corpus  ="/home/nchet/Documents/M2_BIG_DATA/ANNA_PAPPA/Projet_finale/CORPUS/"
    col    = ['description', 'label']
    for element in os.listdir(PATH_Data):
        if element.endswith('.txt'):
            name  = element.split('.')
            #path ="./Data/voiture.txt"
            raw = read_raw_file(PATH_Data+element)
            tokens = get_tokens(raw)
            #print( "taille of file before feltring is \n", len(tokens) , "this is a sampling " , tokens[0:15])
            text = get_nltk_text(raw)
            stopword_list = get_stopswords()
            #print("taille of file is", len(stopword_list))
            #print(stopword_list)
            filtered_word = filter_stopwords(text,stopword_list)
            #print(filtered_word)
            create_csv_file(filtered_word, PATH_corpus_csv, name[0])
            #print("taille of file after filtring is", len(filtered_word)
    for element in os.listdir(PATH_corpus_csv):
        if element.endswith('.csv'):
            print(element)
            name  = element.split('.')
            df  = pd.read_csv(PATH_corpus_csv+element, sep=';', header= None)
            df['description'] = df[:]
            df['label'] = name[0]
            
            df.rename(columns={"0": 'description', 'label': 'label'}, inplace=True)
            
            df1 = pd.DataFrame({'description': df['description'],
                              'label' : df['label']})
            df1.to_csv(PATH_corpus+"Dataset.csv", mode = 'a',columns =col, header=True, sep=";", index=False)
            
            #df.to_csv(PATH_corpus+name[0]+".csv", columns =col, header=True, sep=";", index=False)
        else :
            pass
    


informatique.csv
science.csv
technologie.csv
hotel.csv


In [37]:

def stem_words(words):
    '''stems the word list using the French Stemmer'''
    #stemming words
    stemmed_words = [] #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer() #create a stemmer object in the FrenchStemmer class
    for word in words:
        stemmed_word=stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) #add it to our stemmed word list
    stemmed_words.sort() #sort the stemmed_words
    return stemmed_words

if __name__ == "__main__":
    hello = ['maison', 'montagne', 'il', 'voir', 'manger', 'savoir', 'cours',  'cour', 'téléphone', 'prochaine']
    stemmed_words = stem_words(hello)
    print("talle is ", len(stemmed_words), stemmed_words)

talle is  10 ['cour', 'cour', 'il', 'maison', 'mang', 'montagn', 'prochain', 'savoir', 'téléphon', 'voir']


In [8]:
mon_dictionnaire = {}
mon_dictionnaire["pseudo"] = ["Prolixe", "her"]
mon_dictionnaire["mot de passe"] = ["dfdProlixe", "herfdf"]
mon_dictionnaire
#{'mot de passe': '*', 'pseudo': 'Prolixe'}


{'pseudo': ['Prolixe', 'her'], 'mot de passe': ['dfdProlixe', 'herfdf']}

In [24]:
numbers = [1,2,3,4,5,1,4,5] 
  
# start parameter is not provided 
Sum = sum(numbers) 
print(Sum) 

25


In [25]:
dictionary = {"raj": 2, "striver": 3, "vikram": 4} 
print(dictionary.values())   

dict_values([2, 3, 4])


In [26]:
sum(dictionary.values())

9

In [31]:
import nltk
sentence = """ aujourd'hui il fait super beau et ce soir aussi . """
tokens = nltk.word_tokenize(sentence)
tokens

["aujourd'hui",
 'il',
 'fait',
 'super',
 'beau',
 'et',
 'ce',
 'soir',
 'aussi',
 '.']

In [32]:
tagged = nltk.pos_tag(tokens)
tagged[0:6]

[("aujourd'hui", 'NN'),
 ('il', 'NN'),
 ('fait', 'VBP'),
 ('super', 'JJR'),
 ('beau', 'NN'),
 ('et', 'FW')]