In [40]:
import os
from os import walk
import glob
import numpy as np
import pandas as pd
import string
import re
import langdetect
import json
from sklearn.utils import shuffle
import warnings
from random import randrange
import pymongo
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import RegexpStemmer
import spacy
from spacy.lang.fr.examples import sentences 
from spacy import displacy
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings raised by the
# cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# French spaCy pipeline; the functions below read its `.ents` and `.lemma_`.
nlp = spacy.load("fr_core_news_sm")
# Directory holding the raw text files passed to load_initial_dataset.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing it.
DATASET_PATH = '/Users/jonathankhalifa/Desktop/T-AIA-901/BOOTSTRAP/discours/tous'


<br><br>

## Dataset creation and preprocessing funcs

In [42]:

def load_initial_dataset(path):
    """
    Load every file of a directory as a dataframe of sentence-like fragments.

    IN : path to raw text dataset on hard drive (a directory of text files)
    OUT : dataframe with a single 'text_input' column, one row per fragment

    Each file located directly inside `path` (sub-directories are not
    visited) is read, its newlines are replaced by spaces and its content is
    split on '.'. Files are processed in sorted name order so the row order
    is reproducible from one run to the next.
    """
    # next(walk(...)) yields (dirpath, dirnames, filenames) for the top-level
    # directory only; the default covers a directory that cannot be walked.
    filenames = next(walk(path), (None, None, []))[2]

    fragments = []
    for filename in sorted(filenames):
        # `with` guarantees the handle is closed even if read() raises.
        with open(os.path.join(path, filename), 'r') as handle:
            file_contents = handle.read().replace("\n", " ")
        fragments.extend(file_contents.split('.'))

    # Naming the column up front keeps 'text_input' present even when no file
    # was found, so downstream code can always rely on the column existing.
    return pd.DataFrame({'text_input': fragments})




def inject_good_words(phrase):
    """
    IN : 1 text row from initial dataset
    OUT : text with a random ham/good word injected
    USE : inject a random ham related word at a random position in the text

    The phrase is split on whitespace, one word drawn from the ham list is
    inserted at a random slot (before any word, or at the very end) and the
    words are joined back with single spaces. An empty phrase simply returns
    the injected word.
    """
    good_words = ['billet','train','aller', 'retour', 'vouloir', 'souhaiter', 'acheter','billet', 'départ','retour']

    split_strings = phrase.split()

    # A list of n words has n + 1 insertion slots. randrange(n + 1) covers
    # all of them and stays valid for empty / one-word phrases, where the
    # previous randrange(n - 1) raised ValueError.
    inj_pos = randrange(len(split_strings) + 1)

    # randrange(len(...)) makes every entry selectable, including the last.
    rand_word = good_words[randrange(len(good_words))]

    split_strings.insert(inj_pos, rand_word)
    return ' '.join(split_strings)




def inject_spam_words(phrase):
    """
    IN : 1 text row from initial dataset
    OUT : text with a random spam word injected
    USE : inject a random spam related word at a random position in the text

    The phrase is split on whitespace, one word drawn from the spam list is
    inserted at a random slot (before any word, or at the very end) and the
    words are joined back with single spaces. An empty phrase simply returns
    the injected word.
    """
    spam_words = ['avion', 'vol', 'aeroport', 'port', 'routière', 'autoroute', 'bus', 'autocar',
                  'autobus', 'remboursement', 'rembourser', 'bateau', 'voiture', 'pied', 'marcher', 'concert',
                  'dinner', 'spectacle']

    split_strings = phrase.split()

    # A list of n words has n + 1 insertion slots. randrange(n + 1) covers
    # all of them and stays valid for empty / one-word phrases, where the
    # previous randrange(n - 1) raised ValueError.
    inj_pos = randrange(len(split_strings) + 1)

    # randrange(len(...)) makes every entry selectable; with len(...) - 1 the
    # last word of the list ('spectacle') could never be injected.
    rand_word = spam_words[randrange(len(spam_words))]

    split_strings.insert(inj_pos, rand_word)
    return ' '.join(split_strings)




def train_test_val_split(dfnorm, dfspam, coef_ham, coef_spam, random_state=None):
    """
    IN : ham dataset, spam dataset, share of ham / spam wanted in each split,
         optional random_state forwarded to the shuffles
    OUT : [train dataset, test dataset, validation dataset]
    USE : Splits the data into 10 validation rows, a train split such that
          train + validation is ~70% of all rows, and a test split holding
          everything that is left. coef_ham / coef_spam set the ratio of
          ham / spam in the validation and train splits, ex: 0.8 for 80%.

    Each class frame is cut into three consecutive, non-overlapping slices
    (validation, train, test), so every input row ends up in exactly one
    split. The splits are shuffled and re-indexed from 0.
    """
    l_validation = 10
    l_train = round((len(dfnorm) + len(dfspam)) * 0.7) - l_validation

    def _cut(frame, coef):
        # Consecutive boundaries: validation, then train, then the remainder.
        # Train ends exactly where test starts; the original ended it
        # int(l_validation * coef) rows earlier and silently lost those rows.
        val_end = int(l_validation * coef)
        train_end = val_end + int(l_train * coef)
        return frame.iloc[:val_end], frame.iloc[val_end:train_end], frame.iloc[train_end:]

    def _mix(parts):
        # Concatenate ham and spam, shuffle the rows, and renumber the index.
        return (
            pd.concat(parts)
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )

    val1, train1, test1 = _cut(dfnorm, coef_ham)
    val2, train2, test2 = _cut(dfspam, coef_spam)

    df_train = _mix([train1, train2])
    df_test = _mix([test1, test2])
    df_val = _mix([val1, val2])

    return [df_train, df_test, df_val]


    

def remove_cities_s(message):
    """
    IN: string that needs to be processed
    OUT: processed string
    USE: blanks out the text of every entity the spaCy pipeline `nlp`
         detects in the string (meant to strip city/country names)
    """
    cleaned = message
    # Entities are detected once, on the untouched input; every occurrence
    # of each entity's text is then removed from the working copy.
    for entity in nlp(message).ents:
        cleaned = cleaned.replace(str(entity), "")
    return cleaned




# Disabled draft of a DB-backed variant, kept only as a string literal (it is
# never executed). As written it would not run: it returns the misspelled
# name `messsage`, and the nested checkCity assigns to `message` without
# declaring it nonlocal.
"""
def remove_cities(message, cityList):
    """"""
    IN: string that needs to be processed
    OUT: processed string
    USE: removes cities that are listed in the DB from the string
    """"""
    doc = nlp(message) #lower
    
    def saveAllCitiesInArray():
        cities = []
        for city in doc.ents:
            cities.append(city.text)
        return cities
    cityArr = saveAllCitiesInArray()
    
    def checkCity(city):
        city = city.lower()
        city = city.replace("-", " ")
        city = city.replace("saint", "st")

        for index, row in cityList.iterrows():
            processedStopName = row['stop_name'].replace("-", " ").lower()
            if (city in processedStopName):
                message = message.replace(city, "")
                break
        return message
    
    for c in cityArr:
        message = checkCity(c)
        
    return (messsage)
"""

'\ndef remove_cities(message, cityList):\n    \n    IN: string that needs to be processed\n    OUT: processed string\n    USE: removes cities that are listed in the DB from the string\n    \n    doc = nlp(message) #lower\n    \n    def saveAllCitiesInArray():\n        cities = []\n        for city in doc.ents:\n            cities.append(city.text)\n        return cities\n    cityArr = saveAllCitiesInArray()\n    \n    def checkCity(city):\n        city = city.lower()\n        city = city.replace("-", " ")\n        city = city.replace("saint", "st")\n\n        for index, row in cityList.iterrows():\n            processedStopName = row[\'stop_name\'].replace("-", " ").lower()\n            if (city in processedStopName):\n                message = message.replace(city, "")\n                break\n        return message\n    \n    for c in cityArr:\n        message = checkCity(c)\n        \n    return (messsage)\n'

## The function that generates our dataset does the following:
    
    Remove all texts under 100 chars long
    Split the initial dataset in half, 50% will become spam and 50% ham
    We remove any city name from our texts. we do not want to vectorize cities since we will later do a city check
    We label our dataset
    We have 50% spam and 50% ham
    We add good words to our ham texts and bad words to our spam texts
    We overload each spam text with bad words in order to make the bad words weigh more as spam than the good words weigh as ham.
    

In [43]:

def generate_labeled_dataset(df):
    """
    IN : initial text dataset (dataframe with a 'text_input' column)
    OUT : 3 dataframes (train 70%, test 30%, val 10rows)
    USE : Used to create the final dataset that will be used for the spam
          filter training/testing. Dataset is splitted, labelled norm or spam
          and injected with words according to label.
          Coef_ham/spam to set the ratio of ham/spam ex: 0.8 for 80%

    The input dataframe is left untouched: all filtering, cleaning and
    injection happens on copies.
    """
    coef_ham = 0.5
    coef_spam = 0.5
    min_text_length = 100   # texts shorter than this many chars are dropped
    n_ham_injections = 2    # good words injected into each ham text
    n_spam_injections = 12  # spam texts are overloaded so spam words outweigh ham words
    split_margin = 5        # rows taken on each side of the ham/spam cut point

    # Keep only texts of at least min_text_length chars. A boolean mask does
    # this in one pass (dropping row by row rebuilt the frame each time), and
    # .copy() ensures the caller's frame is never written to.
    long_enough = df['text_input'].str.len() >= min_text_length
    df = df.loc[long_enough].copy()

    # Strip detected entities so that city names are not learned as features.
    df['text_input'] = df['text_input'].apply(remove_cities_s)

    # Split in two around the cut point `a`.
    # NOTE(review): because of the +/- split_margin, the 2 * split_margin rows
    # around `a` land in BOTH the ham and the spam frame — kept as in the
    # original; confirm whether this overlap is intended.
    a = round(len(df) * coef_ham)
    dfnorm = df.iloc[:a + split_margin].copy()
    dfspam = df.iloc[a - split_margin:].copy()

    # Inject good words into ham texts.
    for _ in range(n_ham_injections):
        dfnorm['text_input'] = dfnorm['text_input'].apply(inject_good_words)
    dfnorm['text_label'] = "ham"

    # Inject bad words into spam texts.
    for _ in range(n_spam_injections):
        dfspam['text_input'] = dfspam['text_input'].apply(inject_spam_words)
    dfspam['text_label'] = "spam"

    return train_test_val_split(dfnorm, dfspam, coef_ham, coef_spam)




<br/><br/>
## As a result of our dataset generation, we obtain a fully labelled and split dataset

In [44]:
# Read the raw corpus found under DATASET_PATH into a dataframe.
df = load_initial_dataset(DATASET_PATH)

# Build the labelled train / test / validation dataframes from it.
df_train, df_test, df_validate = generate_labeled_dataset(df)

<br><br>

## Funcs for preprocessing user input

#### In this phase, we will do the following to each text :

    set to lowercase
    remove punctuation
    remove special chars
    remove stop words
    remove extra spaces


In [45]:
## preprocess for before vectorizing/training

def preprocess_string(string):
    """
    IN : user input
    OUT : cleaned user input
    USE : will set all to lowercase, remove punctuation and stopwords,
          remove trailing and double spaces

    Note: the parameter name shadows the imported `string` module inside
    this function; it is kept so existing keyword callers keep working.
    """
    # set all to lowercase
    string = string.lower()
    # remove punct: str.replace() treats its argument as literal text, so the
    # pattern has to go through re.sub() to be applied as a regex
    string = re.sub(r'[^\w\s]', ' ', string)
    # remove stop words; the list is loaded once (not once per word) and
    # turned into a set for constant-time membership tests
    stop = set(stopwords.words('french'))
    # split() with no argument drops leading/trailing whitespace and collapses
    # any run of whitespace, so the join yields single-spaced, stripped text
    return ' '.join(word for word in string.split() if word not in stop)

    
def preprocess_df(df):
    """
    IN : df of user inputs (needs a 'text_input' column of strings)
    OUT : cleaned df of user inputs
    USE : will set all to lowercase, remove punctuation and stopwords,
          remove trailing and double spaces

    The cleaning is applied to a copy: the dataframe passed in is not
    modified, so the calling cell can be re-run safely.
    """
    stop = set(stopwords.words('french'))

    cleaned = df.copy()
    text = cleaned['text_input'].str.lower()
    # regex=True is explicit because the default of Series.str.replace is not
    # the same across pandas versions; without it the pattern may be searched
    # as literal text and punctuation would be left in place.
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)
    # split() + join drops stop words and leaves single-spaced, stripped text
    cleaned['text_input'] = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop)
    )
    return cleaned





<br><br>
## We define our training model and its related functions

In [46]:
def extract_vocab(df_train):
    """
    IN : train dataset
    OUT : list of each unique word in the dataset (no particular order)

    Side effect: the 'text_input' column of df_train is replaced in place by
    its whitespace-split form (one list of words per row).
    """
    df_train['text_input'] = df_train['text_input'].str.split()

    unique_words = set()
    for tokens in df_train['text_input']:
        unique_words.update(tokens)

    return list(unique_words)





def word_frequency(vocabulary, df_train):
    """
    IN : vocab list, train dataset whose 'text_input' rows are lists of words
         (the form extract_vocab leaves the column in)
    OUT : train dataset with one extra column per vocabulary word holding the
          number of times that word occurs in each text

    Every word met in the texts must be present in `vocabulary`, otherwise a
    KeyError is raised.
    """
    n_texts = len(df_train['text_input'])
    word_counts_per_text = {unique_word: [0] * n_texts for unique_word in vocabulary}

    # enumerate() gives the row position, which is what the count lists are
    # indexed by, whatever the dataframe's index labels are.
    for position, text in enumerate(df_train['text_input']):
        for word in text:
            word_counts_per_text[word][position] += 1

    # Reuse df_train's index so the column-wise concat pairs each count row
    # with its text; a fresh 0..n-1 index would misalign (and add NaN rows)
    # whenever df_train does not carry the default index.
    word_counts = pd.DataFrame(word_counts_per_text, index=df_train.index)
    return pd.concat([df_train, word_counts], axis=1)






def calc_constants(training_set_clean, vocabulary):
    """
    IN : train word frequency dataset , vocab list
    OUT : list of model constants, in this order:
          [spam rows, ham rows, P(spam), P(ham), words per spam text,
           total spam words, words per ham text, total ham words,
           vocabulary size, Laplace smoothing alpha]
    """
    labels = training_set_clean['text_label']
    n_messages = len(training_set_clean)

    # Rows of each class
    spam_messages = training_set_clean[labels == 'spam']
    ham_messages = training_set_clean[labels == 'ham']

    # Class priors
    p_spam = len(spam_messages) / n_messages
    p_ham = len(ham_messages) / n_messages

    # Length of each text, then the per-class totals
    n_words_per_spam_message = spam_messages['text_input'].apply(len)
    n_words_per_ham_message = ham_messages['text_input'].apply(len)
    n_spam = n_words_per_spam_message.sum()
    n_ham = n_words_per_ham_message.sum()

    n_vocabulary = len(vocabulary)

    # Laplace smoothing
    alpha = 1

    return [
        spam_messages,
        ham_messages,
        p_spam,
        p_ham,
        n_words_per_spam_message,
        n_spam,
        n_words_per_ham_message,
        n_ham,
        n_vocabulary,
        alpha,
    ]








def train_model(df_train):
    """
    IN : train dataset
    OUT : list of model params:
          [p_word_given_ham, p_word_given_spam, parameters_spam,
           parameters_ham, p_spam, p_ham]
    USE : Func used to fully train the model and retrieve the weights/params
          To be runned once.

    parameters_spam / parameters_ham map every vocabulary word to its
    Laplace-smoothed probability given the class; p_spam / p_ham are the
    class priors. The first two returned values are only the probabilities
    of the LAST vocabulary word (what the original loop left behind); they
    are kept for interface compatibility and are None when the vocabulary is
    empty.
    """
    # we preprocess/clean the dataset
    df_train = preprocess_df(df_train)

    # we extract df_train s vocabulary
    vocabulary = extract_vocab(df_train)

    # we count the freq of each word from vocabulary in df_train
    training_set_clean = word_frequency(vocabulary, df_train)

    # we calculate our constants
    (spam_messages, ham_messages, p_spam, p_ham,
     _n_words_per_spam_message, n_spam,
     _n_words_per_ham_message, n_ham,
     n_vocabulary, alpha) = calc_constants(training_set_clean, vocabulary)

    # The smoothing denominators do not depend on the word: compute them once.
    spam_denominator = n_spam + alpha * n_vocabulary
    ham_denominator = n_ham + alpha * n_vocabulary

    parameters_spam = {}
    parameters_ham = {}
    # Defined up front so the return below is valid for an empty vocabulary.
    p_word_given_spam = None
    p_word_given_ham = None

    for word in vocabulary:
        # occurrences of the word across all spam texts, Laplace-smoothed
        p_word_given_spam = (spam_messages[word].sum() + alpha) / spam_denominator
        parameters_spam[word] = p_word_given_spam

        # occurrences of the word across all ham texts, Laplace-smoothed
        p_word_given_ham = (ham_messages[word].sum() + alpha) / ham_denominator
        parameters_ham[word] = p_word_given_ham

    return [p_word_given_ham, p_word_given_spam, parameters_spam, parameters_ham, p_spam, p_ham]
    

<br/><br/>
## As a result of training, we get the parameters of our model (the weights)

In [47]:

# Train on df_train and unpack the six values returned by train_model.
p_word_given_ham, p_word_given_spam, parameters_spam, parameters_ham, p_spam, p_ham = train_model(df_train)

<br/><br/>
## We insert the following into our cloud DB:
- The model's params
- The city list corresponding to our stop_names

In [39]:
# LAUNCH ONLY ONCE! (DONE)
# The whole cell below is wrapped in a string literal, so none of it runs; it
# is kept as a record of the one-off upload of the model parameters and of
# the city list to MongoDB.
# NOTE(review): the connection string embeds a username and password in clear
# text — rotate these credentials and load the URI from the environment.
"""
# inssert items in mongo atlas DB
client = pymongo.MongoClient("mongodb+srv://admin:admin@clusteria.tvj6u.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
db = client['iadb']
dataset = db['spamfilterParams']
dataset2 = db['cities']
collection3 = db['stopNames']

# from stop_names make a list of cities and save to DB
cursor = collection3.find({})   
fields = ['stop_name']
cityList = pd.DataFrame(list(cursor), columns = fields)
cityList['stop_name']= cityList['stop_name'].str.replace("Gare de ","")


# to load data into mongodb
a = {'_id': 'p_word_given_ham', 'data': p_word_given_ham}   
b = {'_id': 'p_word_given_spam', 'data': p_word_given_spam}
c = {'_id': 'parameters_spam', 'data': parameters_spam}
d = {'_id': 'parameters_ham', 'data': parameters_ham}
e = {'_id': 'p_ham', 'data': p_ham}
f = {'_id': 'p_spam', 'data': p_spam}


# to load city data into mongodb
cityList.index = cityList.index.map(str)
g=[]
for index, row in cityList.iterrows():
    g.append(row.to_dict())  
x = dataset2.insert_many(g)

x = dataset.insert_many([a,b,c,d,e,f])
"""

'\n# inssert items in mongo atlas DB\nclient = pymongo.MongoClient("mongodb+srv://admin:admin@clusteria.tvj6u.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")\ndb = client[\'iadb\']\ndataset = db[\'spamfilterParams\']\ndataset2 = db[\'cities\']\ncollection3 = db[\'stopNames\']\n\n# from stop_names make a list of cities and save to DB\ncursor = collection3.find({})   \nfields = [\'stop_name\']\ncityList = pd.DataFrame(list(cursor), columns = fields)\ncityList[\'stop_name\']= cityList[\'stop_name\'].str.replace("Gare de ","")\n\n\n# to load data into mongodb\na = {\'_id\': \'p_word_given_ham\', \'data\': p_word_given_ham}   \nb = {\'_id\': \'p_word_given_spam\', \'data\': p_word_given_spam}\nc = {\'_id\': \'parameters_spam\', \'data\': parameters_spam}\nd = {\'_id\': \'parameters_ham\', \'data\': parameters_ham}\ne = {\'_id\': \'p_ham\', \'data\': p_ham}\nf = {\'_id\': \'p_spam\', \'data\': p_spam}\n\n\n# to load city data into mongodb\ncityList.index = cityList.index.map(str)\n

<br/><br/>
## We will now run our spam filter on the test dataset in order to calculate its accuracy

In [48]:



# Connect to the MongoDB cluster that stores the trained parameters.
# NOTE(review): the URI embeds a username and password in clear text — rotate
# these credentials and read the URI from an environment variable instead.
client = pymongo.MongoClient("mongodb+srv://admin:admin@clusteria.tvj6u.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
db = client['iadb']
collection = db['spamfilterParams']


# Each find_one returns the stored document (or None when the _id is absent).
# From here on these names hold documents, not the raw values produced by
# train_model: calc_accuracy reads the payload through the 'data' key.
p_word_given_ham = collection.find_one({'_id': "p_word_given_ham" })
p_word_given_spam = collection.find_one({'_id': "p_word_given_spam" })
parameters_spam = collection.find_one({'_id': "parameters_spam" })
parameters_ham = collection.find_one({'_id': "parameters_ham" })
p_ham = collection.find_one({'_id': "p_ham" })
p_spam = collection.find_one({'_id': "p_spam" })


# Load the city list; only the 'stop_name' column is kept. `cityList` is read
# as a global by classify() and passed to inj_cities_test_df() below.
collection = db['cities']
cursor = collection.find({})
fields = ['stop_name']
cityList = pd.DataFrame(list(cursor), columns = fields)





def classify(message, p_word_given_ham, p_word_given_spam, parameters_spam, parameters_ham, p_spam, p_ham):
    """
    IN : user input, model params
    OUT : model s prediction ('ham' / 'spam')
    USE : func that predicts whether a user input is spam or ham by
          applying our model params to a Naive Bayes model
          also checks for FR lang, and number of cities in the input
          Used for predicting user input.

    Decision order:
      1. the detected language is not 'fr' and the cleaned text has more
         than 3 tokens                                    -> 'spam'
      2. fewer than 2 detected entities match a stop name -> 'spam'
      3. otherwise the class with the higher Naive Bayes score wins,
         ties going to 'ham'.

    parameters_spam / parameters_ham map a word to P(word | class); p_spam /
    p_ham are the class priors. p_word_given_ham and p_word_given_spam are
    accepted for interface compatibility but are not used.

    Relies on module-level names: `nlp` (spaCy pipeline), `cityList`
    (dataframe with a 'stop_name' column), `langdetect`, and the
    module-level `preprocess_string`.
    """

    def detect_lang(text):
        """
        IN: string
        OUT: string
        USE: returns the lang code (ex: 'fr') from the best predicted language
        """
        result = langdetect.detect_langs(text)
        return str(result[0])[:2]

    def check_two_cities(text, city_frame):
        """
        IN: string, dataframe of cities extracted from our stop_names
        OUT: int
        USE: returns number of entities from the string that are contained
             in at least one stop_name
        """
        entities = [entity.text for entity in nlp(text).ents]
        if not entities:
            return 0

        # Normalise the stop names once, instead of once per entity.
        stop_names = [
            name.replace("-", " ").lower()
            for name in city_frame['stop_name'].dropna()
        ]

        confirmed = 0
        for city in entities:
            city = city.lower().replace("-", " ").replace("saint", "st")
            if any(city in stop_name for stop_name in stop_names):
                confirmed += 1
        return confirmed

    # Both checks run on the raw message, before any cleaning.
    nb_of_cities = check_two_cities(message, cityList)
    lang = detect_lang(message)

    # Drop commas, hyphens and " /", then turn any remaining non-word
    # character into a space. (The original also replaced ' -', which could
    # never match once every '-' had been removed.)
    for noise in (',', '-', ' /'):
        message = message.replace(noise, '')
    message = re.sub(r'\W', ' ', message)
    message = preprocess_string(message)

    doc = nlp(message)

    if lang != 'fr' and len(doc) > 3:
        return 'spam'
    if nb_of_cities < 2:
        return 'spam'

    lemmas = ' '.join(token.lemma_ for token in doc).lower().split()

    # Naive Bayes in log space: multiplying dozens of tiny probabilities
    # underflows to 0.0 on long texts, which made both scores equal and
    # silently defaulted to 'ham'. Summing logs keeps the same ordering
    # without the underflow. A zero prior gives -inf, hence the errstate.
    with np.errstate(divide='ignore'):
        log_spam = np.log(p_spam)
        log_ham = np.log(p_ham)
        for word in lemmas:
            if word in parameters_spam:
                log_spam += np.log(parameters_spam[word])
            if word in parameters_ham:
                log_ham += np.log(parameters_ham[word])

    # Equal scores are classified as 'ham', as before.
    return 'spam' if log_spam > log_ham else 'ham'



In [68]:


def inj_cities_test_df(cityList, df_test):
    """
    IN : dataframe of cities (needs a 'stop_name' column), test dataset
    OUT : the same test dataset, with two stop names injected at random
          positions of every 'text_input'
    USE : classify() labels as spam any text with fewer than two recognised
          cities, so cities are added to the test texts before scoring.

    df_test is modified in place and also returned.
    """
    # NOTE(review): the last 5 rows of cityList are never drawn (and fewer
    # than 6 cities raises ValueError) — kept from the original; confirm the
    # reason for the "- 5".
    n_candidates = len(cityList) - 5

    def _with_two_cities(text):
        city1 = cityList.iloc[randrange(n_candidates)]['stop_name']
        city2 = cityList.iloc[randrange(n_candidates)]['stop_name']

        words = text.split()
        # n words offer n + 1 insertion slots; this also keeps randrange
        # valid for empty / one-word texts.
        n_slots = len(words) + 1
        pos1 = randrange(n_slots)
        pos2 = randrange(n_slots)

        words.insert(pos1, city1)
        words.insert(pos2, city2)
        return ' '.join(words)

    # Assign the whole column at once: writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the dataframe.
    df_test['text_input'] = df_test['text_input'].apply(_with_two_cities)
    return df_test
        
        
# Add two stop names to every test text; `ddf` is the frame scored by calc_accuracy below.
ddf = inj_cities_test_df(cityList, df_test)


Unnamed: 0,text_input,text_label
0,Je pied veux prendre un petit temps pour vol e...,spam
1,Si demain on conteste que chez vous ce soit ch...,ham
2,"1139 Je vous annonce donc ce PEI, ces 400 mill...",ham
3,"Un billet dernier mot avant de conclure, pas p...",ham
4,Boncourt Quand on ne peut pas faire retour viv...,ham
...,...,...
3656,"marcher , femme de bus , aeroport autoroute a ...",spam
3657,Beauté de départ ces côtes aux reliefs changea...,ham
3658,"bien, deux sur trois disent qu'ils sont prêts ...",ham
3659,routière de 2020 Cosne-sur-Loire sera une de a...,spam


In [73]:
def calc_accuracy(ddf):
    """
    IN : labelled dataset ('text_input' and 'text_label' columns)
    OUT : accuracy of classify() on that dataset, as a percentage
    USE : classifies every row with the parameter documents loaded at module
          level (read through their 'data' key), prints the number of good
          and bad predictions and returns the share of good ones.
    """
    outcomes = []
    for _, sample in ddf.iterrows():
        predicted = classify(
            sample['text_input'],
            p_word_given_ham['data'],
            p_word_given_spam['data'],
            parameters_spam['data'],
            parameters_ham['data'],
            p_spam['data'],
            p_ham['data'],
        )
        outcomes.append(predicted == sample['text_label'])

    n_correct = sum(outcomes)
    n_wrong = len(outcomes) - n_correct

    print('Good predictions :', n_correct)
    print('Bad predictions :', n_wrong)
    return n_correct / len(ddf) * 100

# Score the classifier on the city-injected test set and print the percentage.
acc = calc_accuracy(ddf)
print('Accuracy (%) :' ,acc)



Good predictions : 3352
Bad predictions : 309
Accuracy (%) : 91.55968314668124


Using a sample of 1000 never-seen texts, we obtain an accuracy of approximately 92%.