In [1]:
#imports
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

### 1/ The end result of this exercise should be a file named ```prepare.py``` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

Define a function named ```basic_clean```. It should take in a string and apply some basic text cleaning to it:  
        Lowercase everything  
        Normalize unicode characters  
        Replace anything that is not a letter, number, whitespace or a single quote.  


In [2]:
# le texte (un poème)

mots = 'Good rain knows the right time / To fall when spring comes / And follows the wind in the night / Silently, moistening every thing / A wild path black and cloudy / A riverboat fire alone and bright / Dawn has the look that is red and wet / In Brocade City, Guancheng* / "Spring Night, Happy Rain" by Du Fu, written in Chengdu, 759 – 763, during the An Lushan Rebellion.'

mots

'Good rain knows the right time / To fall when spring comes / And follows the wind in the night / Silently, moistening every thing / A wild path black and cloudy / A riverboat fire alone and bright / Dawn has the look that is red and wet / In Brocade City, Guancheng* / "Spring Night, Happy Rain" by Du Fu, written in Chengdu, 759 – 763, during the An Lushan Rebellion.'

In [3]:
# lowercasing

mots = mots.lower()

In [4]:
# removing accents, etc : normalisation

mots = unicodedata.normalize('NFKD', mots).encode('ascii', 'ignore').decode('utf-8')
mots

'good rain knows the right time / to fall when spring comes / and follows the wind in the night / silently, moistening every thing / a wild path black and cloudy / a riverboat fire alone and bright / dawn has the look that is red and wet / in brocade city, guancheng* / "spring night, happy rain" by du fu, written in chengdu, 759  763, during the an lushan rebellion.'

In [5]:
# remove special characters (punctuation)

mots = re.sub(r'[^a-z0-9\s]',' ',mots)
mots

'good rain knows the right time   to fall when spring comes   and follows the wind in the night   silently  moistening every thing   a wild path black and cloudy   a riverboat fire alone and bright   dawn has the look that is red and wet   in brocade city  guancheng     spring night  happy rain  by du fu  written in chengdu  759  763  during the an lushan rebellion '

In [6]:
# function to clean a phrase

def basic_clean(mots):
    
    '''
    this function takes in a string, then 
    prepares it for parsing by lowercasing, 
    normalising and removing special characters
    '''
    
    # lowercasing
    mots = mots.lower()
    
    # removing accents, etc : normalisation
    mots = unicodedata.normalize('NFKD', mots).encode('ascii', 'ignore').decode('utf-8')
    
    # remove special characters (punctuation)
    mots = re.sub(r'[^a-z0-9\s]',' ',mots)

    return mots

In [7]:
# the function

basic_clean(mots)

'good rain knows the right time   to fall when spring comes   and follows the wind in the night   silently  moistening every thing   a wild path black and cloudy   a riverboat fire alone and bright   dawn has the look that is red and wet   in brocade city  guancheng     spring night  happy rain  by du fu  written in chengdu  759  763  during the an lushan rebellion '

### 2/ Define a function named tokenize. It should take in a string and tokenize all the words in the string.


In [8]:
#tokeniser object

tok = nltk.tokenize.ToktokTokenizer()
tok

<nltk.tokenize.toktok.ToktokTokenizer at 0x163587fd0>

In [9]:
# each token

mots = tok.tokenize(mots)
mots

['good',
 'rain',
 'knows',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'comes',
 'and',
 'follows',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silently',
 'moistening',
 'every',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudy',
 'a',
 'riverboat',
 'fire',
 'alone',
 'and',
 'bright',
 'dawn',
 'has',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocade',
 'city',
 'guancheng',
 'spring',
 'night',
 'happy',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'during',
 'the',
 'an',
 'lushan',
 'rebellion']

In [10]:
# tokeniser function

def tokenise(mots):
    
    '''this function takes in a cleaned
    string and breaks it down into tokens
    for nlp stemming and lemmatising
    '''
    #tokeniser object
    tok = nltk.tokenize.ToktokTokenizer()
    
    # each token
    mots = tok.tokenize(mots)
    
    return mots

In [11]:
tokenise(mots)

['[',
 "'",
 'good',
 "'",
 ',',
 "'",
 'rain',
 "'",
 ',',
 "'",
 'knows',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'right',
 "'",
 ',',
 "'",
 'time',
 "'",
 ',',
 "'",
 'to',
 "'",
 ',',
 "'",
 'fall',
 "'",
 ',',
 "'",
 'when',
 "'",
 ',',
 "'",
 'spring',
 "'",
 ',',
 "'",
 'comes',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'follows',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'wind',
 "'",
 ',',
 "'",
 'in',
 "'",
 ',',
 "'",
 'the',
 "'",
 ',',
 "'",
 'night',
 "'",
 ',',
 "'",
 'silently',
 "'",
 ',',
 "'",
 'moistening',
 "'",
 ',',
 "'",
 'every',
 "'",
 ',',
 "'",
 'thing',
 "'",
 ',',
 "'",
 'a',
 "'",
 ',',
 "'",
 'wild',
 "'",
 ',',
 "'",
 'path',
 "'",
 ',',
 "'",
 'black',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'cloudy',
 "'",
 ',',
 "'",
 'a',
 "'",
 ',',
 "'",
 'riverboat',
 "'",
 ',',
 "'",
 'fire',
 "'",
 ',',
 "'",
 'alone',
 "'",
 ',',
 "'",
 'and',
 "'",
 ',',
 "'",
 'bright',
 "'",
 ',',
 "'",
 'dawn',
 "'",
 ',',
 "'",
 'has',
 "'",
 ',',
 "'",
 

### 3/ Define a function named ```stem```. It should accept some text and return the text after applying stemming to all the words.


In [12]:
# create the stemmer obj

ps = nltk.porter.PorterStemmer()

In [13]:
# stemming each word

[ps.stem(mot) for mot in mots]

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follow',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silent',
 'moisten',
 'everi',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudi',
 'a',
 'riverboat',
 'fire',
 'alon',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocad',
 'citi',
 'guancheng',
 'spring',
 'night',
 'happi',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'dure',
 'the',
 'an',
 'lushan',
 'rebellion']

In [16]:
# assigning variable

stems = [ps.stem(mot) for mot in mots]

In [15]:
# joining stems into a clump of text


stems = [ps.stem(word) for word in mots]
' '.join(stems)

'good rain know the right time to fall when spring come and follow the wind in the night silent moisten everi thing a wild path black and cloudi a riverboat fire alon and bright dawn ha the look that is red and wet in brocad citi guancheng spring night happi rain by du fu written in chengdu 759 763 dure the an lushan rebellion'

In [17]:
# function to stem a string

def stem(mots):
    '''
    this function takes in a string, and 
    stems it using the Porter Stemmer
    '''
    # create the stemmer obj
    ps = nltk.porter.PorterStemmer()

    # stemming each word
    stems = [ps.stem(mot) for mot in mots]

    # joining stems into a clump of text
    stems = [ps.stem(word) for word in mots]
    ' '.join(stems)
    
    return stems

In [18]:
# faire fonctionner le fonction

stem(mots)

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follow',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silent',
 'moisten',
 'everi',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudi',
 'a',
 'riverboat',
 'fire',
 'alon',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocad',
 'citi',
 'guancheng',
 'spring',
 'night',
 'happi',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'dure',
 'the',
 'an',
 'lushan',
 'rebellion']

### 4/ Define a function named ```lemmatize```. It should accept some text and return the text after applying lemmatization to each word.


In [19]:
# lemmatiser obj
wln = nltk.stem.WordNetLemmatizer()

In [20]:
# join the lemmatised words

lemmas = [wln.lemmatize(mot) for mot in mots]
' '.join(lemmas)

'good rain know the right time to fall when spring come and follows the wind in the night silently moistening every thing a wild path black and cloudy a riverboat fire alone and bright dawn ha the look that is red and wet in brocade city guancheng spring night happy rain by du fu written in chengdu 759 763 during the an lushan rebellion'

In [30]:
# lemmatising function

def lemmatize(mots):
    
    '''this function takes in a string
    and lemmatises it
    
    # lemmatiser obj
    wln = nltk.stem.WordNetLemmatizer()
    
    # join the lemmatised words

    lemmas = [wln.lemmatize(stem) for stem in stems]
    ' '.join(lemmas)
    
    return lemmas

In [31]:
lemmatize(stems)

['good',
 'rain',
 'know',
 'the',
 'right',
 'time',
 'to',
 'fall',
 'when',
 'spring',
 'come',
 'and',
 'follow',
 'the',
 'wind',
 'in',
 'the',
 'night',
 'silent',
 'moisten',
 'everi',
 'thing',
 'a',
 'wild',
 'path',
 'black',
 'and',
 'cloudi',
 'a',
 'riverboat',
 'fire',
 'alon',
 'and',
 'bright',
 'dawn',
 'ha',
 'the',
 'look',
 'that',
 'is',
 'red',
 'and',
 'wet',
 'in',
 'brocad',
 'citi',
 'guancheng',
 'spring',
 'night',
 'happi',
 'rain',
 'by',
 'du',
 'fu',
 'written',
 'in',
 'chengdu',
 '759',
 '763',
 'dure',
 'the',
 'an',
 'lushan',
 'rebellion']