# Lab NLP

## Challenge 1 - Installations

In [1]:
import nltk

In [2]:
from nltk.corpus import brown
nltk.download('brown')

print(brown.words()[0:10])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


[nltk_data] Downloading package brown to /Users/Livia/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
print(brown.tagged_words()[0:10])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


In [4]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'




In [5]:
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

sent_tokenize(text)

[nltk_data] Downloading package punkt to /Users/Livia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Ironhack is a Global Tech School ranked num 2 worldwide.',
 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.',
 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.',
 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']

In [6]:
word_tokenize(text)

['Ironhack',
 'is',
 'a',
 'Global',
 'Tech',
 'School',
 'ranked',
 'num',
 '2',
 'worldwide',
 '.',
 'Our',
 'mission',
 'is',
 'to',
 'help',
 'people',
 'transform',
 'their',
 'careers',
 'and',
 'join',
 'a',
 'thriving',
 'community',
 'of',
 'tech',
 'professionals',
 'that',
 'love',
 'what',
 'they',
 'do',
 '.',
 'This',
 'ideology',
 'is',
 'reflected',
 'in',
 'our',
 'teaching',
 'practices',
 ',',
 'which',
 'consist',
 'of',
 'a',
 'nine-weeks',
 'immersive',
 'programming',
 ',',
 'UX/UI',
 'design',
 'or',
 'Data',
 'Analytics',
 'course',
 'as',
 'well',
 'as',
 'a',
 'one-week',
 'hiring',
 'fair',
 'aimed',
 'at',
 'helping',
 'our',
 'students',
 'change',
 'their',
 'career',
 'and',
 'get',
 'a',
 'job',
 'straight',
 'after',
 'the',
 'course',
 '.',
 'We',
 'are',
 'present',
 'in',
 '8',
 'countries',
 'and',
 'have',
 'campuses',
 'in',
 '9',
 'locations',
 '-',
 'Madrid',
 ',',
 'Barcelona',
 ',',
 'Miami',
 ',',
 'Paris',
 ',',
 'Mexico',
 'City',
 ',',
 '

## Challenge 2 - Preparing Text Data For Analysis

In [7]:
in_put = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)])"
out_put = 'ironhack s  q website  is'

In [8]:
import re

def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    
    s = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', s) # para limpiar URL
    s = re.sub('\d', ' ', s) # /d Any numeric character
    s = re.sub('\W', ' ', s) # Any non-alphanumeric character
    return s.lower().strip()

print(clean_up(in_put))

ironhack s  q website     is


### Tokenization

In [9]:
def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """

    return word_tokenize(clean_up(s))

print(tokenize(in_put))

['ironhack', 's', 'q', 'website', 'is']


### Stemming and Lemmatization

In NLTK, there are three stemming libraries: [*Porter*](https://www.nltk.org/_modules/nltk/stem/porter.html), [*Snowball*](https://www.nltk.org/_modules/nltk/stem/snowball.html), and [*Lancaster*](https://www.nltk.org/_modules/nltk/stem/lancaster.html). The difference among the three is the agressiveness with which they perform stemming. Porter is the most gentle stemmer that preserves the word's original form if it has doubts. In contrast, Lancaster is the most aggressive one that sometimes produces wrong outputs. And Snowball is in between. **In most cases you will use either Porter or Snowball**.

In [10]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('was')

lemmatizer.lemmatize('runs', pos='v')

[nltk_data] Downloading package wordnet to /Users/Livia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'run'

In [11]:
from nltk.stem import SnowballStemmer

def stem_and_lemmatize(s):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    return [WordNetLemmatizer().lemmatize(SnowballStemmer('english').stem(x)) for x in tokenize(s)]
    
    
print(stem_and_lemmatize(in_put)) # nos ha quitado una 'e' en websit

['ironhack', 's', 'q', 'websit', 'is']


### Stop Words Removal

Stop Words are the most commonly used words in a language that don't contribute to the main meaning of the texts. Examples of English stop words are `i`, `me`, `is`, `and`, `the`, `but`, and `here`. We want to remove stop words from analysis because otherwise stop words will take the overwhelming portion in our tokenized word list and the NLP algorithms will have problems in identifying the truely important words.

NLTK has a `stopwords` package that allows us to import the most common stop words in over a dozen langauges including English, Spanish, French, German, Dutch, Portuguese, Italian, etc. These are the bare minimum stop words (100-150 words in each language) that can get beginners started. Some other NLP packages such as [*stop-words*](https://pypi.org/project/stop-words/) and [*wordcloud*](https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html) provide bigger lists of stop words.

Now in your Jupyter Notebook, create a function called `remove_stopwords` that loop through a list of words that have been stemmed and lemmatized to check and remove stop words. Return a new list where stop words have been removed.


In [14]:
from nltk.corpus import stopwords 
nltk.download('stopwords')


def remove_stopwords(s):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
       """
    
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [w for w in stem_and_lemmatize(s) if not w in stop_words]  
    
    return filtered_sentence
        
print(remove_stopwords(in_put))

[nltk_data] Downloading package stopwords to /Users/Livia/nltk_data...


['ironhack', 'q', 'websit']


[nltk_data]   Unzipping corpora/stopwords.zip.


## Challenge 3: Sentiment Analysis

In [16]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

txt = "Ironhack is a Global Tech School ranked num 2 worldwide.   Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(txt)




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Livia/nltk_data...


{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.8442}

In [19]:
import pandas as pd

sen = pd.read_csv('../Sentiment140.csv')
sen.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [20]:
short = sen[:5000] # Lo acortamos para que no tarde muchísimo en ejecutarse porque son 1.6 millones de rows

Ahora aplicamos la ristra de funciones definidas en el challenge 2 en la columna text y creamos una columna con text_processed

In [30]:
short['text_processed'] = short['text'].apply(remove_stopwords) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
#def clean_f(x): 
 #   functions = [clean_up, tokenize, stem_and_lemmatize, remove_stopwords]
  #  for f in functions: 
   # x = f(x)
    #return x
#short['text_processed']=short.text.apply(clean_f)

In [32]:
short.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, zl, awww, bummer, shoulda, got, d..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, whi, becaus, see]"


### Creating Bag of Words

The purpose of this step is to create a bag of words from the processed data. The bag of words contains all the unique words in your whole text body (a.k.a. corpus) with the number of occurrence of each word. It will allow you to understand which words are the most important features across the whole corpus.

Also, you can imagine you will have a massive set of words. The less important words (i.e. those of very low number of occurrence) do not contribute much to the sentiment. Therefore, you only need to use the most important words to build your feature set in the next step. In our case, we will use the top 5,000 words with the highest frequency to build the features.

In your Jupyter Notebook, combine all the words in text_processed and calculate the frequency distribution of all words. A convenient library to calculate the term frequency distribution is NLTK's FreqDist class (documentation). Then select the top 5,000 words from the frequency distribution.

In [52]:
words = []
for x in short.text_processed:
    words += x

In [53]:
#words

In [54]:
from nltk.probability import FreqDist

fdist = FreqDist(words)
# sorted(fdist, key=fdist.get, reverse=True)[:5000]

voc = fdist.most_common(5000)
bag_of_words = [x[0] for x in voc]
#bag_of_words

In [64]:
def find_features(document):
    words = set(document)
    features = {}
    for w in bag_of_words:
        features[w] = (w in words)
        
    s = SentimentIntensityAnalyzer().polarity_scores(" ".join(document))
    if s["pos"] > 0.2:
        s = True
    else:
        s = False

    return (features, s)

feature = short.text_processed.apply(find_features)
feature[4000][1] # True or False if the word is positive or negative.

True

### Testing Naïve Bayes Model

Now we'll test our classifier with the test dataset. This is done by calling nltk.classify.accuracy(classifier, test).

As mentioned in one of the tutorial videos, a Naive Bayes model is considered OK if your accuracy score is over 0.6. If your accuracy score is over 0.7, you've done a great job!

In [65]:
# set that we'll train our classifier with
training_set = feature[:1900]

# set that we'll test against.
testing_set = feature[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 76.96774193548387


## Bonus Question 1 & 2: Improve Model Performance & Machine Learning Pipeline

If you are still not exhausted so far and want to dig deeper, try to improve your classifier performance. There are many aspects you can dig into, for example:

Improve stemming and lemmatization. Inspect your bag of words and the most important features. Are there any words you should furuther remove from analysis? You can append these words to further remove to the stop words list.

Remember we only used the top 5,000 features to build model? Try using different numbers of top features. The bottom line is to use as few features as you can without compromising your model performance. The fewer features you select into your model, the faster your model is trained. Then you can use a larger sample size to improve your model accuracy score.

In a new Jupyter Notebook, combine all your codes into a function (or a class). Your new function will execute the complete machine learning pipeline job by receiving the dataset location and output the classifier. **This will allow you to use your function to predict the sentiment of any tweet in real time**.

In [None]:
def download(file):
    s = pd.read_csv('../Sentiment140.csv')
    return s

def clean_up(s):
    sen = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', s) # para limpiar URL
    s = re.sub('\d', ' ', s) # /d Any numeric character
    s = re.sub('\W', ' ', s) # Any non-alphanumeric character
    return s.lower().strip()

def tokenize(s):
    s = word_tokenize(clean_up(s))
    return s

def stem_and_lemmatize(s):
    s = [WordNetLemmatizer().lemmatize(SnowballStemmer('english').stem(x)) for x in tokenize(s)]
    return s

def remove_stopwords(s):
    stop_words = set(stopwords.words('english'))
    s = [w for w in stem_and_lemmatize(s) if not w in stop_words]  
    return s

def sampling(s):
    sample = s[:5000] # Lo acortamos para que no tarde muchísimo en ejecutarse porque son 1.6 millones de rows
    return sample

def applying(sample):
    sample['text_processed'] = sample['text'].apply(remove_stopwords) 
    return sample

def bagOfWords(sample):
    words = [words += x for x in sample.text_processed]
    fdist = FreqDist(words)
    sorted(fdist, key=fdist.get, reverse=True)[500:1000]
    return sample
    
def find_features(sample):
    words = set(sample)
    features = {}
    for w in bag_of_words:
        features[w] = (w in words)
        s = SentimentIntensityAnalyzer().polarity_scores(" ".join(sample))
        if s["pos"] > 0.2:
            s = True
        else:
            s = False
        return features

def applyFindFeatures(sample):
    feature = sample.text_processed.apply(find_features)
    return feature
   
def trainModel(feature):
    training_set = feature[:1900]
    testing_set = feature[1900:]
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    accuracy_percentage = nltk.classify.accuracy(classifier, testing_set)*100
    return accuracy_percentage
