# LAB 5. LSTM FOR TEXT CLASSIFICATION & SENTIMENT ANALYSIS

In [86]:
"""1. Load the pipeline and the en_core_web_md modules
2. Show the components considered in the pipeline
3. Load the SA dataset from Campus Virtual
4. Explore the dataset to describe it
5. Add the text categorizer component (using a multilabel model) to the pipeline
6. Add two labels: positive and negative sentiments
7. Create the comments’ samples
8. Initialize the pipeline
9. Enable the text categorizer component to be trained
10. Create an optimizer object (resume_training) to keep weights of existing statistical
models
11. Set 5 training epochs, and loss values
12. Test new data"""

'1. Load the pipeline and the en_core_web_md modules\n2. Show the components considered in the pipeline\n3. Load the SA dataset from Campus Virtual\n4. Explore the dataset to describe it\n5. Add the text categorizer component (using a multilabel model) to the pipeline\n6. Add two labels: positive and negative sentiments\n7. Create the comments’ samples\n8. Initialize the pipeline\n9. Enable the text categorizer component to be trained\n10. Create an optimizer object (resume_training) to keep weights of existing statistical\nmodels\n11. Set 5 training epochs, and loss values\n12. Test new data'

### 1. Load the pipeline and the en_core_web_md modules

In [87]:
#Load the pipeline and the en_core_web_md modules
import spacy

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### 2. Show the components considered in the pipeline

In [88]:
#Show the components considered in the pipeline
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


### 3. Load the SA dataset from Campus Virtual

In [89]:
#Load the SA dataset from Campus Virtual
import pandas as pd

sadataset = pd.read_csv("./contents/SA_dataset.csv")

In [90]:
sadataset.head()

Unnamed: 0,Review,Rating,Sentiment
0,**Possible Spoilers**,1,0
1,"Read the book, forget the movie!",2,0
2,**Possible Spoilers Ahead**,2,0
3,"What a script, what a story, what a mess!",2,0
4,I hope this group of film-makers never re-unites.,1,0


### 4. Explore the dataset to describe it

In [91]:
#Explore the dataset to describe it
print(sadataset.describe())


            Rating    Sentiment
count  5000.000000  5000.000000
mean      5.902200     0.550000
std       3.653944     0.497543
min       1.000000     0.000000
25%       2.000000     0.000000
50%       7.000000     1.000000
75%      10.000000     1.000000
max      10.000000     1.000000


In [92]:
#Get rating distribution
rating_distribution = sadataset['Rating'].value_counts()
print(rating_distribution)
#Now print it in percentages 
rating_distribution = sadataset['Rating'].value_counts(normalize=True)
print(rating_distribution)

Rating
10    1385
1     1061
8      520
9      472
3      401
4      401
2      387
7      373
Name: count, dtype: int64
Rating
10    0.2770
1     0.2122
8     0.1040
9     0.0944
3     0.0802
4     0.0802
2     0.0774
7     0.0746
Name: proportion, dtype: float64


### 5. Add the text categorizer component (using a multilabel model) to the pipeline

In [93]:
# Add the text categorizer component (using a multilabel model) to the pipeline
textcat= nlp.add_pipe("textcat")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']


### 6. Add two labels: positive and negative sentiments

In [94]:
#Add two labels: positive and negative sentiments
nlp.get_pipe("textcat").add_label("positive")
nlp.get_pipe("textcat").add_label("negative")


1

### 7. Create the comments’ samples

In [95]:
sadataset.tail()

Unnamed: 0,Review,Rating,Sentiment
4995,"I have only seen this once--in 1986, at an ""ar...",10,1
4996,"This being my first John Carpenter film, I mus...",9,1
4997,"This is kind of a weird movie, given that Sant...",1,0
4998,"Vic (Richard Dreyfuss) is a mob boss, leaving ...",4,0
4999,"Yup, that's right folks, this is undoubtedly t...",1,0


In [108]:
# Training the model
from sklearn.model_selection import train_test_split

X = sadataset['Review'].values
y = sadataset['Sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
# Training the model
from spacy.training import Example

def create_examples(X, y):
    examples = []
    for text, label in zip(X, y):
        examples.append(Example.from_dict(nlp.make_doc(text), {'cats': {'positive': int(label), 'negative': int(not label)}}))
    return examples

train_examples = create_examples(X_train, y_train)
test_examples = create_examples(X_test, y_test)

### 8. Initialize the pipeline

In [111]:
#Spacy's Example class is used to create the training data
from spacy.training.example import Example
input = Example.from_dict(nlp.make_doc("This is a good product"), {"cats": {"positive": 1, "negative": 0}})
textcat.initialize(lambda: [input], nlp = nlp)

### 9. Enable the text categorizer component to be trained

### 10. Create an optimizer object (resume_training) to keep weights of existing statistical models

In [112]:

#Create an optimizer object (resume_training) to keep weights of existing statistical models
from spacy.util import minibatch
import random
from sklearn.model_selection import train_test_split

random.seed(1)
spacy.util.fix_random_seed(1)

optimizer = nlp.resume_training()

### 11. Set 5 training epochs, and loss values

In [113]:


#Set 5 training epochs, and loss values
num_epochs = 5
for i in range(num_epochs):
    dicti = {}
    nlp.update(X_train, sgd=optimizer, losses=dicti, drop=0.2)
    print(dicti)
    
    

TypeError: [E978] The Language.update method takes a list of Example objects, but got: {<class 'str'>}

### 12. Test new data

In [72]:
#Test new data
test_text = "This movie sucked, you should not see it"

doc = nlp(test_text)
doc.cats



{'POSITIVE': 0.5115741491317749, 'NEGATIVE': 0.4884258508682251}

In [73]:
import numpy as np
test_text = "This movie was the best one I have ever seen, i loved it"

doc = nlp(test_text)
doc.cats

{'POSITIVE': 0.36717143654823303, 'NEGATIVE': 0.6328285336494446}