<a href="https://colab.research.google.com/github/MANOJ21K/NLP/blob/main/Intro_to_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U spacy[cuda110,transformers,lookups]==3.0.3
!pip install -U spacy-lookups-data==1.0.0
!pip install cupy-cuda110==8.5.0
!python -m spacy download en_core_web_trf

In [None]:
# Import spacy and download language model
import spacy
nlp = spacy.load("en_core_web_trf")

In [None]:
#TOKENIZATION
"""
Tokenization is where all NLP work begins; before the machine can process any of the text it sees, it must break the text into bite-sized tokens.
Tokenization will segment text into words
"""

sentence = nlp.tokenizer("We live in India.")

#Length of sentence
print("The number of tokens: ", len(sentence))

#Print individual words (i.e., tokens)
print("The tokens: ")
for words in sentence:
    print(words)

The number of tokens:  5
The tokens: 
We
live
in
India
.


In [None]:
#Steps to Authenticate kaggle account
from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#Toeknization on complex data

#getting data from kaggle using copy API 
!kaggle datasets download -d tunguz/200000-jeopardy-questions



Downloading 200000-jeopardy-questions.zip to /content
  0% 0.00/11.5M [00:00<?, ?B/s] 43% 5.00M/11.5M [00:00<00:00, 49.4MB/s]
100% 11.5M/11.5M [00:00<00:00, 93.1MB/s]


In [None]:
!unzip 200000-jeopardy-questions.zip


Archive:  200000-jeopardy-questions.zip
  inflating: JEOPARDY_CSV.csv        


In [None]:
import pandas as pd
df = pd.read_csv("JEOPARDY_CSV.csv")

In [None]:
df = pd.DataFrame(df[' Question'])
df.columns = ['Questions']
df

Unnamed: 0,Questions
0,"For the last 8 years of his life, Galileo was ..."
1,No. 2: 1912 Olympian; football star at Carlisl...
2,The city of Yuma in this state has a record av...
3,"In 1963, live on ""The Art Linkletter Show"", th..."
4,"Signer of the Dec. of Indep., framer of the Co..."
...,...
216925,This Puccini opera turns on the solution to 3 ...
216926,In North America this term is properly applied...
216927,"In Penny Lane, where this ""Hellraiser"" grew up..."
216928,"From Ft. Sill, Okla. he made the plea, Arizona..."


In [None]:
sentence = nlp.tokenizer(df['Questions'][0])
for i in sentence:
  print(i)

In [None]:
#PART OF SPEECH TAGGING

sentence = nlp(df['Questions'][0])

# Print Part-of-speech tags for tokens in the first question
print("Here are the Part-of-speech tags for each token in the first question:")
for token in sentence:
    print(token.text,token.pos_, spacy.explain(token.pos_))

Here are the Part-of-speech tags for each token in the first question:
For ADP adposition
the DET determiner
last ADJ adjective
8 NUM numeral
years NOUN noun
of ADP adposition
his PRON pronoun
life NOUN noun
, PUNCT punctuation
Galileo PROPN proper noun
was AUX auxiliary
under ADP adposition
house NOUN noun
arrest NOUN noun
for ADP adposition
espousing VERB verb
this DET determiner
man NOUN noun
's PART particle
theory NOUN noun


In [None]:
#DEPENDENCY PARSING

# Print Dependency Parsing tags for tokens in the first question
for token in sentence:
    print(token.text,token.dep_, spacy.explain(token.dep_))

For prep prepositional modifier
the det determiner
last amod adjectival modifier
8 nummod numeric modifier
years pobj object of preposition
of prep prepositional modifier
his poss possession modifier
life pobj object of preposition
, punct punctuation
Galileo nsubj nominal subject
was ROOT root
under prep prepositional modifier
house compound compound
arrest pobj object of preposition
for prep prepositional modifier
espousing pcomp complement of preposition
this det determiner
man poss possession modifier
's case case marking
theory dobj direct object


In [None]:
# Visualize the dependency parse
from spacy import displacy

displacy.render(sentence, style='dep',
                jupyter=True, options={'distance': 120})

In [None]:
#CHUNKING

# Print chunks for example sentence 1
for chunk in nlp("My parents live in New York City.").noun_chunks:
      print(chunk.text)


In [None]:
# Print chunks for example sentence 2
for chunk in nlp(df['Questions'][0]).noun_chunks:
      print(chunk.text)

the last 8 years
his life
Galileo
house arrest
this man's theory


In [None]:
#Lemmatization

# Print lemmatization for tokens in the first question
lemmatization = pd.DataFrame(data=[], columns=["original","lemmatized"])
i = 0
for token in sentence:
    lemmatization.loc[i,"original"] = token.text
    lemmatization.loc[i,"lemmatized"] = token.lemma_
    i = i+1

lemmatization

Unnamed: 0,original,lemmatized
0,For,for
1,the,the
2,last,last
3,8,8
4,years,year
5,of,of
6,his,his
7,life,life
8,",",","
9,Galileo,Galileo


In [None]:
#NAMED ENTITY RECOGNITION

# Print NER results
example_sentence = """George Washington was an American political leader, 
military general, statesman, and Founding Father who served as the 
first president of the United States from 1789 to 1797.\n"""

print(example_sentence)


doc = nlp(example_sentence)
for token in doc.ents:
    print(token.text, token.start_char, token.end_char, token.label_)

George Washington was an American political leader, 
military general, statesman, and Founding Father who served as the 
first president of the United States from 1789 to 1797.

George Washington 0 17 PERSON
American 25 33 NORP
first 121 126 ORDINAL
the United States 140 157 GPE
1789 to 1797 163 175 DATE


In [None]:
# Visualize NER results
displacy.render(doc, style='ent', jupyter=True, options={'distance': 120})

In [None]:
#NAMED ENTITY LINKAGE


# Import libraries
import requests
import json

# Define Google Knowledge Graph API Result function
def returnGraphResult(query, key, entityType):
    if entityType == "PERSON":
        google = f"https://kgsearch.googleapis.com/v1/entities:search?query={query}&key={key}&limit=1&indent=True"
        resp = requests.get(google)
        if resp.status_code == 200:
            try:
                result = json.loads(resp.text)
                if 'itemListElement' in result and len(result['itemListElement']) > 0:
                    url = result['itemListElement'][0]['result'].get('detailedDescription', {}).get('url', 'no_match')
                    description = result['itemListElement'][0]['result'].get('detailedDescription', {}).get('articleBody', 'no_match')
                    return url, description
            except json.JSONDecodeError:
                pass
    return "no_match", "no_match"


In [None]:
# Print Wikipedia descriptions and URLs for entities
doc = nlp(example_sentence)
key="AIzaSyD3JMgqBKS-pIveGJ_cUhQWcZYfJBrsq7k"

for token in doc.ents:
   url, description = returnGraphResult(token.text, key, entityType)
   print(token.text, token.label_, url, description)


George Washington PERSON https://en.wikipedia.org/wiki/George_Washington George Washington was an American military officer, statesman, and Founding Father who served as the first president of the United States from 1789 to 1797. 
American NORP https://en.wikipedia.org/wiki/American_Airlines American Airlines is a major US-based airline headquartered in Fort Worth, Texas, within the Dallas–Fort Worth metroplex. It is the largest airline in the world when measured by fleet size, scheduled passengers carried, and revenue passenger mile. 
first ORDINAL https://en.wikipedia.org/wiki/La_Liga The Campeonato Nacional de Liga de Primera División, commonly known simply as Primera División in Spain, and as La Liga in English-speaking countries and officially as LaLiga Santander for sponsorship reasons, stylized as LaLiga, is the men's top professional football division of the Spanish football league system. 
the United States GPE https://en.wikipedia.org/wiki/Joe_Biden Joseph Robinette Biden Jr.

**CHAPTER 2: TRANSFORMERS**

What I learnt in this chapter:
1. Using fast.ai to built text_classifier 
2. Using hugging face to build the next word predictor for a sentence

In [None]:
""" 
fastai is more than your standard deep learning library. It includes tools that help
you solve the problem at hand end-to-end as fast as possible. One of those tools is a
built-in set of common datasets that can be easily downloaded
"""

' \nfastai is more than your standard deep learning library. It includes tools that help\nyou solve the problem at hand end-to-end as fast as possible. One of those tools is a\nbuilt-in set of common datasets that can be easily downloaded\n'

In [None]:
from fastai.text.all import *
path = untar_data(URLs.IMDB)
dls = TextDataLoaders.from_folder(path, valid='test')
dls.show_batch()

In [None]:
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [None]:
learn.fine_tune(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time


epoch,train_loss,valid_loss,accuracy,time
0,0.461801,0.443504,0.79596,2:17:46


epoch,train_loss,valid_loss,accuracy,time


In [None]:
learn.show_results()

In [None]:
learn.predict("That movie was wicked cool!")

**Using hugging face lib to predict the next text**

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pretrained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "With great power comes great "
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
print(tokens_tensor)

tensor([[3152, 1049, 1176, 2058, 1049,  220]])


In [None]:
# Load pretrained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')


# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# Predict all tokens
with torch.no_grad():
 outputs = model(tokens_tensor)
 predictions = outputs[0]


# Get the predicted next subword
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
print(predicted_text)


With great power comes great ills


**CHAPTER3 - NLP TASKS AND APPLICATIONS**

In [None]:
#lets work on NER and text classification on AG News classification dataset available in kaggle

Use below code to authenticate to kaggle by uploading the json file dowloaded from kaggle profile before getting the data here

In [None]:
from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

#!pip install kaggle

Saving kaggle (1).json to kaggle (1) (6).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#getting data from kaggle using copy API 
!kaggle datasets download -d amananandrai/ag-news-classification-dataset

401 - Unauthorized


In [None]:
!unzip ag-news-classification-dataset.zip

unzip:  cannot find or open ag-news-classification-dataset.zip, ag-news-classification-dataset.zip.zip or ag-news-classification-dataset.zip.ZIP.


In [None]:
# Import libraries
import pandas as pd
import os
# Get current working directory
cwd = os.getcwd()
# Import AG Dataset
data = pd.read_csv("train.csv")

In [None]:
data.head()
#defination of class Index (1-World, 2-Sports, 3-Business, and 4-Sci/Tech)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [None]:
data.columns = data.columns.str.replace(" ","_")
data.columns = data.columns.str.lower()
data["class_name"] = data["class_index"].map({1:"World", 2:"Sports",
 3:"Business", 4:"Sci_Tech"})

In [None]:
data.head()

Unnamed: 0,class_index,title,description,class_name
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Business
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Business
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Business
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Business
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",Business


In [None]:
# Count observations by class
data.class_name.value_counts()

Business    30000
Sci_Tech    30000
Sports      30000
World       30000
Name: class_name, dtype: int64

In [None]:
# Clean up text
cols = ["title","description"]
data[cols] = data[cols].applymap(lambda x: x.replace("\\"," "))
data[cols] = data[cols].applymap(lambda x: x.replace("#36;","$"))
data[cols] = data[cols].applymap(lambda x: x.replace(" "," "))
data[cols] = data[cols].applymap(lambda x: x.strip())

In [None]:
# Write data to CSV
data.to_csv('train_prepared.csv', index=False)

Lets work on NER hands on using spacy

In [None]:
!pip install -U spacy[cuda110,transformers,lookups]==3.0.3
!pip install -U spacy-lookups-data==1.0.0
!pip install cupy-cuda110==8.5.0
!python -m spacy download en_core_web_trf

In [None]:
# Import spacy and load language model
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# View metadata of the model
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(nlp.meta)

{   'author': 'Explosion',
    'components': [   'tok2vec',
                      'tagger',
                      'parser',
                      'senter',
                      'attribute_ruler',
                      'lemmatizer',
                      'ner'],
    'description': 'English pipeline optimized for CPU. Components: tok2vec, '
                   'tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
    'disabled': ['senter'],
    'email': 'contact@explosion.ai',
    'labels': {   'attribute_ruler': [],
                  'lemmatizer': [],
                  'ner': [   'CARDINAL',
                             'DATE',
                             'EVENT',
                             'FAC',
                             'GPE',
                             'LANGUAGE',
                             'LAW',
                             'LOC',
                             'MONEY',
                             'NORP',
                             'ORDINAL',
                    

In [None]:
# Print NER results for Descriptions
for i in range(9):
 print("Article",i)
 print(data.loc[i,"description"])
 print("Text Start End Label")
 doc = nlp(data.loc[i,"description"])
 for token in doc.ents:
  print(token.text, token.start_char,
  token.end_char, token.label_)
 print("\n")


Article 0
Reuters - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.
Text Start End Label
Reuters 0 7 ORG


Article 1
Reuters - Private investment firm Carlyle Group, which has a reputation for making well-timed and occasionally controversial plays in the defense industry, has quietly placed its bets on another part of the market.
Text Start End Label
Reuters 0 7 ORG
Carlyle Group 34 47 ORG


Article 2
Reuters - Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums.
Text Start End Label
Reuters - Soaring 0 17 ORG
next week 134 143 DATE
summer 168 174 DATE


Article 3
Reuters - Authorities have halted oil export flows from the main pipeline in southern Iraq after intelligence showed a rebel militia could strike infrastructure, an oil official said on Saturday.
Text Start End Label
Reuters - Authorities 0 21 ORG
Iraq 86 90 GPE
Saturda

Text classification

In [None]:
import random
import spacy
from spacy.util import minibatch, compounding
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the AG News dataset
df = pd.read_csv("path/to/ag_news_csv/train.csv", header=None, names=["class", "title", "description"])
# Convert class labels to integers
df["class"] = df["class"].astype("category").cat.codes

# Split dataset into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Define the TextCategorizer pipeline component
def create_textcat(nlp):
    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
    for i in range(df["class"].nunique()):
        textcat.add_label(str(i))
    return textcat

# Define the training function
def train(model, train_data, optimizer):
    losses = {}
    random.seed(42)
    nlp = spacy.load(model)
    textcat = nlp.get_pipe("textcat")
    with nlp.disable_pipes("textcat"):
        # Train only the textcat component
        optimizer = textcat.begin_training()
        for i in range(10):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, labels = zip(*batch)
                nlp.update(texts, labels, sgd=optimizer, losses=losses)
            print(f"Loss at iteration {i}: {losses['textcat']:.3f}")

# Define the testing function
def test(model, test_data):
    nlp = spacy.load(model)
    textcat = nlp.get_pipe("textcat")
    texts = test_data["title"].tolist()
    true_labels = test_data["class"].tolist()
    preds = []
    for doc in nlp.pipe(texts):
        preds.append(doc.cats)
    pred_labels = [max(d, key=d.get) for d in preds]
    print(classification_report(true_labels, pred_labels))

# Initialize the model and add the TextCategorizer component
nlp = spacy.load("en_core_web_sm")
textcat = create_textcat(nlp)
nlp.add_pipe(textcat)

# Train the model
train(nlp, train_data[["title", "class"]].to_records(index=False), "Adam")

# Test the model
test(nlp, test_data)

ConfigValidationError: ignored