<a href="https://colab.research.google.com/github/JonNData/Python-Skills/blob/master/NLP_GUIDE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic NLP Reference Guide


## Imports

In [0]:
# Standard library
import os
import re
import sys
from collections import Counter

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import squarify

# Third-party: NLP tooling
import gensim
import spacy
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer
from spacy.tokenizer import Tokenizer

# NOTE(review): newer pyLDAvis releases ship this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
import pyLDAvis.gensim

# Third-party: scikit-learn
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline


In [0]:
# Regex that deletes every character other than ASCII letters, digits and
# spaces. Replace INSERTNAMEHERE with the string you want to clean.
re.sub(r'[^a-zA-Z 0-9]', '', INSERTNAMEHERE)

In [0]:
# Function which takes a corpus of tokenized documents and returns a DataFrame of word counts to analyze:
def count(docs):
    """Build a word-frequency table for a corpus of tokenized documents.

    Args:
        docs: Sized collection of documents, each an iterable of tokens.

    Returns:
        A DataFrame sorted by ``rank`` with the columns ``word``,
        ``appears_in`` (number of documents containing the word), ``count``
        (total occurrences), ``rank`` (1 = most frequent, ties broken by
        first appearance), ``pct_total`` (share of all tokens),
        ``cul_pct_total`` (running sum of ``pct_total`` in rank order) and
        ``appears_in_pct`` (share of documents containing the word).
    """
    n_docs = len(docs)

    term_totals = Counter()
    doc_totals = Counter()
    for doc in docs:
        term_totals.update(doc)
        # A set collapses repeats so each document counts a word only once.
        doc_totals.update(set(doc))

    freq = pd.DataFrame(list(term_totals.items()), columns=['word', 'count'])
    freq['rank'] = freq['count'].rank(method='first', ascending=False)
    freq['pct_total'] = freq['count'] / freq['count'].sum()

    # The cumulative share only makes sense once rows are in rank order.
    freq = freq.sort_values(by='rank')
    freq['cul_pct_total'] = freq['pct_total'].cumsum()

    presence = pd.DataFrame(list(doc_totals.items()),
                            columns=['word', 'appears_in'])
    table = presence.merge(freq, on='word')
    table['appears_in_pct'] = table['appears_in'] / n_docs

    return table.sort_values(by='rank')

## Tokenization

In [0]:
# Load the large English spaCy pipeline used throughout this guide.
nlp = spacy.load("en_core_web_lg")

# spaCy's default stop words plus any project-specific additions.
STOP_WORDS = nlp.Defaults.stop_words | {'insert your stop words here'}

# Tokenizer built on the pipeline's vocabulary.
tokenizer = Tokenizer(nlp.vocab)


def tokenize(dataframe, column='reviews.text'):
    """Tokenize one text column of ``dataframe`` and drop stop words.

    Args:
        dataframe: Object that can be indexed with ``column`` to obtain an
            iterable of text documents (e.g. a pandas DataFrame).
        column: Name of the text column to tokenize. Defaults to
            ``'reviews.text'``.

    Returns:
        A list with one entry per document. Each entry is the list of
        lower-cased token texts that are not in ``STOP_WORDS``.
    """
    tokens = []

    # Read from the argument, not a global ``df``, so the function works on
    # whichever dataframe the caller passes in.
    for doc in tokenizer.pipe(dataframe[column], batch_size=500):
        lowered = (token.text.lower() for token in doc)
        tokens.append([text for text in lowered if text not in STOP_WORDS])

    return tokens

# Store each document's token list in a new column.
# `df` stands for whichever DataFrame holds your documents.
df['tokens'] = tokenize(df)

## Stemming

In [0]:
ps = PorterStemmer()

words = ["wolf", "wolves"]

# Print the Porter stem of each example word, one per line.
for stem in map(ps.stem, words):
    print(stem)

## Lemmatization

In [0]:
def get_lemmas(text):
    """Return the lemmas of ``text``, skipping noise tokens.

    The text is parsed with the module-level ``nlp`` pipeline. Tokens
    flagged as stop words or punctuation, and tokens whose part-of-speech
    tag is ``'PRON'``, are left out.

    Args:
        text: Raw text to lemmatize.

    Returns:
        A list of lemma strings in document order.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.pos_ == 'PRON')
    ]

## Vector Representations

In [0]:
# Count Vectorizer
vect = CountVectorizer(stop_words='english')

# Learn the vocabulary from the corpus (`data`: an iterable of raw text documents).
vect.fit(data)

# Encode that same corpus as a sparse document-term matrix.
dtm = vect.transform(data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (vect.get_feature_names_out()
                 if hasattr(vect, 'get_feature_names_out')
                 else vect.get_feature_names())

# Dense DataFrame with one row per document and one column per vocabulary term.
dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

In [0]:
# TF-IDF Vectorizer

# Instantiate the vectorizer object.
# NOTE(review): scikit-learn calls `tokenizer` with ONE document string and
# expects a list of tokens back. The dataframe-based `tokenize` defined in the
# Tokenization section does not have that shape -- pass a per-document
# callable here (e.g. `get_lemmas`) or confirm which tokenizer is intended.
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 2),
                        max_df=.97,
                        min_df=3,
                        tokenizer=tokenize)

# Learn the vocabulary and compute the TF-IDF weights for every document.
dtm = tfidf.fit_transform(data)

# Get feature names to use as DataFrame column headers.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())
dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

# View the feature matrix as a DataFrame.
dtm.head()

In [0]:
#PCA

def get_word_vectors(words):
    """Convert a list of words into their spaCy word vectors.

    Args:
        words: Iterable of strings; each is run through the module-level
            ``nlp`` pipeline on its own.

    Returns:
        A list holding the ``.vector`` of each parsed word, in input order.
    """
    vectors = []
    for word in words:
        vectors.append(nlp(word).vector)
    return vectors

words = ['car', 'truck', 'suv', 'race', 'elves', 'dragon', 'sword', 'king', 'queen', 'prince', 'horse', 'fish' , 'lion', 'tiger', 'lynx', 'potato']

# Embed every word once; the same vectors feed both the fit and the
# transform below, so the spaCy pipeline is not run twice per word.
word_vectors = get_word_vectors(words)

# Initialise the PCA model and tell it to project data down onto 2 dimensions.
pca = PCA(n_components=2)

# Fit the PCA model to our 300D data. This works out the projection that best
# maintains the relative distances between data points and stores the
# instructions for transforming the data.
pca.fit(word_vectors)

# Tell our (fitted) PCA model to transform our 300D data down onto 2D using
# the instructions it learnt during the fit phase.
word_vecs_2d = pca.transform(word_vectors)

# Let's look at our new 2D word vectors.
word_vecs_2d

## Document Classification

In [0]:
# Basic NLP pipeline: TF-IDF features feeding a random forest.
from sklearn.pipeline import Pipeline

# Unigram + bigram TF-IDF features with English stop words removed.
vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
clf = RandomForestClassifier()

# Step names ('vect', 'clf') become the prefixes for grid-search parameters.
pipe = Pipeline(steps=[('vect', vect), ('clf', clf)])

In [0]:
from sklearn.decomposition import TruncatedSVD

# Singular value decomposition
svd = TruncatedSVD(n_components=100,  # Just here for demo.
                   algorithm='randomized',
                   n_iter=10)


# LSI: Latent semantic indexing (vectorize, then reduce with SVD).
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Pipe: LSI features into the classifier defined in the previous cell.
# `rfc` is not created until the next section, so use `clf` here.
pipe = Pipeline([('lsi', lsi), ('clf', clf)])

## Text Feature Extraction & Classification Pipelines

In [0]:
# Create the pipeline components: a unigram + bigram TF-IDF vectorizer
# and a random forest classifier.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
rfc = RandomForestClassifier()

In [0]:
# Define the pipeline: vectorizer first, classifier second.
# The step names are the prefixes used in the grid-search parameter keys.
pipe = Pipeline(steps=[('vect', vect), ('clf', rfc)])

In [0]:
# Candidate values per hyper-parameter. Keys are '<step name>__<parameter>'.
parameters = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500, 1000),
    'clf__n_estimators': (5, 10),
    'clf__max_depth': (15, 20),
}

# Exhaustive 5-fold cross-validated search over every combination above,
# using all available cores.
grid_search = GridSearchCV(estimator=pipe, param_grid=parameters,
                           cv=5, n_jobs=-1, verbose=1)
# presumably `data` is a Bunch-style dataset with .data (texts) and
# .target (labels) -- confirm against how it is loaded.
grid_search.fit(data.data, data.target)

In [0]:
# Mean cross-validated score of the best parameter combination found above.
grid_search.best_score_

In [0]:
# Predict with the best pipeline found by the search.
# NOTE(review): the empty list looks like a placeholder -- pass a list of raw
# text documents here.
grid_search.predict([])

## Latent Semantic Indexing

In [0]:

# Truncated SVD for dimensionality reduction.
# n_components=100 is just here for demo; the grid search below overrides it.
svd = TruncatedSVD(
    n_components=100,
    algorithm='randomized',
    n_iter=10,
)

In [0]:
# Search space for the nested pipeline. Keys chain step names with '__':
# 'lsi__svd__...' reaches the 'svd' step inside the 'lsi' step.
params = {
    'lsi__svd__n_components': [10, 100, 250],
    'lsi__vect__max_df': [.9, .95, 1.0],
    'clf__n_estimators': [5, 10, 20],
}

In [0]:
# LSI: vectorize the text, then reduce it with truncated SVD.
lsi = Pipeline(steps=[('vect', vect), ('svd', svd)])

# Full pipe: LSI features feeding the random forest.
pipe = Pipeline(steps=[('lsi', lsi), ('clf', rfc)])

# Show the assembled steps.
print(pipe)

In [0]:
# Fit: 5-fold cross-validated search over `params` on all available cores.
grid_search = GridSearchCV(estimator=pipe, param_grid=params,
                           cv=5, n_jobs=-1, verbose=1)
grid_search.fit(data.data, data.target)

In [0]:
# Mean cross-validated score of the best LSI parameter combination.
grid_search.best_score_

## Word Embeddings with Spacy

In [0]:
# Example of parsing a single piece of text with the spaCy pipeline.
doc = nlp("Two bananas in pyjamas")


def get_word_vectors(docs):
    """Return the spaCy document vector for each text in ``docs``.

    Args:
        docs: Iterable of raw text strings.

    Returns:
        A list with the ``.vector`` of each parsed text, in input order.
    """
    parsed = (nlp(text) for text in docs)
    return [item.vector for item in parsed]

In [0]:
# Embed every training description as a document vector.
X = get_word_vectors(train['description'])

# Sanity check: one vector per document.
# NOTE(review): X is built from `train` but compared against `data.data` --
# confirm which dataset the length should match.
len(X) == len(data.data)

In [0]:
# Embed the test descriptions the same way as the training ones.
X_test = get_word_vectors(test['description'])

In [0]:
# Train the random forest on the training vectors and their rating labels.
rfc.fit(X, train['ratingCategory'])

In [0]:
# Score on the same data the model was fit on, so this is a training score,
# not a held-out estimate.
rfc.score(X, train['ratingCategory'])

In [0]:
# Preview the predicted rating categories for the test vectors.
rfc.predict(X_test)

In [0]:
# Store the predictions in the test frame so they can be exported.
test['ratingCategory'] = rfc.predict(X_test)

In [0]:
# Write only the id and predicted category columns to the submission CSV,
# with a header row and without the index column.
test[['id', 'ratingCategory']].to_csv('testSolutionSubmission.csv', header=True, index=False)

## Topic Modeling

In [0]:
# Gensim LDA

# `tokens` is a list of tokenized documents (a list of lists of strings).
id2word = corpora.Dictionary(tokens)

# Remove extreme values: drop words that appear in fewer than 5 documents
# or in more than 95% of them.
id2word.filter_extremes(no_below=5, no_above=0.95)

# Bag-of-words representation of every document.
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in tokens]

lda = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    passes=10,
    workers=8,
    random_state=723812,
)

In [0]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the interactive topic visualisation.
# NOTE(review): this is not the last expression in the cell, so the notebook
# may not display it -- confirm, or move it to the end of its own cell.
pyLDAvis.gensim.prepare(lda, corpus, id2word)

# Topic distribution for every document in the corpus.
distro = [lda[bow] for bow in corpus]

def update(doc, num_topics=15):
    """Expand a sparse topic distribution into a dense dict.

    Args:
        doc: Iterable of entries where ``entry[0]`` is a topic id and
            ``entry[1]`` is that topic's weight.
        num_topics: Number of topic ids to pre-fill with 0. Defaults to 15,
            matching the ``num_topics`` passed to the LDA model in this guide.

    Returns:
        A dict mapping every topic id in ``range(num_topics)`` to its weight,
        with 0 for topics that do not appear in ``doc``.
    """
    # Start with every topic at 0 so topics missing from `doc` still get a key.
    d_dist = dict.fromkeys(range(num_topics), 0)
    for entry in doc:
        d_dist[entry[0]] = entry[1]
    return d_dist
    
# Dense topic distribution (one dict per document) for the whole corpus.
new_distro = list(map(update, distro))