In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from IPython.display import display
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

spacy_nlp = spacy.load("en_core_web_lg")

In [2]:
type(spacy_nlp)

spacy.lang.en.English

In [3]:
class NlpPipeline:
    
    class WrongOrderException(Exception):
        pass
    
    def __init__(self, df, text_column_name, target_name, clean_name='cleaned',
                 token_name='tokenized', nlp=spacy_nlp):
        """
        A pipeline for natural language processing
        
        [REQUIRED]
        df (DataFrame):
                The dataframe with the data
        
        [REQUIRED]
        text_column_name (str):
                The name of the column of documents
        
        [REQUIRED]
        target_name (str):
                The name of the target column
        
        [OPTIONAL] default: "cleaned"
        clean_name (str):
                The name of the column to store cleaned values in.
        
        [OPTIONAL] default: "tokenized"
        token_name (str):
                The name of the column to store tokenized values in.
                
        [OPTIONAL] default: spacy.load("en_core_web_lg")
        nlp (spacy model):
                A spacy language model for 
        """
        # Storing values
        self.container = df
        self.data_name = text_column_name
        self.target_name = target_name
        self.clean_name = clean_name
        self.token_name = token_name
        self.nlp = nlp
        
        col_names = df.columns.to_list()
        
        # Check if column names are already in dataframe
        if clean_name in col_names:
            print("WARNING: Value provided for parameter 'clean_name' is already a column.")
            
        if token_name in col_names:
            print("WARNING: Value provided for parameter 'token_name' is already a column.")
        
        
    def display(self):
        """ A function for displaying the dataframe """
        display(self.container)
        
        
    def Clean(self):
        """ A function for cleaning the dataframe's "text_col" and storing in the "clean_name" column. """
        
        self.container[self.clean_name] = self.container[self.data_name].apply(lambda x: BeautifulSoup(x).get_text()).apply(lambda x: re.sub(r'[^a-zA-Z ^0-9]', '', x))
        return self.container
    
    
    def Tokenize(self):
        """ A function for tokenizing the dataframe's "clean_col" and storing in the "token_name" column. """
        
        # Check that the required info exists. If not, throw exception
        if self.clean_name not in self.container.columns.to_list():
            raise NlpPipeline.WrongOrderException('Call .Clean() first!')
            
        self.container[self.token_name] = self.container[self.clean_name].apply(lambda x: [token.lemma_ for token in self.nlp(x) if (token.is_stop != True) and (token.is_punct != True)])
        return self.container
    
    
    def VectorizeCount(self):
        """ A function for vectorizing the dataframe's "token_name" and storing in the "word_count" var. """
        
        # Check that the required info exists. If not, throw exception
        if self.token_name not in self.container.columns.to_list():
            raise NlpPipeline.WrongOrderException('Call .Tokenize() first!')
            
        self.word_counts = Counter()
        self.container[self.token_name].apply(lambda x: self.word_counts.update(x))
        return self.word_counts
    
    
    def Vectorize(self, how='tfidf'):
        """
        A function for vectorizing the cleaned data and storing it as a dataframe
        
        [OPTIONAL] default: "tfidf"
        how (str): {"count", "tfidf"}
        """
        
        if how.lower() == 'count':
            self.vectorizer = CountVectorizer()
            
        elif how.lower() == 'tfidf':
            self.vectorizer = TfidfVectorizer(ngram_range=(1,2))
            
        dtm = self.vectorizer.fit_transform(self.container[self.clean_name].to_list())
        self.vectorized = pd.DataFrame(dtm.todense(), columns=self.vectorizer.get_feature_names())
            
        return self.vectorized
    
    
    def MakePipeline(self, steps, verbose=False):
        """
        A function for making the pipeline use in other functions
        
        [REQUIRED]
        steps (list):       
                List of (name, transform) tuples (implementing fit/transform)
                that are chained, in the order in which they are chained, with 
                the last object an estimator.
        
        [OPTIONAL] default: False
        verbose (boolean):  
                If True, the time elapsed while fitting each step will be
                printed as it is completed.
        """
        
        self.pipeline = Pipeline(steps, verbose=verbose)
        return self.pipeline
    
    
    def GridSearchCV(self, parameters, cv=5, n_jobs=None, verbose=0, data_type='vectorized'):
        """
        A function for creating and fitting a GridSearchCV model.
        
        [REQUIRED]
        parameters (dict) or (list of dictionaries):
                Dictionary with parameters names (string) as keys and lists
                of parameter settings to try as values, or a list of such
                dictionaries, in which case the grids spanned by each
                dictionary in the list are explored. This enables searching
                over any sequence of parameter settings.
                
        [OPTIONAL] default: 5
        cv (int):
                Determines the cross-validation splitting strategy.
                
        [OPTIONAL] default: None
        n_jobs (int):
                Number of jobs to run in parallel.
                
        [OPTIONAL] default: 0
        verbose (boolean):
                Controls the verbosity: the higher, the more messages.
                
        [NOT IMPLEMENTED]
        data_type (str):
                Will be used to select data to use for fitting
        """
        
        # Check that the required info exists. If not, throw exception
        if hasattr(self, 'pipeline') is False:
            raise NlpPipeline.WrongOrderException("Call .MakePipeline() first!")
        
        # Check that the required info exists. If not, throw exception
        if hasattr(self, 'vectorized') is None:
            raise NlpPipeline.WrongOrderException("Call .Vectorize() first!")
            
        self.model = GridSearchCV(self.pipeline, parameters, cv=5, n_jobs=n_jobs, verbose=1)        
        self.model.fit(self.vectorized, self.container[self.target_name])
        
        return self.model
            
        
    def Predict(self, data):
        """
        Function to run a pandas series through the model to predict the target
        
        [REQUIRED]
        data (pandas Series):
                the data to run through the model to predict the target
                
        """
        
        dtm = self.vectorizer.transform(data.to_list())
        data = pd.DataFrame(dtm.todense(), columns=self.vectorizer.get_feature_names())
            
        # Check that the required info exists. If not, throw exception
        if hasattr(self, 'model') is False:
            raise NlpPipeline.WrongOrderException("Create a model first! \nEG: my_nlp.GridSearchCV()")

        return self.model.predict(data)

In [4]:
jobs = pd.read_csv('./module2-vector-representations/data/job_listings.csv')
jobs = jobs[jobs['title'] == 'Data Scientist'].drop(['title', jobs.columns.to_list()[0]], axis=1).reset_index().drop(['index'], axis=1)
jobs.head()

Unnamed: 0,description
0,"b'<div class=""jobsearch-JobMetadataHeader icl-..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...
2,b'<div>Create various Business Intelligence An...
3,"b""Everytown for Gun Safety, the nation's large..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2..."


In [5]:
my_nlp = NlpPipeline(jobs, 'description', None)

In [6]:
my_nlp.Tokenize()  # testing if exception works

WrongOrderException: Call .Clean() first!

In [7]:
my_nlp.display()

Unnamed: 0,description
0,"b'<div class=""jobsearch-JobMetadataHeader icl-..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...
2,b'<div>Create various Business Intelligence An...
3,"b""Everytown for Gun Safety, the nation's large..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2..."
...,...
145,b'<div><p>FinLocker is a leading financial dat...
146,"b'<div><p>With annual sales of $15 billion, Ec..."
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,..."


In [8]:
my_nlp.Clean()

Unnamed: 0,description,cleaned
0,"b'<div class=""jobsearch-JobMetadataHeader icl-...",b4969 6756 a monthContractUnder the general s...
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,bLocation USA xe2x80x93 multiple locationsn2 y...
2,b'<div>Create various Business Intelligence An...,bCreate various Business Intelligence Analytic...
3,"b""Everytown for Gun Safety, the nation's large...",bEverytown for Gun Safety the nations largest ...
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2...",bnfosysxe2x80x93 Data Analytics xe2x80x93 Sr ...
...,...,...
145,b'<div><p>FinLocker is a leading financial dat...,bFinLocker is a leading financial data and ana...
146,"b'<div><p>With annual sales of $15 billion, Ec...",bWith annual sales of 15 billion Ecolab ECL is...
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...,bJob DescriptionnnThe Enterprise Data Solution...
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,...",bDescriptionnChicago IL IL150SW 150 S Wacker ...


In [9]:
my_nlp.Tokenize()

Unnamed: 0,description,cleaned,tokenized
0,"b'<div class=""jobsearch-JobMetadataHeader icl-...",b4969 6756 a monthContractUnder the general s...,"[b4969, , 6756, monthcontractunder, general, ..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,bLocation USA xe2x80x93 multiple locationsn2 y...,"[bLocation, USA, xe2x80x93, multiple, location..."
2,b'<div>Create various Business Intelligence An...,bCreate various Business Intelligence Analytic...,"[bcreate, Business, Intelligence, Analytical, ..."
3,"b""Everytown for Gun Safety, the nation's large...",bEverytown for Gun Safety the nations largest ...,"[bEverytown, Gun, Safety, nation, large, gun, ..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2...",bnfosysxe2x80x93 Data Analytics xe2x80x93 Sr ...,"[bnfosysxe2x80x93, Data, , analytic, xe2x80x9..."
...,...,...,...
145,b'<div><p>FinLocker is a leading financial dat...,bFinLocker is a leading financial data and ana...,"[bFinLocker, lead, financial, datum, analytic,..."
146,"b'<div><p>With annual sales of $15 billion, Ec...",bWith annual sales of 15 billion Ecolab ECL is...,"[bwith, annual, sale, 15, billion, Ecolab, ECL..."
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...,bJob DescriptionnnThe Enterprise Data Solution...,"[bJob, DescriptionnnThe, Enterprise, Data, Sol..."
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,...",bDescriptionnChicago IL IL150SW 150 S Wacker ...,"[bDescriptionnChicago, , IL, IL150SW, 150, S,..."


In [10]:
my_nlp.VectorizeCount().most_common(10)

[('datum', 934),
 (' ', 469),
 ('work', 439),
 ('business', 398),
 ('experience', 353),
 ('team', 341),
 ('model', 288),
 ('data', 247),
 ('Data', 236),
 ('analysis', 213)]

In [11]:
my_nlp.Vectorize()

Unnamed: 0,10,10 apps,10 countries,10 hours,10 military,10 of,10 time,10 yearsnnthe,100,100 clean,...,zenreach,zenreach products,zeus,zeus founders,zeus has,zeus is,zheng,zheng the,zoom,zoom out
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.051814,0.051814,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [None]:
### DAY THREE

In [None]:
kaggle = pd.read_csv('./module3-document-classification/data/train.csv')
kaggle.head()

In [None]:
kaggle_pipeline = NlpPipeline(kaggle, 'description', 'ratingCategory')
kaggle_pipeline.display()

In [None]:
kaggle_pipeline.Clean()

In [None]:
kaggle_pipeline.Tokenize()

In [None]:
kaggle_pipeline.VectorizeCount()

In [None]:
kaggle_pipeline.Vectorize()

In [None]:
wc = pd.DataFrame(kaggle_pipeline.word_counts.most_common(1000), columns=['word', 'count'])
display(wc.dtypes)
graph = wc[wc['count'] >= 1000]
display(graph)

In [None]:
import matplotlib.pyplot as plt
plt.bar(graph['word'], graph['count']);

In [None]:
test = pd.read_csv('./module3-document-classification/data/test.csv')
test.shape

In [None]:
clf = RandomForestClassifier()

kaggle_pipeline.MakePipeline([
        ('clf', clf)
    ])

parameters = {
    'clf__max_depth':(5,10,15,20)
}

In [None]:
kaggle_pipeline.GridSearchCV(parameters)

In [None]:
y_pred = kaggle_pipeline.Predict(test['description'])
y_pred

In [None]:
pd.Series(y_pred).value_counts()

In [None]:
#kaggle_submission = pd