In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from IPython.display import display

class NLP:
    
    class WrongOrderException(Exception):
        pass
    
    def __init__(self, df, text_col, clean_name='cleaned', token_name='tokens'):
        self.container = df
        self.data_name = text_col
        
        col_names = df.columns.to_list()
        
        if clean_name in col_names:
            print("WARNING: Value provided for parameter 'clean_name' is already a column.")
        
        self.clean_name = clean_name
            
        if token_name in col_names:
            print("WARNING: Value provided for parameter 'token_name' is already a column.")
        
        self.token_name = token_name
        
    def display(self):
        display(self.container)
        
    def Clean(self):
        self.container[self.clean_name] = self.container[self.data_name].apply(lambda x: BeautifulSoup(x).get_text()).apply(lambda x: re.sub(r'[^a-zA-Z ^0-9]', '', x))
        return self.container
    
    def Tokenize(self):
        if self.clean_name not in self.container.columns.to_list():
            raise self.WrongOrderException('Call .Clean() first!')
        
        nlp = spacy.load("en_core_web_lg")
        self.container[self.token_name] = self.container[self.clean_name].apply(lambda x: [token.lemma_ for token in nlp(x) if (token.is_stop != True) and (token.is_punct != True)])
        return self.container
    
    def VectorizeCount(self):
        if self.token_name not in self.container.columns.to_list():
            raise NLP.WrongOrderException('Call .Tokenize() first!')
            
        self.word_counts = Counter()
        self.container[self.token_name].apply(lambda x: self.word_counts.update(x))
        return NLP.word_counts
    
    def TFIDF(self):
        vectorizer = CountVectorizer()
        dtm = vectorizer.fit_transform(self.container[self.data_name].to_list())
        self.tfidf = pd.DataFrame(dtm.todense(), columns=vectorizer.get_feature_names())
        return self.tfidf

In [2]:
jobs = pd.read_csv('./module2-vector-representations/data/job_listings.csv')
jobs = jobs[jobs['title'] == 'Data Scientist'].drop(['title', jobs.columns.to_list()[0]], axis=1).reset_index().drop(['index'], axis=1)
jobs.head()

Unnamed: 0,description
0,"b'<div class=""jobsearch-JobMetadataHeader icl-..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...
2,b'<div>Create various Business Intelligence An...
3,"b""Everytown for Gun Safety, the nation's large..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2..."


In [3]:
my_nlp = NLP(jobs, 'description')

In [4]:
my_nlp.Tokenize()

WrongOrderException: Call .Clean() first!

In [5]:
my_nlp.display()

Unnamed: 0,description
0,"b'<div class=""jobsearch-JobMetadataHeader icl-..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...
2,b'<div>Create various Business Intelligence An...
3,"b""Everytown for Gun Safety, the nation's large..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2..."
...,...
145,b'<div><p>FinLocker is a leading financial dat...
146,"b'<div><p>With annual sales of $15 billion, Ec..."
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,..."


In [6]:
my_nlp.Clean()

Unnamed: 0,description,cleaned
0,"b'<div class=""jobsearch-JobMetadataHeader icl-...",b4969 6756 a monthContractUnder the general s...
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,bLocation USA xe2x80x93 multiple locationsn2 y...
2,b'<div>Create various Business Intelligence An...,bCreate various Business Intelligence Analytic...
3,"b""Everytown for Gun Safety, the nation's large...",bEverytown for Gun Safety the nations largest ...
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2...",bnfosysxe2x80x93 Data Analytics xe2x80x93 Sr ...
...,...,...
145,b'<div><p>FinLocker is a leading financial dat...,bFinLocker is a leading financial data and ana...
146,"b'<div><p>With annual sales of $15 billion, Ec...",bWith annual sales of 15 billion Ecolab ECL is...
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...,bJob DescriptionnnThe Enterprise Data Solution...
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,...",bDescriptionnChicago IL IL150SW 150 S Wacker ...


In [7]:
my_nlp.Tokenize()

Unnamed: 0,description,cleaned,tokens
0,"b'<div class=""jobsearch-JobMetadataHeader icl-...",b4969 6756 a monthContractUnder the general s...,"[b4969, , 6756, monthcontractunder, general, ..."
1,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,bLocation USA xe2x80x93 multiple locationsn2 y...,"[bLocation, USA, xe2x80x93, multiple, location..."
2,b'<div>Create various Business Intelligence An...,bCreate various Business Intelligence Analytic...,"[bcreate, Business, Intelligence, Analytical, ..."
3,"b""Everytown for Gun Safety, the nation's large...",bEverytown for Gun Safety the nations largest ...,"[bEverytown, Gun, Safety, nation, large, gun, ..."
4,"b""nfosys\xe2\x80\x93 Data &amp; Analytics \xe2...",bnfosysxe2x80x93 Data Analytics xe2x80x93 Sr ...,"[bnfosysxe2x80x93, Data, , analytic, xe2x80x9..."
...,...,...,...
145,b'<div><p>FinLocker is a leading financial dat...,bFinLocker is a leading financial data and ana...,"[bFinLocker, lead, financial, datum, analytic,..."
146,"b'<div><p>With annual sales of $15 billion, Ec...",bWith annual sales of 15 billion Ecolab ECL is...,"[bwith, annual, sale, 15, billion, Ecolab, ECL..."
147,b'<div>Job Description:<br/>\n<br/>\nThe Enter...,bJob DescriptionnnThe Enterprise Data Solution...,"[bJob, DescriptionnnThe, Enterprise, Data, Sol..."
148,"b""<div>Description:\n<p>Chicago - IL, IL150SW,...",bDescriptionnChicago IL IL150SW 150 S Wacker ...,"[bDescriptionnChicago, , IL, IL150SW, 150, S,..."


In [8]:
my_nlp.VectorizeCount().most_common(10)

AttributeError: type object 'NLP' has no attribute 'word_counts'

In [None]:
my_nlp.TFIDF()