# Module3 Hand on

## prepraring python library

In [6]:
import pandas as pd # do some data
import string
import timeit # just import for timer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

---

## previous code from previous module

### Retrive Data from .csv

In [1]:
def get_and_clean_data():
    data = pd.read_csv('./data/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

---

# Hand on #1 the bag of words

## A suggested Scikit-learn approach

### Setting up the preProcessor

In [3]:
def preProcess(s):
    ps = PorterStemmer()
    s = word_tokenize(s)
    stopwords_set = set(stopwords.words())
    stop_dict = {s: 1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

### Do some query ( with java before python)

In [4]:
def sk_vectorize():
    cleaned_description = get_and_clean_data()
    vectorizer = CountVectorizer(preprocessor=preProcess)
    vectorizer.fit_transform(cleaned_description)
    query = vectorizer.transform(['good at java and python'])
    print(query)
    print(vectorizer.inverse_transform(query))

In [7]:
sk_vectorize()

  (0, 15571)	1
  (0, 18608)	1
  (0, 26294)	1
[array(['good', 'java', 'python'], dtype='<U182')]


### Do another query ( with python before java)

In [8]:
def sk_vectorize2():
    cleaned_description = get_and_clean_data()
    vectorizer = CountVectorizer(preprocessor=preProcess)
    vectorizer.fit_transform(cleaned_description)
    query = vectorizer.transform(['good at python and java'])
    print(query)
    print(vectorizer.inverse_transform(query))

In [9]:
sk_vectorize2()

  (0, 15571)	1
  (0, 18608)	1
  (0, 26294)	1
[array(['good', 'java', 'python'], dtype='<U182')]


#### you can see that the result from `sk_vectorize()` and `sk_vectorize2()` are the same

### Handing sequence

In [10]:
def sk_vectorize3():
    cleaned_description = get_and_clean_data()
    vectorizer = CountVectorizer(preprocessor=preProcess,ngram_range=(1,2))
    vectorizer.fit_transform(cleaned_description)
    query = vectorizer.transform(['good at python and java'])
    print(query)
    print(vectorizer.inverse_transform(query))

In [11]:
sk_vectorize3()

  (0, 168556)	1
  (0, 203737)	1
  (0, 302056)	1
  (0, 302204)	1
[array(['good', 'java', 'python', 'python java'], dtype='<U274')]


### The feature name

In [16]:
def sk_vectorize4():
    cleaned_description = get_and_clean_data()
    vectorizer = CountVectorizer(preprocessor=preProcess,ngram_range=(1,2))
    X = vectorizer.fit_transform(cleaned_description)
    print(vectorizer.get_feature_names())

In [17]:
sk_vectorize4()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### unforturnary my computer is not fast enough, T-T

---

# Hand on #2 tf-idf

### tf-idf weighting

In [18]:
N = 5
cleaned_des = get_and_clean_data()
cleaned_des = cleaned_des.iloc[:N]
vectez = CountVectorizer(preprocessor=preProcess)
X = vectez.fit_transform(cleaned_des)
print("Before transform:")
print(X.toarray())
print("=================")
print()

print("After transform:")
X.data = np.log10(X.data + 1)
X = X.multiply(np.log10(N / X.sum(0))[0])

print(X.toarray())
print("=================")
print()

print("The summary :")
print(pd.DataFrame(X.toarray(),columns=vectez.get_feature_names()))

Before transform:
[[0 1 1 ... 5 1 0]
 [1 0 0 ... 1 0 1]
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 1 0 1]]

After transform:
[[0.         0.36736504 0.36736504 ... 0.33466756 0.36736504 0.        ]
 [0.27674598 0.         0.         ... 0.12946708 0.         0.27674598]
 [0.         0.         0.         ... 0.20520047 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.27674598 0.         0.         ... 0.12946708 0.         0.27674598]]

The summary :
     110000     18000      1983       250    300000        34       510  \
0  0.000000  0.367365  0.367365  0.367365  0.367365  0.000000  0.000000   
1  0.276746  0.000000  0.000000  0.000000  0.000000  0.000000  0.276746   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.367365  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.276746  0.000000  0.000000  0.000000  0.000000  0.000000  0.276746   

      62304      8000  8882376835  .

### Scikit-learn builtin tf-idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
N = 5
cleaned_des2 = get_and_clean_data()
cleaned_des2 = cleaned_des2.iloc[:N]
vectz = TfidfVectorizer(preprocessor=preProcess)
X = vectz.fit_transform(cleaned_des2)
print(pd.DataFrame(X.toarray(),columns=vectz.get_feature_names()))

     110000     18000      1983       250    300000        34       510  \
0  0.000000  0.045564  0.045564  0.045564  0.045564  0.000000  0.000000   
1  0.074481  0.000000  0.000000  0.000000  0.000000  0.000000  0.074481   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.081564  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.074785  0.000000  0.000000  0.000000  0.000000  0.000000  0.074785   

      62304      8000  8882376835  ...    within   without      work  \
0  0.045564  0.045564    0.045564  ...  0.045564  0.036761  0.065135   
1  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.043990   
2  0.000000  0.000000    0.000000  ...  0.000000  0.065805  0.155462   
3  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.043996   
4  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.044169   

      would     write    writer       xml      year     yield      zaur  
0  0.036761  0.061030  0.045564  0.000000 

#### What a nice tf-idf built in from sklearn

---

# Hand on #3 BM25

## The base code from [`This github gist`](https://gist.github.com/koreyou/f3a8a0470d32aa56b32f198f49a9f2b8)

In [23]:
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        b, k1, avdl = self.b, self.k1, self.avdl

        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

### BM25 and Scikit-learn

In [28]:
from scipy import sparse

N = 5
cleaned_des3 = get_and_clean_data()
cleaned_des3 = cleaned_des3.iloc[:N]
bm25 = BM25()
bm25.fit(cleaned_des3)
print(bm25.transform('aws github',cleaned_des3))

[0.         0.         1.89200257 2.11154135 0.        ]


#### ended hand on 3