In [6]:
import pandas as pd
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
def get_and_clean_data():
    data = pd.read_csv('../data/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

In [3]:
def clean_des() :
    cleaned_description = get_and_clean_data()
    cleaned_description = cleaned_description.iloc[:2]
    
    tokenized_description = cleaned_description.apply(lambda s : word_tokenize(s))
    
    sw_removed_description = tokenized_description.apply(
        lambda s : [
            word for word in s if word not in stopwords.words()
        ]
    )
    
    sw_removed_description = sw_removed_description.apply(
        lambda s: [
            word for word in s if len(word) > 2
        ]
    )
    
    ps = PorterStemmer()
    
    stemmed_description = sw_removed_description.apply(
        lambda s: [
            ps.stem(w) for w in s
        ]
    )
    
    return stemmed_description
    


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

stemmed_description = clean_des()

cv = CountVectorizer(analyzer=lambda x:x)
X = cv.fit_transform(stemmed_description)

print(pd.DataFrame(X.toarray()))

   0    1    2    3    4    5    6    7    8    9    ...  277  278  279  280  \
0    0    1    1    1    1    0    1    1    1    0  ...    1    1    3    1   
1    1    0    0    0    0    1    0    0    0    1  ...    0    0    1    0   

   281  282  283  284  285  286  
0    2    1    0    5    1    0  
1    1    0    1    1    0    1  

[2 rows x 287 columns]


In [9]:
print(X.tocsr()[0,:])

  (0, 43)	1
  (0, 239)	10
  (0, 76)	12
  (0, 182)	1
  (0, 139)	1
  (0, 86)	4
  (0, 256)	1
  (0, 155)	5
  (0, 77)	2
  (0, 270)	2
  (0, 277)	1
  (0, 179)	1
  (0, 127)	1
  (0, 221)	1
  (0, 73)	5
  (0, 46)	2
  (0, 223)	2
  (0, 281)	2
  (0, 268)	2
  (0, 259)	7
  (0, 70)	1
  (0, 202)	1
  (0, 104)	1
  (0, 47)	1
  (0, 260)	1
  :	:
  (0, 280)	1
  (0, 147)	1
  (0, 218)	1
  (0, 209)	1
  (0, 11)	2
  (0, 162)	1
  (0, 14)	1
  (0, 134)	2
  (0, 132)	1
  (0, 80)	1
  (0, 187)	2
  (0, 37)	1
  (0, 8)	1
  (0, 83)	1
  (0, 12)	1
  (0, 175)	1
  (0, 117)	1
  (0, 207)	2
  (0, 190)	2
  (0, 60)	2
  (0, 149)	1
  (0, 36)	1
  (0, 191)	1
  (0, 240)	1
  (0, 189)	1


In [11]:
import timeit
import numpy as np
timeit.timeit(lambda: np.matmul(X.toarray(),X.toarray().T),number=1)
np.shape(np.matmul(X.toarray(),X.toarray().T))

timeit.timeit(lambda: X*X.T,number=1)
np.shape(X*X.T)

(2, 2)

In [12]:
timeit.timeit(lambda: X*X.T,number=1)

0.0016618999999877815

In [13]:
timeit.timeit(lambda: X.todok()*X.T.todok(),number=1)

0.04682109999998829

In [14]:
timeit.timeit(lambda: X.tolil()*X.T.tolil(),number=1)

0.0373288000000116

In [15]:
timeit.timeit(lambda: X.tocoo()*X.T.tocoo(),number=1)

0.004357599999991635

In [16]:
timeit.timeit(lambda: X.tocsc()*X.T.tocsc(),number=1)

0.001076299999965613