# ISA 414 - Managing Big Data
## Lecture 14 – Text Mining (Part I)

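#### Slide 32: Make sure you install the *scikit-learn* and *pandas* packages first by running **pip install scikit-learn** and **pip install pandas** in the Terminal. (The package is imported as `sklearn`, but it is installed under the name `scikit-learn`.)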

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A toy corpus of three short documents.
corpus = [
    "jazz music has a swing rhythm",
    "swing is hard to explain",
    "swing rhythm is a natural rhythm",
]

# Learn the vocabulary and compute the TF-IDF weights in a single step.
vectorizer = TfidfVectorizer()
tfidf_values = vectorizer.fit_transform(corpus)

# The result is printed in "sparse-matrix" notation: (x, y) followed by a weight
#   x = document number (there are three documents in our corpus)
#   y = column number (recall that each word in our corpus becomes a column)
print(tfidf_values)

  (0, 7)	0.3837699307603192
  (0, 8)	0.2980315863446099
  (0, 2)	0.5046113401371842
  (0, 5)	0.5046113401371842
  (0, 4)	0.5046113401371842
  (1, 0)	0.5046113401371842
  (1, 9)	0.5046113401371842
  (1, 1)	0.5046113401371842
  (1, 3)	0.3837699307603192
  (1, 8)	0.2980315863446099
  (2, 6)	0.48559571020624154
  (2, 3)	0.3693080540613576
  (2, 7)	0.7386161081227152
  (2, 8)	0.2868006489817671


#### Slide 33

In [2]:
import pandas

# Create a document-term matrix: one row per document, one column per term,
# with the TF-IDF weights converted from sparse to dense form.
dtm = pandas.DataFrame(
    tfidf_values.toarray(),
    # Column names come from the fitted vocabulary.
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() (available since 1.0) replaces it.
    columns=vectorizer.get_feature_names_out(),
)
dtm



Unnamed: 0,explain,hard,has,is,jazz,music,natural,rhythm,swing,to
0,0.0,0.0,0.504611,0.0,0.504611,0.504611,0.0,0.38377,0.298032,0.0
1,0.504611,0.504611,0.0,0.38377,0.0,0.0,0.0,0.0,0.298032,0.504611
2,0.0,0.0,0.0,0.369308,0.0,0.0,0.485596,0.738616,0.286801,0.0


#### Slide 34 (stop words)

In [3]:
corpus = [
    "you won     $1000000 dollars",
    "I love ISA 414 with all my heart",
    "Improve your love life now: buy Viagra",
]

# stop_words="english" drops the words in sklearn's built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_values = vectorizer.fit_transform(corpus)

# Create a document-term matrix with the terms as column names.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) replaces it.
dtm = pandas.DataFrame(
    tfidf_values.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# note how words like 'with' and punctuation marks are removed
# moreover, all words are lowercase
dtm



Unnamed: 0,1000000,414,buy,dollars,heart,improve,isa,life,love,viagra,won
0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735
1,0.0,0.528635,0.0,0.0,0.528635,0.0,0.528635,0.0,0.40204,0.0,0.0
2,0.0,0.0,0.467351,0.0,0.0,0.467351,0.0,0.467351,0.355432,0.467351,0.0


In [4]:
# Print the list of stop words that come with sklearn, i.e. the list this
# vectorizer uses because it was created with stop_words = "english".
# The list is returned as a frozenset, so the printed order is arbitrary.
print(vectorizer.get_stop_words())

frozenset({'for', 'whom', 'seems', 'even', 'afterwards', 'thick', 'both', 'out', 'besides', 'somehow', 'fifty', 're', 'me', 'next', 'mill', 'ours', 'hundred', 'amoungst', 'thru', 'him', 'around', 'thereby', 'my', 'show', 'take', 'been', 'those', 'became', 'every', 'moreover', 'often', 'may', 'such', 'until', 'hereupon', 'perhaps', 'should', 'herein', 'is', 'anyway', 'becoming', 'eg', 'couldnt', 'only', 'beforehand', 'noone', 'can', 'else', 'why', 'since', 'everyone', 'beyond', 'first', 'down', 'latterly', 'few', 'per', 'her', 'its', 'front', 'no', 'again', 'these', 'that', 'through', 'otherwise', 'would', 'serious', 'however', 'one', 'without', 'not', 'of', 'throughout', 'because', 'some', 'four', 'many', 'thus', 'more', 'along', 'she', 'whereafter', 'off', 'could', 'others', 'between', 'who', 'there', 'former', 'call', 'mostly', 'whence', 'with', 'once', 'had', 'detail', 'much', 'nothing', 'interest', 'in', 'ltd', 'wherein', 'cannot', 'he', 'and', 'while', 'yet', 'you', 'eight', 'anot

#### Slide 35 (n-grams)

In [5]:
corpus = [
    "you won     $1000000 dollars",
    "I love ISA 414 with all my heart",
    "Improve your love life now: buy Viagra",
]

# ngram_range defines the lower and upper boundary of the range of n-values
# for the n-grams to be extracted;
# below we extract uni-grams (1) and bi-grams (2)
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
tfidf_values = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) replaces it.
feature_names = vectorizer.get_feature_names_out()

# `df` holds the same matrix as `dtm`; it is kept in case later cells use it
df = pandas.DataFrame(tfidf_values.toarray(), columns=feature_names)

# creating a document-term matrix with the n-grams as column names
dtm = pandas.DataFrame(tfidf_values.toarray(), columns=feature_names)

# note how combinations of words, such as "414 heart", are part of the DTM
dtm



Unnamed: 0,1000000,1000000 dollars,414,414 heart,buy,buy viagra,dollars,heart,improve,improve love,isa,isa 414,life,life buy,love,love isa,love life,viagra,won,won 1000000
0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214
1,0.0,0.0,0.389888,0.389888,0.0,0.0,0.0,0.389888,0.0,0.0,0.389888,0.389888,0.0,0.0,0.29652,0.389888,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.341426,0.341426,0.0,0.0,0.341426,0.341426,0.0,0.0,0.341426,0.341426,0.259663,0.0,0.341426,0.341426,0.0,0.0
