In [1]:
import glob
import os
import sys

# Switch the process working directory to the project folder.
os.chdir("C:/dataanalytics/python")

# Configure the environment: point SPARK_HOME at the Spark installation
# directory unless the environment already provides a value.
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'

# Root of the Spark installation, reused for every path below.
SPARK_HOME = os.environ['SPARK_HOME']

# Make the Python sources bundled with Spark importable. Each entry is pushed
# to the front of sys.path, so the last one inserted is searched first.
spark_python = os.path.join(SPARK_HOME, "python")
spark_python_lib = os.path.join(spark_python, "lib")
sys.path.insert(0, spark_python)
sys.path.insert(0, spark_python_lib)

# The py4j archive name embeds its version (e.g. py4j-0.10.4-src.zip) and
# changes between Spark releases, so discover it instead of hard-coding one.
# sorted() keeps the insertion order deterministic if several are present.
for py4j_zip in sorted(glob.glob(os.path.join(spark_python_lib, "py4j-*-src.zip"))):
    sys.path.insert(0, py4j_zip)
sys.path.insert(0, os.path.join(spark_python_lib, "pyspark.zip"))

# These imports only resolve after the sys.path changes above.
from pyspark import SparkContext
from pyspark import SparkConf

# Optional Spark configuration: local master, app name, executor memory.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("revisionapp")
    .set("spark.executor.memory", "1g")
)

# Initialize the Spark context. getOrCreate() returns the already-running
# context when this cell is executed again, instead of raising because a
# second SparkContext cannot be started in the same process.
sc = SparkContext.getOrCreate(conf=conf)


In [2]:
from pyspark.mllib.feature import HashingTF, IDF

In [5]:
# Load the sample file from the Spark installation and split every line on
# single spaces; each line becomes one "document" (a list of tokens).
# The path is derived from SPARK_HOME so it follows the same installation
# directory that the setup cell configured, rather than assuming C:\spark.
documents = sc.textFile(
    os.path.join(SPARK_HOME, "data", "mllib", "kmeans_data.txt")
).map(lambda line: line.split(" "))

In [11]:
# Hash every token into a fixed-size sparse term-frequency vector.
# 1 << 20 (1048576) is the HashingTF default, spelled out here because it is
# the vector size that shows up in the printed results.
hashingTF = HashingTF(numFeatures=1 << 20)
tf = hashingTF.transform(documents)

In [12]:
# HashingTF needs only a single pass over the data, but IDF needs two:
# one to compute the IDF vector and one to scale the term frequencies by it.
# Cache the term frequencies so both passes reuse them.
tf.cache()

# minDocFreq=0 is the IDF default: no term is filtered out.
idf = IDF(minDocFreq=0).fit(tf)
tfidf = idf.transform(tf)

In [13]:
# spark.mllib's IDF can ignore terms that occur in fewer than a minimum
# number of documents: the IDF of such terms is set to 0. The threshold is
# passed to the IDF constructor as minDocFreq.
MIN_DOC_FREQ = 2
idfignore = IDF(minDocFreq=MIN_DOC_FREQ).fit(tf)
tfidfIgnore = idfignore.transform(tf)


In [14]:
# Print the TF-IDF vectors computed without a minDocFreq threshold,
# one document per line.
print("tfidf:")
for vector in tfidf.collect():
    print(vector)

tfidf:
(1048576,[28392],[3.758288905486104])
(1048576,[929338],[3.758288905486104])
(1048576,[755156],[3.758288905486104])
(1048576,[93645],[3.758288905486104])
(1048576,[964576],[3.758288905486104])
(1048576,[571790],[3.758288905486104])


In [15]:
# Print the TF-IDF vectors computed with the minDocFreq threshold applied,
# one document per line.
print("tfidfIgnore:")
for vector in tfidfIgnore.collect():
    print(vector)

tfidfIgnore:
(1048576,[28392],[0.0])
(1048576,[929338],[0.0])
(1048576,[755156],[0.0])
(1048576,[93645],[0.0])
(1048576,[964576],[0.0])
(1048576,[571790],[0.0])
