In [1]:
import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD
from urllib.parse import unquote
sc

In [2]:
# Creating RDDs
def load_queries(path):
    """Read one query per line from `path`, URL-decode it and drop duplicate lines."""
    return sc.textFile(path).map(unquote).distinct()


good = load_queries("goodqueries.txt")
bad = load_queries("badqueries.txt")

In [3]:
# Total number of distinct queries across both classes (shown as the cell output).
print('Amount of entries')
sum(rdd.count() for rdd in (good, bad))

Amount of entries


1310506

# Preprocess Data
## Feature extraction
Models can't evaluate strings, so we turn each query into a numerical vector using the HashingTF provided by Spark. Whole words are not very informative in HTTP queries, so we first split each query into character bigrams and treat each bigram as a word. This approach expands the word bucket and the feature space. An example of bigrams is given below.


In [4]:
def to_ngram(payload_obj, n=1):
    """Split the string form of `payload_obj` into overlapping character n-grams.

    Args:
        payload_obj: Any object; it is converted with ``str()`` before slicing.
        n: Width of each n-gram in characters. Must be at least 1.

    Returns:
        A list of every length-``n`` substring, in order of position. The list
        is empty when the string is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (such a width would otherwise
            produce empty or wrong-width slices instead of n-grams).
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    payload = str(payload_obj)
    # The last valid start index is len(payload) - n, hence the +1 in range().
    return [payload[start:start + n] for start in range(len(payload) - n + 1)]

# Demonstrate the bigram split on a short, attack-like token.
print('EXAMPLE: bigram of the word <script>:')
to_ngram("<script>", n=2)

EXAMPLE: bigram of the word <script>:


['<s', 'sc', 'cr', 'ri', 'ip', 'pt', 't>']

In [5]:
n = 2  # n-gram width: 2 = bigram

# Map every query string to its list of character n-grams.
# NOTE(review): no later cell visible in this notebook reads good_ngrams or
# bad_ngrams; the TF-IDF cell passes `good`/`bad` to HashingTF directly.
# Confirm which of the two inputs is intended.
good_ngrams = good.map(lambda q: to_ngram(q, n))
bad_ngrams = bad.map(lambda q: to_ngram(q, n))

###### Hashing each query into a 2,000-word bucket.
As you can see, each query is turned into a sparse vector holding bucket numbers and occurrences. This vector will be used as the feature vector input for our models.
###### PySpark MLlib models need LabeledPoint as input
So, the next step is to label our features: 1 for a bad query, 0 for a good query. The result is a collection of labeled samples which are ready for use.


In [6]:
# Number of hash buckets, i.e. the dimensionality of every feature vector.
numFeatures = 2000

# The variable name `hahsingTF` (sic) is kept as-is because later cells refer to it.
hahsingTF = HashingTF(numFeatures)

In [8]:
# Term-frequency vectors per class. cache() returns the RDD itself, so it can
# be chained; both RDDs are reused below (IDF fit and IDF transform).
# NOTE(review): `good`/`bad` hold raw query strings here, not n-gram lists.
# HashingTF treats each document as an iterable of terms, so a plain string is
# hashed character by character. The ad-hoc prediction cells further down also
# pass raw strings, so switch both together if bigram features are wanted.
good_tf = hahsingTF.transform(good).cache()
bad_tf = hahsingTF.transform(bad).cache()

# Fit inverse document frequencies over the union of both classes.
idf = IDF().fit(good_tf.union(bad_tf))

# Attach labels: 0.0 = good query, 1.0 = bad query.
good_tfidf = idf.transform(good_tf).map(lambda vec: LabeledPoint(0.0, vec))
bad_tfidf = idf.transform(bad_tf).map(lambda vec: LabeledPoint(1.0, vec))

# Show one labelled sample as a sanity check.
example_input = good_tfidf.take(1)[0]
print(f'feature vector: ({example_input.features},{example_input.label})')


feature vector: ((2000,[104,136,201,744,1289,1511,1556,1683],[0.7005679684456778,1.1262072621936845,1.2943791186220295,0.6910471473539014,1.2513543383584065,0.014944590529120573,0.868974984310208,0.9827909772435368]),0.0)


In [9]:
# 80/20 train/test split over the labelled samples of both classes.
# A fixed seed is passed so the split is not re-drawn at random on every run,
# which would otherwise make the reported accuracy change between runs.
training_data, test_data = good_tfidf.union(bad_tfidf).randomSplit([0.8, 0.2], seed=42)

# Both splits are read more than once (training, prediction, counting).
training_data.cache()
test_data.cache()


def evaluate_accuracy(model, data=None):
    """Return the fraction of labelled samples that `model` predicts correctly.

    Args:
        model: Object whose ``predict`` method takes an RDD of feature vectors
            and returns an RDD of predicted labels.
        data: RDD of samples exposing ``.label`` and ``.features``. Defaults to
            the notebook-level ``test_data`` when omitted, which is what the
            original single-argument call evaluated against.

    Returns:
        Accuracy as a float: matching predictions divided by the sample count.

    Raises:
        ZeroDivisionError: If ``data`` contains no samples.
    """
    if data is None:
        data = test_data
    predictions = model.predict(data.map(lambda point: point.features))
    # Pair each true label with the prediction made for the same sample.
    labels_and_preds = data.map(lambda point: point.label).zip(predictions)
    correct = labels_and_preds.filter(lambda pair: pair[0] == pair[1]).count()
    return correct / data.count()



In [10]:
from pyspark.mllib.classification import NaiveBayes

# Train a Naive Bayes classifier on the training split. lambda_ is the
# smoothing parameter; 1.0 is the library default, written out explicitly.
model = NaiveBayes.train(training_data, lambda_=1.0)

# Accuracy on the held-out split (shown as the cell output).
evaluate_accuracy(model)


0.9875050505820557

In [11]:
# Score one ad-hoc query: hash it, apply the fitted IDF weights, then classify.
query_tf = hahsingTF.transform('hi')
model.predict(idf.transform(query_tf))

0.0

In [12]:
# Score one ad-hoc query: hash it, apply the fitted IDF weights, then classify.
query_tf = hahsingTF.transform('ricky')
model.predict(idf.transform(query_tf))

0.0

In [13]:
# Score one ad-hoc query: hash it, apply the fitted IDF weights, then classify.
query_tf = hahsingTF.transform('yuval motek')
model.predict(idf.transform(query_tf))

0.0

In [14]:
# Score an injection-style query: hash it, apply the fitted IDF weights, then classify.
query_tf = hahsingTF.transform('stuff=\'uname >q36497765 #')
model.predict(idf.transform(query_tf))

1.0

In [None]:
# from pyspark.mllib.feature import HashingTF, IDF
# from pyspark.mllib.regression import LabeledPoint
# from pyspark.mllib.classification import NaiveBayes   

# training_raw = sc.parallelize([
#     {"text": "foo foo foo bar bar protein", "label": 1.0},
#     {"text": "foo bar dna for bar", "label": 0.0},
#     {"text": "foo bar foo dna foo", "label": 0.0},
#     {"text": "bar foo protein foo ", "label": 1.0}])


# # Split data into labels and features, transform
# # preservesPartitioning is not really required
# # since map without partitioner shouldn't trigger repartitioning
# labels = training_raw.map(
#     lambda doc: doc["label"],  # Standard Python dict access 
#     preservesPartitioning=True # This is obsolete.
# )

# tf = HashingTF(numFeatures=100).transform( ## Use much larger number in practice
#     training_raw.map(lambda doc: doc["text"].split(), 
#     preservesPartitioning=True))

# idf = IDF().fit(tf)
# tfidf = idf.transform(tf)

# # Combine using zip
# training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

# # Train and check
# model = NaiveBayes.train(training)
# labels_and_preds = labels.zip(model.predict(tfidf)).map(
#     lambda x: {"actual": x[0], "predicted": float(x[1])})


# from pyspark.mllib.evaluation import MulticlassMetrics
# from operator import itemgetter

# metrics = MulticlassMetrics(
#     labels_and_preds.map(itemgetter("actual", "predicted")))

# metrics.confusionMatrix().toArray()
# ## array([[ 2.,  0.],
# ##        [ 0.,  2.]])

In [None]:
# Leftover interactive help lookup (IPython `?` syntax): it only displays the
# LabeledPoint documentation and does not feed into any other cell.
LabeledPoint?