In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Text") \
    .getOrCreate()

In [7]:
txt = spark.read.load('data/desc.csv',
                format='com.databricks.spark.csv',
                header='true',
                inferSchema='true')\
          .select('text')
txt.show()

+--------------------+
|                text|
+--------------------+
|Data (/ˈdeɪtə/ DA...|
|Big data is a ter...|
|Natural language ...|
|Text mining, also...|
+--------------------+



In [122]:
# !pip install langid 
# !pip install textblob
# !pip install nltk
import re
import string

import nltk
nltk.download("stopwords")
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
  
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag

from textblob import TextBlob

import langid

langid.classify('Big data is love')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


('en', -22.928163528442383)

In [94]:
# text preprocessing
def check_blanks(data_str):
    is_blank = str(data_str.isspace())
    return is_blank


def check_lang(data_str):
    predict_lang = langid.classify(data_str)
    language = predict_lang[0]
    return language

def remove_features(data_str):
    # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?')
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    num_re = re.compile('(\\d+)')
    mention_re = re.compile('@(\w+)')
    alpha_num_re = re.compile("^[a-z0-9_.]+$")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove @mentions
    data_str = mention_re.sub(' ', data_str)
    # remove puncuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    # remove non a-z 0-9 characters and words shorter than 3 characters
    list_pos = 0
    cleaned_str = ''
    for word in data_str.split():
        if list_pos == 0:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = word
            else:
                cleaned_str = ' '
        else:
            if alpha_num_re.match(word) and len(word) > 2:
                cleaned_str = cleaned_str + ' ' + word
            else:
                cleaned_str += ' '
        list_pos += 1
    return cleaned_str


def remove_stops(data_str):
    # expects a string
    stops = set(stopwords.words("english"))
    list_pos = 0
    cleaned_str = ''
    text = data_str.split()
    for word in text:
        if word not in stops:
            # rebuild cleaned_str
            if list_pos == 0:
                cleaned_str = word
            else:
                cleaned_str = cleaned_str + ' ' + word
            list_pos += 1
    return cleaned_str

def tag_and_remove(data_str):
    cleaned_str = ' '
    # noun tags
    nn_tags = ['NN', 'NNP', 'NNP', 'NNPS', 'NNS']
    # adjectives
    jj_tags = ['JJ', 'JJR', 'JJS']
    # verbs
    vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    nltk_tags = nn_tags + jj_tags + vb_tags

    # break string into 'words'
    text = data_str.split()

    # tag the text and keep only those with the right tags
    tagged_text = pos_tag(text)
    for tagged_word in tagged_text:
        if tagged_word[1] in nltk_tags:
            cleaned_str += tagged_word[0] + ' '

    return cleaned_str


def lemmatize(data_str):
    # expects a string
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer()
    text = data_str.split()
    tagged_words = pos_tag(text)
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0:
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1
    return cleaned_str

In [95]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


check_lang_udf = udf(check_lang, StringType())
remove_stops_udf = udf(remove_stops, StringType())
remove_features_udf = udf(remove_features, StringType())
tag_and_remove_udf = udf(tag_and_remove, StringType())
lemmatize_udf = udf(lemmatize, StringType())
check_blanks_udf = udf(check_blanks, StringType())

In [96]:
# определим язык
lang_df = txt.withColumn("lang", check_lang_udf(txt["text"]))
en_df = lang_df.filter(lang_df["lang"] == "en")
en_df.show(5)

+--------------------+----+
|                text|lang|
+--------------------+----+
|Data (/ˈdeɪtə/ DA...|  en|
|Big data is a ter...|  en|
|Natural language ...|  en|
|Text mining, also...|  en|
+--------------------+----+



In [97]:
# удалим стоп слова
rm_stops_df = en_df.select('text')\
                   .withColumn("stop_text", remove_stops_udf(en_df["text"]))
rm_stops_df.show(5)

+--------------------+--------------------+
|                text|           stop_text|
+--------------------+--------------------+
|Data (/ˈdeɪtə/ DA...|Data (/ˈdeɪtə/ DA...|
|Big data is a ter...|Big data term dat...|
|Natural language ...|Natural language ...|
|Text mining, also...|Text mining, also...|
+--------------------+--------------------+



In [98]:
rm_features_df = rm_stops_df.select(['text']+["stop_text"])\
                            .withColumn("feat_text", \
                            remove_features_udf(rm_stops_df["stop_text"]))
rm_features_df.show(5)

+--------------------+--------------------+--------------------+
|                text|           stop_text|           feat_text|
+--------------------+--------------------+--------------------+
|Data (/ˈdeɪtə/ DA...|Data (/ˈdeɪtə/ DA...|data  day      da...|
|Big data is a ter...|Big data term dat...|big data term dat...|
|Natural language ...|Natural language ...|natural language ...|
|Text mining, also...|Text mining, also...|text mining also ...|
+--------------------+--------------------+--------------------+



In [99]:
tagged_df = rm_features_df.select(['text']+["feat_text"]) \
                          .withColumn("tagged_text", \
                           tag_and_remove_udf(rm_features_df.feat_text))

tagged_df.show(5)

+--------------------+--------------------+--------------------+
|                text|           feat_text|         tagged_text|
+--------------------+--------------------+--------------------+
|Data (/ˈdeɪtə/ DA...|data  day      da...| data day dah set...|
|Big data is a ter...|big data term dat...| big data term da...|
|Natural language ...|natural language ...| natural language...|
|Text mining, also...|text mining also ...| text mining refe...|
+--------------------+--------------------+--------------------+



In [100]:
lemm_df = tagged_df.select(['text']+["tagged_text"]) \
                   .withColumn("lemm_text", lemmatize_udf(tagged_df["tagged_text"]))
lemm_df.show(5)

+--------------------+--------------------+--------------------+
|                text|         tagged_text|           lemm_text|
+--------------------+--------------------+--------------------+
|Data (/ˈdeɪtə/ DA...| data day dah set...|data day dah set ...|
|Big data is a ter...| big data term da...|big data term dat...|
|Natural language ...| natural language...|natural language ...|
|Text mining, also...| text mining refe...|text mining refer...|
+--------------------+--------------------+--------------------+



In [107]:
# check_blanks_df = lemm_df.select(['text']+["lemm_text"])\
#                          .withColumn("is_blank", check_blanks_udf(lemm_df["lemm_text"]))
# # remove blanks
# no_blanks_df = check_blanks_df.filter(check_blanks_df["is_blank"] == "False")

# # drop duplicates
# dedup_df = no_blanks_df.dropDuplicates('text')

# dedup_df.show(4)

In [111]:
from pyspark.sql.functions import monotonically_increasing_id
# Create Unique ID
dedup_df = lemm_df.withColumn("uid", monotonically_increasing_id())
dedup_df.show(5)

+--------------------+--------------------+--------------------+---+
|                text|         tagged_text|           lemm_text|uid|
+--------------------+--------------------+--------------------+---+
|Data (/ˈdeɪtə/ DA...| data day dah set...|data day dah set ...|  0|
|Big data is a ter...| big data term da...|big data term dat...|  1|
|Natural language ...| natural language...|natural language ...|  2|
|Text mining, also...| text mining refe...|text mining refer...|  3|
+--------------------+--------------------+--------------------+---+



In [112]:
data = dedup_df.select('uid','text','lemm_text')
data.show(5)

+---+--------------------+--------------------+
|uid|                text|           lemm_text|
+---+--------------------+--------------------+
|  0|Data (/ˈdeɪtə/ DA...|data day dah set ...|
|  1|Big data is a ter...|big data term dat...|
|  2|Natural language ...|natural language ...|
|  3|Text mining, also...|text mining refer...|
+---+--------------------+--------------------+



In [None]:
## Sentimental

In [118]:
# remove non ASCII characters
def strip_non_ascii(data_str):
    ''' Returns the string without non ASCII characters'''
    stripped = (c for c in data_str if 0 < ord(c) < 127)
    return ''.join(stripped)
# setup pyspark udf function
strip_non_ascii_udf = udf(strip_non_ascii, StringType())

In [119]:
df = data.withColumn('text_non_asci',strip_non_ascii_udf(data['lemm_text']))
df.show(5,True)

+---+--------------------+--------------------+--------------------+
|uid|                text|           lemm_text|       text_non_asci|
+---+--------------------+--------------------+--------------------+
|  0|Data (/ˈdeɪtə/ DA...|data day dah set ...|data day dah set ...|
|  1|Big data is a ter...|big data term dat...|big data term dat...|
|  2|Natural language ...|natural language ...|natural language ...|
|  3|Text mining, also...|text mining refer...|text mining refer...|
+---+--------------------+--------------------+--------------------+



In [124]:
from pyspark.sql.types import FloatType

def sentiment_analysis(text):
    return TextBlob(text).sentiment.polarity

sentiment_analysis_udf = udf(sentiment_analysis , FloatType())

In [125]:
df  = df.withColumn("sentiment_score", sentiment_analysis_udf( df['text_non_asci'] ))
df.show(5,True)

+---+--------------------+--------------------+--------------------+---------------+
|uid|                text|           lemm_text|       text_non_asci|sentiment_score|
+---+--------------------+--------------------+--------------------+---------------+
|  0|Data (/ˈdeɪtə/ DA...|data day dah set ...|data day dah set ...|            0.3|
|  1|Big data is a ter...|big data term dat...|big data term dat...|   -0.017142856|
|  2|Natural language ...|natural language ...|natural language ...|    0.057359308|
|  3|Text mining, also...|text mining refer...|text mining refer...|    0.013416667|
+---+--------------------+--------------------+--------------------+---------------+



In [127]:
def condition(r):
    if (r >=0.1):
        label = "positive"
    elif(r <= -0.1):
        label = "negative"
    else:
        label = "neutral"
    return label

sentiment_udf = udf(lambda x: condition(x), StringType())

In [128]:
df  = df.withColumn("sentiment_type", sentiment_udf( df['sentiment_score'] ))
df.show(5,True)

+---+--------------------+--------------------+--------------------+---------------+--------------+
|uid|                text|           lemm_text|       text_non_asci|sentiment_score|sentiment_type|
+---+--------------------+--------------------+--------------------+---------------+--------------+
|  0|Data (/ˈdeɪtə/ DA...|data day dah set ...|data day dah set ...|            0.3|      positive|
|  1|Big data is a ter...|big data term dat...|big data term dat...|   -0.017142856|       neutral|
|  2|Natural language ...|natural language ...|natural language ...|    0.057359308|       neutral|
|  3|Text mining, also...|text mining refer...|text mining refer...|    0.013416667|       neutral|
+---+--------------------+--------------------+--------------------+---------------+--------------+



###

- если есть Label

In [114]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilderx
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.feature import CountVectorizer

In [115]:
# Split
(trainingData, testData) = data.randomSplit([0.6, 0.4])

In [117]:
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="lemm_text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")

# Naive Bayes model
nb = NaiveBayes()

# Pipeline Architecture
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

IllegalArgumentException: label does not exist. Available: uid, text, lemm_text, words, rawFeatures, features