Bag of Words

In [1]:
import os
import numpy
import pandas
import tqdm
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import unidecode
import contractions
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from collections import Counter
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [2]:
# Create (or reuse) the Spark session that runs this notebook's Spark jobs.
spark = (
    SparkSession.builder.appName("Project 2")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory', '10g')
    .config('spark.driver.memory', '12g')
    # The option is spelled `maxResultSize` (no extra "s"); the previous key
    # `spark.driver.maxResultsSize` is not a Spark option, so the limit was
    # never applied. Sizes use Spark's short suffixes ("10g"), not "10GiB".
    .config('spark.driver.maxResultSize', '10g')
    .getOrCreate()
)

22/10/10 22:44:46 WARN Utils: Your hostname, Runyus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.3.12 instead (on interface en0)
22/10/10 22:44:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/10 22:44:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/10 22:44:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/10 22:44:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load the curated merchants table through the project helper `read_file`
# (defined in ../scripts/read_utils). Per the cell output below, the helper
# prints loading progress and shows the first record.
sdf = read_file(spark, 'merchants_data.parquet', '../data/curated/')

|> Loading File...


                                                                                

|> Loading Finished!
-RECORD 0-----------------------------------------------------------------------------------------
 merchant_name | Felis Limited                                                                    
 tags          | furniture, home furnishings and equipment shops, and manufacturers, except ap... 
 merchant_abn  | 10023283211                                                                      
 take_rate     | 0.18                                                                             
 type          | e                                                                                
only showing top 1 row



In [4]:
# clean string data

# Convert accented characters to their plain equivalents; returns a string
def accented_char(text):
    """Return ``text`` after passing it through ``unidecode.unidecode``."""
    return unidecode.unidecode(text)

# Expand contractions; returns a string
def expand_contrac(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

# Remove numbers and symbols; returns a string
def replace_num_and_symbol(text):
    """Lower-case ``text`` and strip symbols, digits and digit-bearing tokens.

    Steps, in order:
      1. lower-case the input;
      2. replace every run of non-word characters or underscores with a
         single space;
      3. delete digit runs immediately followed by letters (e.g. ``"3d"``);
      4. delete letter runs immediately followed by digits (e.g. ``"abc1"``);
      5. delete any remaining digit runs.

    Whitespace left behind by deleted tokens is NOT collapsed, so the result
    may contain consecutive or trailing spaces.

    Patterns are raw strings so that ``\\W`` / ``\\d`` reach the regex engine
    without Python treating them as (invalid) string escapes.
    """
    lowered = text.lower()
    spaced = re.sub(r'[\W_]+', ' ', lowered)
    stripped = re.sub(r'\d+[a-z]+', '', spaced)
    stripped = re.sub(r'[a-z]+\d+', '', stripped)
    return re.sub(r'\d+', '', stripped)

# Remove stopwords; returns a list of tokens
def remove_stop_words(text):
    """Tokenise ``text`` and drop English stop words.

    Args:
        text: the string to tokenise with ``nltk.word_tokenize``.

    Returns:
        list: the tokens, in their original order, that are not in NLTK's
        English stop-word list. Note this is a token list, not a re-joined
        string.
    """
    text_tokens = word_tokenize(text)
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    all_stopwords = set(stopwords.words('english'))
    return [word for word in text_tokens if word not in all_stopwords]

# Lemmatisation
def get_wordnet_pos(tag):
    """Map a POS tag to the matching ``wordnet`` part-of-speech constant.

    The tag is matched on its leading letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` ->
    ``wordnet.ADV``. Any other tag yields ``None``.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolve the constant only on a match, so `wordnet` is not
            # touched for unmatched tags.
            return getattr(wordnet, attr_name)
    return None
    
    
def word_lemmatizer(data_list):
    """Lemmatise a list of tokens using their POS tags.

    Each token in ``data_list`` is tagged with ``nltk.pos_tag``; the tag is
    translated by ``get_wordnet_pos`` and, when no mapping exists, the token
    is lemmatised as a noun.

    Returns:
        list: one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_label) or wordnet.NOUN)
        for token, pos_label in pos_tag(data_list)
    ]

def clean_string(text):
    """Run the full cleaning pipeline on ``text`` and return one string.

    Order of operations: ``accented_char`` -> ``expand_contrac`` ->
    ``replace_num_and_symbol`` -> ``remove_stop_words`` ->
    ``word_lemmatizer``; the resulting lemmas are joined with single spaces.
    """
    ascii_text = accented_char(text)
    expanded_text = expand_contrac(ascii_text)
    letters_only = replace_num_and_symbol(expanded_text)
    kept_tokens = remove_stop_words(letters_only)
    lemmas = word_lemmatizer(kept_tokens)
    return ' '.join(lemmas)

In [5]:
# Pull the merchant names to the driver as a pandas DataFrame.
# This rebinds `sdf` from a Spark DataFrame to a pandas one, so the guard
# keeps the cell re-runnable: a pandas frame has no `.select`, and a second
# execution would otherwise fail.
if not isinstance(sdf, pandas.DataFrame):
    sdf = sdf.select('merchant_name').toPandas()

In [6]:
# Preview the merchant names (the display is truncated for long frames).
sdf

Unnamed: 0,merchant_name
0,Felis Limited
1,Arcu Ac Orci Corporation
2,Nunc Sed Company
3,Ultricies Dignissim Lacus Foundation
4,Enim Condimentum PC
...,...
4021,Elit Dictum Eu Ltd
4022,Mollis LLP
4023,Sociosqu Corp.
4024,Commodo Hendrerit LLC


In [7]:
# Count how often each cleaned word appears across all merchant names.
text_info = sdf['merchant_name'].tolist()
# Clean every name, tokenise the cleaned string again, and flatten the tokens
# of all names into one list.
word_list = [
    token
    for merchant_name in text_info
    for token in word_tokenize(clean_string(merchant_name))
]
word_counter = Counter(word_list)

In [8]:
# (word, count) pairs ordered from most to least frequent.
new_counter = word_counter.most_common()

In [9]:
# Relative directory for plots; not referenced by the cells visible here.
PLOT_PATH = '../plots/'

In [10]:
import pyecharts.options as opts
from pyecharts.charts import WordCloud

# Adapted from the pyecharts gallery example (the gallery uses pyecharts 1.1.0).
# Reference: https://gallery.echartsjs.com/editor.html?c=xS1jMxuOVm
# Features that currently cannot be implemented: none.
#
# Word cloud of the cleaned merchant-name words; `new_counter` supplies the
# (word, count) pairs and word size ranges between 6 and 66.
c = (
    WordCloud()
    .add(series_name="merchants_name_word", data_pair=new_counter, word_size_range=[6, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Analysis_of_merchants_name_words", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
)
c.render_notebook()