In [2]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt


# Spark settings applied to the session builder below.
SPARK_CONF = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

# NOTE(review): the import cell also runs `from pyspark.shell import spark`,
# and this cell's output shows the PySpark shell banner. getOrCreate() therefore
# presumably returns that already-running session, in which case startup-only
# settings such as spark.driver.memory may not take effect -- confirm.
session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for conf_key, conf_value in SPARK_CONF.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()


22/09/01 18:27:45 WARN Utils: Your hostname, Kes-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.225.200 instead (on interface en0)
22/09/01 18:27:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/09/01 18:27:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/01 18:27:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/01 18:27:46 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/09/01 18:27:46 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.2
      /_/

Using Python version 3.9.7 (default, Sep 16 2021 08:50:36)
Spark context Web UI available at http://10.13.225.200:4043
Spark context available as 'sc' (master = local[*], app id = local-1662020866826).
SparkSession available as 'spark'.


In [10]:
# Load the merchants table and show how many distinct rows it contains.
MERCHANTS_PARQUET = "../data/tables/tbl_merchants.parquet"
sdf = spark.read.parquet(MERCHANTS_PARQUET)
# sdf.printSchema()  # uncomment to inspect the column names and types
sdf.distinct().count()


4026

In [13]:
# Convert to a pandas DataFrame for convenience. toPandas() collects the whole
# Spark DataFrame onto the driver, so this is only appropriate for small tables.
df = sdf.toPandas()


In [22]:
import re


def extract_tags(arr, category="tags"):
    """Pull one component out of a packed merchant ``tags`` string.

    The string is expected to hold three bracketed groups separated by
    commas and wrapped in one outer pair of brackets, for example
    ``"((shoe shops), (b), (take rate: 4.95))"``. Round and square brackets
    may be used, and may be mixed between groups.

    Parameters
    ----------
    arr : str
        The packed tags string.
    category : str, default "tags"
        Which component to return:

        * ``"take_rate"``     -- the first number found in the third group,
          returned as a string (e.g. ``"4.95"``);
        * ``"revenue_level"`` -- the second group, lower-cased;
        * anything else       -- the first group (the description), lower-cased.

    Returns
    -------
    str
        The requested component.

    Raises
    ------
    IndexError
        If the string has fewer groups than the requested component needs,
        or the third group contains no number.
    """
    # Drop the outer wrapper character on each side, then any bracket
    # characters still left at the two ends of the string.
    inner = arr[1:-1].strip("[()]")

    # A group boundary is a closing bracket, a comma, optional whitespace and
    # an opening bracket. Raw strings keep the backslashes as regex escapes
    # rather than (invalid) Python string escapes.
    parts = re.split(r"[)\]],\s*[(\[]", inner)

    if category == "take_rate":
        # Integer or decimal number, e.g. "6" or "6.94".
        return re.findall(r"\d+(?:\.\d+)?", parts[2])[0]

    if category == "revenue_level":
        return parts[1].lower()

    return parts[0].lower()


In [55]:
# Unpack the combined `tags` string into one column per component. The parser
# only needs the `tags` value, so it is mapped over that column directly.
df["tag"] = df["tags"].map(lambda packed: extract_tags(packed, "tags"))
df["revenue_lvl"] = df["tags"].map(
    lambda packed: extract_tags(packed, "revenue_level")
)
# NOTE(review): take_rate is stored as the string returned by extract_tags;
# presumably it needs casting to float before numeric use -- confirm downstream.
df["take_rate"] = df["tags"].map(lambda packed: extract_tags(packed, "take_rate"))

# Independent copy of the description column for the text preprocessing below.
tag_col = df["tag"].copy()

In [119]:
# preprocess the data, with the following pipeline
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('words')
from autocorrect import Speller

class Preprocessor:
    """Clean a single free-text tag into a normalised, space-joined string.

    Implemented as a class so the reference word list, the lemmatizer and the
    spelling corrector are initialised once and reused for every tag.

    API: :meth:`preprocess` takes one tag and returns the preprocessed tag.

    Pipeline: case standardisation -> punctuation and number removal ->
    tokenisation -> spelling correction -> removal of tokens with two or fewer
    characters -> re-joining with single spaces. Lemmatisation is currently
    disabled; the lemmatizer is still created so the step can be re-enabled.

    Parameters
    ----------
    correct_method : str, default "auto"
        Spelling-correction strategy:

        * ``"auto"``          -- ``autocorrect.Speller`` for English;
        * ``"jaccard"``       -- closest dictionary word by Jaccard distance
          over character bigrams (better for minor typos);
        * ``"edit_distance"`` -- closest dictionary word by edit distance.

        For the two dictionary-based methods only words sharing the first
        character of the input are considered.

    Raises
    ------
    ValueError
        If ``correct_method`` is not one of the values above.
    """

    _VALID_METHODS = ("auto", "jaccard", "edit_distance")
    # Tokens shorter than this are dropped at the end of the pipeline.
    _MIN_TOKEN_LENGTH = 3

    def __init__(self, correct_method="auto"):
        # Fail fast: an unknown method would otherwise only surface later as
        # an AttributeError on the missing `auto_corrector`.
        if correct_method not in self._VALID_METHODS:
            raise ValueError(
                f"Unknown correct_method {correct_method!r}; "
                f"expected one of {self._VALID_METHODS}"
            )

        self.lemmatizer = WordNetLemmatizer()
        self.correct_words = words.words()
        self.correct_method = correct_method

        # first character -> dictionary words starting with it (filled lazily)
        self._words_by_initial = {}
        # (method, word) -> corrected word, so repeated words are corrected once
        self._correction_cache = {}

        if correct_method == "auto":
            self.auto_corrector = Speller(lang='en')

    def _candidates_for(self, initial):
        """Return the dictionary words starting with ``initial``, in corpus order."""
        bucket = self._words_by_initial.get(initial)
        if bucket is None:
            bucket = [w for w in self.correct_words if w[:1] == initial]
            self._words_by_initial[initial] = bucket
        return bucket

    def _compute_correction(self, word):
        """Correct ``word`` with the configured method, without caching."""
        method = self.correct_method
        if method not in ("jaccard", "edit_distance"):
            return self.auto_corrector(word)

        # Only compare against dictionary words that match the first character.
        candidates = self._candidates_for(word[0])
        if not candidates:
            # Nothing to compare against: keep the word as it is.
            return word

        if method == "edit_distance":
            return min(candidates, key=lambda cand: edit_distance(word, cand))

        # Jaccard distance over character bigrams. A one-letter word has no
        # bigrams, and comparing it with a one-letter dictionary entry would
        # divide by zero, so it is returned unchanged.
        if len(word) < 2:
            return word
        word_bigrams = set(ngrams(word, 2))
        # min() keeps the first of equally-close candidates, like a stable sort.
        return min(
            candidates,
            key=lambda cand: jaccard_distance(word_bigrams, set(ngrams(cand, 2))),
        )

    # jaccard distance is better for minor typos
    def __correct_spelling__(self, word):
        """Return the spelling-corrected form of ``word``.

        Results are cached per (method, word). An empty string is returned
        unchanged.
        """
        if not word:
            return word
        cache_key = (self.correct_method, word)
        if cache_key not in self._correction_cache:
            self._correction_cache[cache_key] = self._compute_correction(word)
        return self._correction_cache[cache_key]

    # case standardization -> punctuation, number removal ->
    # tokenize -> spelling correction -> (lemmatization, disabled) -> minimum length
    def preprocess(self, tag):
        """Run the full cleaning pipeline on one tag and return the cleaned string."""
        # case standardization
        tag = tag.lower()
        # punctuation and number removal
        tag = re.sub(r'[^a-zA-Z\s]', '', tag)
        # tokenize
        tokens = word_tokenize(tag)

        # correct spelling
        tokens = [self.__correct_spelling__(token) for token in tokens]

        # lemmatization is intentionally disabled for now:
        # tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        # keep only tokens with at least three characters
        tokens = [token for token in tokens if len(token) >= self._MIN_TOKEN_LENGTH]

        # join the words back into a single string
        return " ".join(tokens)


[nltk_data] Downloading package words to /Users/ke/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [113]:
# Manual spot-check of the spelling corrector (uncomment to run):
# p = Preprocessor()
# p.__correct_spelling__("friuts")
# p.preprocess("appleee ,")


'fruits'

In [120]:
# Clean every tag with one shared Preprocessor instance.
# Each distinct tag is cleaned once and the result is mapped back onto the
# Series. This avoids repeating the spelling correction for duplicate tags and
# does not depend on the Series having a default 0..n-1 index.
preprocessor = Preprocessor()
cleaned_by_raw_tag = {
    raw_tag: preprocessor.preprocess(raw_tag) for raw_tag in tag_col.unique()
}
tag_col = tag_col.map(cleaned_by_raw_tag)
    

In [122]:
# Persist the preprocessed tags so later steps can reuse them without
# re-running the preprocessing.
TAG_COL_CSV = "../data/curated/tag_col_preprocessed.csv"
tag_col.to_csv(TAG_COL_CSV)

In [43]:
# Unigram TF-IDF over the preprocessed tags. English stop words are removed and
# terms appearing in more than half of the tags (max_df=0.5) are ignored.
unigram_tfidf_vectorizer = TfidfVectorizer(
    stop_words="english", max_df=0.5, min_df=1, ngram_range=(1, 1), max_features=100000
)
# fit_transform returns the document-term matrix; keep it so it does not have
# to be recomputed later.
unigram_tfidf_matrix = unigram_tfidf_vectorizer.fit_transform(tag_col)

tag_col


0       furniturehomefurnishingsandequipmentshopsandma...
1       cablesatelliteandotherpaytelevisionandradioser...
2                     jewelrywatchclockandsilverwareshops
3                         watchclockandjewelryrepairshops
4         musicshopsmusicalinstrumentspianosandsheetmusic
                              ...                        
4021                   opticiansopticalgoodsandeyeglasses
4022                        booksperiodicalsandnewspapers
4023                                            shoeshops
4024                      motorvehiclesuppliesandnewparts
4025                      motorvehiclesuppliesandnewparts
Name: tag, Length: 4026, dtype: object