# Coursework 1: Question 2 - Vodacom Dataset

## Import Libraries

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import logging
import re

## Source Data

Get the source data to use for clustering.

### Data Load

Load the customer reviews that were previously downloaded from Hellopeter and stored in the [DSM020](https://github.com/JohnnyFoulds/dsm020-2021-oct) git repository.

In [2]:
# read the previously scraped telecommunications reviews directly from GitHub
source_url = 'https://github.com/JohnnyFoulds/dsm020-2021-oct/blob/master/coursework_01/data/output/telecommunications.gzip?raw=true'
df_source = pd.read_parquet(source_url)

# show (rows, columns) as the cell output
df_source.shape

(35072, 17)

In [3]:
# preview the first two source rows to confirm the schema
df_source.head(n=2)

Unnamed: 0,id,created_at,author,author_id,review_title,review_rating,review_content,business_slug,permalink,replied,messages,industry_slug,nps_rating,author_created_date,author_total_reviews_count,review_title_clean,review_content_clean
0,3750417,2021-12-21 15:27:08,Barry S,05c6e290-6186-11ec-b1c0-cd74559df45d,Worst Service,1,Worst service I’ve received in my life. Non of...,vodacom,worst-service-04099ff193227e2908f31413b00ffb30...,False,[],telecommunications,,2021-12-20,1,Worst Service,Worst service I’ve received in my life. Non of...
1,3750406,2021-12-21 15:19:45,Kulani Marry-Aan,0a616360-6260-11ec-b18f-0f0735b462a0,Complaint about customer service,1,18/12/2021 i bought a router at Vodacom Bushbu...,vodacom,complaint-about-customer-service-28c9107344f3a...,False,[],telecommunications,,2021-12-21,1,Complaint about customer service,18/12/2021 i bought a router at Vodacom Bushbu...


### Data Selection

Select only the reviews for `vodacom`, keeping the `id` (used for the output filenames) together with `review_title` and `review_content_clean`, which are the only text fields of interest.

In [4]:
# keep only the Vodacom rows and the columns needed to build the corpus
columns_of_interest = ['id', 'review_title', 'review_content_clean']
is_vodacom = df_source['business_slug'] == 'vodacom'
df_reviews = df_source.loc[is_vodacom, columns_of_interest].reset_index(drop=True)

# show (rows, columns) as the cell output
df_reviews.shape

(13699, 3)

In [5]:
# preview the first three selected reviews
df_reviews.head(n=3)

Unnamed: 0,id,review_title,review_content_clean
0,3750417,Worst Service,Worst service I’ve received in my life. Non of...
1,3750406,Complaint about customer service,18/12/2021 i bought a router at Vodacom Bushbu...
2,3750373,VODACOM not living up to promises and just wan...,I need help with Vodacom. I have been a custo...


### Data Preparation

Create an individual text file for each review for upload to HDFS.

In [6]:
def create_files(data:pd.DataFrame, output_path:str, id_column:str='id', title_column:str='review_title', content_column:str='review_content_clean') -> None:
    """
    Create an output .txt file for each review in the dataset.

    Each file is named `<id>.txt` and contains the review title, a blank
    line, and then the review text. Files are always written as UTF-8 so
    that non-ASCII characters in the reviews are stored the same way on
    every platform.

    Parameters
    ----------
    data : pandas.DataFrame
        The DataFrame containing the reviews to process.

    output_path : str
        The path where the .txt files should be stored. The directory is
        created (including parents) when it does not exist yet.

    id_column : str
        The name of the ID column that will be used for the filename.

    title_column : str
        The name of the column in the DataFrame containing the review title.

    content_column : str
        The name of the column in the DataFrame containing the review text.
    """
    # imported locally so the function does not depend on the notebook's import cell
    import os

    # make sure the target directory exists before any file is opened
    os.makedirs(output_path, exist_ok=True)

    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        # get the output filename
        filename = '%s/%s.txt' % (output_path, row[id_column])

        # get the text to write to file
        output_text = '%s\n\n%s' % (row[title_column], row[content_column])

        # create the output file; explicit UTF-8 instead of the platform default
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(output_text)

# draw a reproducible (seeded) 60% sample of the reviews
sample_size = 0.6
df_sample = df_reviews.sample(frac=sample_size, random_state=3231)

# write one text file per sampled review
create_files(data=df_sample, output_path='data/input/vodacom')

  0%|          | 0/8219 [00:00<?, ?it/s]

## Hadoop Processing

### Upload Data Files to HDFS

In [7]:
%%bash
# change to the coursework directory; stop if it is missing so the relative
# paths below are never resolved against the wrong working directory
cd ~/code/dsm010-2021-oct/coursework_01/ || exit 1

# delete existing files (-f: do not report an error when the directory
# does not exist yet, e.g. on the very first run)
hadoop fs -rm -r -f dsm010/vodacom-corpus

# copy the input documents
hadoop fs -copyFromLocal data/input/vodacom dsm010/vodacom-corpus

# verify the file uploads
hadoop fs -ls dsm010/vodacom-corpus | head -n 5

Deleted dsm010/vodacom-corpus
Found 8216 items
-rw-r--r--   3 jfoul001 users        496 2022-01-01 20:48 dsm010/vodacom-corpus/3337933.txt
-rw-r--r--   3 jfoul001 users        269 2022-01-01 20:48 dsm010/vodacom-corpus/3337940.txt
-rw-r--r--   3 jfoul001 users        268 2022-01-01 20:48 dsm010/vodacom-corpus/3337941.txt
-rw-r--r--   3 jfoul001 users        578 2022-01-01 20:47 dsm010/vodacom-corpus/3337954.txt


### Convert the dataset to SequenceFiles

SequenceFiles are flat files consisting of binary key/value pairs. Each document is represented as a key-value pair, where the key is the document id and the value is its content.

In [8]:
%%bash
# pack the uploaded text files into SequenceFiles (UTF-8 input, chunk size 5),
# replacing any output left over from a previous run
mahout seqdirectory \
    --input dsm010/vodacom-corpus \
    --output dsm010/vodacom-corpus-seqfiles \
    --overwrite \
    --charset UTF-8 \
    --chunkSize 5

MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt/hadoop/current/bin/hadoop and HADOOP_CONF_DIR=/opt/hadoop/current/etc/hadoop
MAHOUT-JOB: /opt/mahout/current/mahout-examples-0.13.0-job.jar


22/01/01 20:49:07 INFO AbstractJob: Command line arguments: {--charset=[UTF-8], --chunkSize=[5], --endPhase=[2147483647], --fileFilterClass=[org.apache.mahout.text.PrefixAdditionFilter], --input=[dsm010/vodacom-corpus], --keyPrefix=[], --method=[mapreduce], --output=[dsm010/vodacom-corpus-seqfiles], --overwrite=null, --startPhase=[0], --tempDir=[temp]}
22/01/01 20:49:08 INFO HadoopUtil: Deleting dsm010/vodacom-corpus-seqfiles
22/01/01 20:49:08 INFO deprecation: mapred.input.dir is deprecated. Instead, use mapreduce.input.fileinputformat.inputdir
22/01/01 20:49:08 INFO deprecation: mapred.compress.map.output is deprecated. Instead, use mapreduce.map.output.compress
22/01/01 20:49:08 INFO deprecation: mapred.output.dir is deprecated. Instead, use mapreduce.output.fileoutputformat.outputdir
22/01/01 20:49:08 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at lena-master/128.86.245.64:8032
22/01/01 20:49:08 INFO JobResourceUploader: Disabling Erasure Coding for path:

### Convert sequenceFiles to sparse vector files

In [9]:
%%bash
# turn the SequenceFiles into sparse named vectors, replacing any previous output
mahout seq2sparse \
    --namedVector \
    --input dsm010/vodacom-corpus-seqfiles \
    --output dsm010/vodacom-corpus-vectors \
    --overwrite

MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt/hadoop/current/bin/hadoop and HADOOP_CONF_DIR=/opt/hadoop/current/etc/hadoop
MAHOUT-JOB: /opt/mahout/current/mahout-examples-0.13.0-job.jar


22/01/01 20:49:44 INFO SparseVectorsFromSequenceFiles: Maximum n-gram size is: 1
22/01/01 20:49:44 INFO HadoopUtil: Deleting dsm010/vodacom-corpus-vectors
22/01/01 20:49:44 INFO SparseVectorsFromSequenceFiles: Minimum LLR value: 1.0
22/01/01 20:49:44 INFO SparseVectorsFromSequenceFiles: Number of reduce tasks: 1
22/01/01 20:49:44 INFO SparseVectorsFromSequenceFiles: Tokenizing documents in dsm010/vodacom-corpus-seqfiles
22/01/01 20:49:44 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at lena-master/128.86.245.64:8032
22/01/01 20:49:45 INFO JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/jfoul001/.staging/job_1626049283275_242076
22/01/01 20:49:45 INFO FileInputFormat: Total input files to process : 2
22/01/01 20:49:45 INFO JobSubmitter: number of splits:2
22/01/01 20:49:45 INFO JobSubmitter: Submitting tokens for job: job_1626049283275_242076
22/01/01 20:49:45 INFO JobSubmitter: Executing with tokens: []
22/01/01 20:49:45 INFO Co

### Perform Clustering

Perform clustering and output data to perform hyperparameter optimization by trying various distance measures and values of K.

In [10]:
%%bash
# Sweep k-means over several distance measures and a range of k, and dump each
# clustering to a local text file for the analysis section.

# range to use for k
k_start=2
k_end=20

# the path to the vectors and dictionary
# (the dictionary glob is quoted wherever it is used so that it reaches Mahout
# unchanged instead of being expanded against the local file system)
path_vectors=dsm010/vodacom-corpus-vectors/tf-vectors
path_dictionary='dsm010/vodacom-corpus-vectors/dictionary.file-*'

# the output base path for the clusters and the result local output path
path_hdfs_base=hdfs://lena/user/jfoul001/
path_clusters_base=dsm010/vodacom-corpus-kmeans
path_results_base=~/code/dsm010-2021-oct/coursework_01/data/output/vodacom-corpus-clusters

# the distance metric to use
distance_metrics=("org.apache.mahout.common.distance.CosineDistanceMeasure" "org.apache.mahout.common.distance.EuclideanDistanceMeasure" "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure" "org.apache.mahout.common.distance.ManhattanDistanceMeasure")

for distance_metric in "${distance_metrics[@]}"
do
  echo "--- Distance Metric: $distance_metric"

  # short class name (everything after the last '.') used in the output paths
  distance_name=${distance_metric##*.}
  path_centroids="dsm010/vodacom-corpus-canopy-centroids/${distance_name}"

  # make sure the local directory for this metric's result files exists
  mkdir -p "${path_results_base}/${distance_name}"

  # perform the canopy clustering
  # NOTE(review): the Mahout k-means documentation says that when -k is given,
  # the -c path is first overwritten with k randomly selected vectors, so these
  # canopy centroids look unused by the k-means runs below - confirm.
  mahout canopy \
    -i "$path_vectors" \
    -ow \
    -o "$path_centroids" \
    -dm "$distance_metric" \
    -t1 0.5 \
    -t2 0.3

  for ((k = k_start; k <= k_end; k++))
  do
    # get k with a leading zero if required
    k_padded=$(printf %02d "$k")

    # set the output path for the clusters
    path_clusters="${path_clusters_base}/${distance_name}/${k_padded}"

    echo "---- K: $k_padded -- $path_clusters"

    # perform the kmeans clustering
    mahout kmeans \
      -i "$path_vectors" \
      -c "$path_centroids" \
      -o "${path_hdfs_base}${path_clusters}" \
      -ow \
      -dm "$distance_metric" \
      -cl -cd 0.1 -x 20 \
      -k "$k"

    # set the path for output
    path_final_clusters=$(hadoop fs -ls -d -C "${path_clusters}/clusters-*-final")
    path_clusterpoints="${path_clusters}/clusteredPoints"
    path_results="${path_results_base}/${distance_name}/${k_padded}.txt"

    # without a final cluster directory there is nothing to dump for this k
    if [ -z "$path_final_clusters" ]; then
      echo "---- K: $k_padded -- no final clusters found, skipping clusterdump" >&2
      continue
    fi

    # output the cluster results
    mahout clusterdump -dt sequencefile \
      -d "$path_dictionary" \
      -i "$path_final_clusters" \
      -o "$path_results" \
      -of TEXT \
      -b 100 \
      -p "$path_clusterpoints" \
      -dm "$distance_metric" \
      -n 20 --evaluate
  done
done

--- Distance Metric: org.apache.mahout.common.distance.CosineDistanceMeasure
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt/hadoop/current/bin/hadoop and HADOOP_CONF_DIR=/opt/hadoop/current/etc/hadoop
MAHOUT-JOB: /opt/mahout/current/mahout-examples-0.13.0-job.jar
---- K: 02 -- dsm010/vodacom-corpus-kmeans/CosineDistanceMeasure/02
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt/hadoop/current/bin/hadoop and HADOOP_CONF_DIR=/opt/hadoop/current/etc/hadoop
MAHOUT-JOB: /opt/mahout/current/mahout-examples-0.13.0-job.jar
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt/hadoop/current/bin/hadoop and HADOOP_CONF_DIR=/opt/hadoop/current/etc/hadoop
MAHOUT-JOB: /opt/mahout/current/mahout-examples-0.13.0-job.jar
---- K: 03 -- dsm010/vodacom-corpus-kmeans/CosineDistanceMeasure/03
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /opt

22/01/01 20:57:03 INFO AbstractJob: Command line arguments: {--distanceMeasure=[org.apache.mahout.common.distance.CosineDistanceMeasure], --endPhase=[2147483647], --input=[dsm010/vodacom-corpus-vectors/tf-vectors], --method=[mapreduce], --output=[dsm010/vodacom-corpus-canopy-centroids/CosineDistanceMeasure], --overwrite=null, --startPhase=[0], --t1=[0.5], --t2=[0.3], --tempDir=[temp]}
22/01/01 20:57:04 INFO HadoopUtil: Deleting dsm010/vodacom-corpus-canopy-centroids/CosineDistanceMeasure
22/01/01 20:57:04 INFO CanopyDriver: Build Clusters Input: dsm010/vodacom-corpus-vectors/tf-vectors Out: dsm010/vodacom-corpus-canopy-centroids/CosineDistanceMeasure Measure: org.apache.mahout.common.distance.CosineDistanceMeasure@4ade6987 t1: 0.5 t2: 0.3
22/01/01 20:57:04 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at lena-master/128.86.245.64:8032
22/01/01 20:57:04 INFO JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/jfoul001/.staging/job_16

## Analysis

### Data Extraction

Extract data for analysis from the Mahout cluster dumps. 

> ℹ️ Although Mahout includes the option to produce data in JSON format which would have been easier to process, when the JSON format is used the distances to data points are excluded.

In [2]:
# configure logging for debugging
logging.basicConfig(filename='cw01_q01_analysis.log', level=logging.INFO)

# Reuse the console handler when this cell has already been run in the current
# kernel: adding a fresh StreamHandler on every run would print each message
# once per run of the cell. FileHandler subclasses StreamHandler, so the exact
# type is compared to avoid mistaking the log-file handler for the console one.
root_logger = logging.getLogger('')
console = next(
    (handler for handler in root_logger.handlers if type(handler) is logging.StreamHandler),
    None
)

if console is None:
    # define a Handler which writes to sys.stderr and add it to the root logger
    console = logging.StreamHandler()
    root_logger.addHandler(console)

# the console shows INFO messages or higher
console.setLevel(logging.INFO)

In [23]:
class MahoutDataExtract():
    """
    This class is used to extract data from files produced with `mahout clusterdump`.
    """
    # start of a new cluster, e.g. `:{"identifier":"VL-8029",...`
    _CLUSTER_PATTERN = re.compile(r'^:{"identifier":"(.+?)"')

    # a point in the current cluster, e.g. `1.0 : [distance=0.612876]: ...`
    _POINT_PATTERN = re.compile(r'.+? : \[distance=(.+?)\]')

    def process_file(self, file_path:str) -> pd.DataFrame:
        """
        Process the file at specified path and extract the relevant data.

        The whole file is read. Blank lines are skipped rather than treated
        as the end of the file, and any line that is neither a cluster header
        nor a point line is ignored.

        Parameters
        ----------
        file_path : str
            The path of the cluster dump text file to read.

        Returns
        -------
        output_data : pandas.DataFrame
            The DataFrame will contain the following columns: cluster_identifier, point_distance.
            There is one row per point line, in file order. Both columns are
            present even when no points were found.
        """
        logging.info('- Processing: %s', file_path)

        raw_data = []
        current_cluster = None

        # load the values from the text file
        with open(file_path, encoding='utf-8') as file:
            for raw_line in file:
                line = raw_line.rstrip()

                # a blank line carries no data and must not stop the parse
                if not line:
                    continue

                if cluster_identifier := self._CLUSTER_PATTERN.search(line): # start of a new cluster
                    current_cluster = cluster_identifier.group(1)
                    logging.info('-- Cluster: %s', current_cluster)
                elif point_distance := self._POINT_PATTERN.match(line): # handle a point in the cluster
                    # add the point to the raw data
                    raw_data.append({
                        'cluster_identifier': current_cluster,
                        'point_distance': float(point_distance.group(1))
                    })

        # create the output dataframe; explicit columns keep the schema stable for empty results
        output_data = pd.DataFrame(raw_data, columns=['cluster_identifier', 'point_distance'])

        # return the output values
        return output_data

# smoke-test process_file on the cosine-distance dump for k=3
extractor = MahoutDataExtract()
extractor.process_file(file_path='data/output/vodacom-corpus-clusters/CosineDistanceMeasure/03.txt')

- Processing: data/output/vodacom-corpus-clusters/CosineDistanceMeasure/03.txt
-- Cluster: VL-8029
-- Cluster: VL-1269
-- Cluster: VL-2068


Unnamed: 0,cluster_identifier,point_distance
0,VL-8029,0.612876
1,VL-8029,0.616223
2,VL-8029,0.661522
3,VL-8029,0.473760
4,VL-8029,0.474494
...,...,...
8210,VL-2068,0.672163
8211,VL-2068,0.247968
8212,VL-2068,0.190243
8213,VL-2068,0.353478
