## Data Processing

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Memory settings passed to the executors and to the driver.
EXE_MEMORY = "2G"
DRIVER_MEMORY = "8G"

# One setting per line so each option is easy to spot and tweak.
spark = (
    SparkSession.builder
    .appName("AWS")
    .config("spark.executor.memory", EXE_MEMORY)
    .config("spark.executor.cores", "3")
    .config("spark.driver.memory", DRIVER_MEMORY)
    .getOrCreate()
)

In [3]:
# Location of the raw review dump. The path can be overridden through the
# AMAZON_REVIEWS_JSON environment variable so the notebook also runs on other
# machines; the original local path remains the default.
import os

REVIEWS_JSON_PATH = os.environ.get(
    "AMAZON_REVIEWS_JSON",
    'C:/Users/salon/Documents/project/All_Amazon_Review.json',
)

# NOTE: the name `json_rdd` is kept because later cells reference it.
json_rdd = spark.read.json(REVIEWS_JSON_PATH)

In [7]:
# Only these two columns of the loaded data are used by the rest of the notebook.
REVIEW_COLUMNS = ['overall', 'reviewText']
dm = json_rdd.select(*REVIEW_COLUMNS)

## Assigning a unique ID (`idx`) to each row, since `reviewerID` is not unique

In [8]:
from pyspark.sql.window import Window as W
from pyspark.sql import functions as F
# Step 1: tag every row with a unique (but not consecutive) id so there is a
# column to order by.
dm = dm.withColumn("idx", F.monotonically_increasing_id())

# Step 2: replace that id with a consecutive, 1-based row number.
# withColumn returns a new DataFrame, so the result has to be assigned back;
# otherwise `dm` would keep the ids from step 1 and only the preview below
# would show the row numbers.
# NOTE: the window has an ORDER BY but no PARTITION BY, so Spark evaluates it
# in a single partition -- expect this step to be slow on large inputs.
windowSpec = W.orderBy("idx")
dm = dm.withColumn("idx", F.row_number().over(windowSpec))

dm.show(2)

+-------+--------------------+---+
|overall|          reviewText|idx|
+-------+--------------------+---+
|    1.0|Alexa is not able...|  1|
|    4.0|Alexa works great...|  2|
+-------+--------------------+---+
only showing top 2 rows



## Removing Null Values in reviewText

In [24]:
# Drop rows whose review text is missing before any text processing.
# The filter is applied to `dm` itself: `rp_data` is only created by a later
# cell (so it does not exist yet on a fresh Restart & Run All), and the later
# cells build on `dm`, so filtering here is what actually removes the nulls
# from the pipeline. The step is idempotent, so re-running the cell is safe.
dm = dm.where(dm.reviewText.isNotNull())

# Keep the `df` name available for any code that refers to it.
df = dm

## Removing Punctuation

To clean the text, we remove punctuation, strip leading and trailing spaces, and convert all letters to lower case.

In [25]:
from pyspark.sql.functions import regexp_replace, trim, col, lower
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only whitespace, ASCII letters, and digits are retained. Every other
        character is deleted (e.g. it's becomes its). Leading and trailing
        spaces are removed after punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column aliased as 'reviewText' with the clean-up operations applied.
    """
    # Raw string: `\s` must reach the regex engine as-is. In a normal string
    # literal it is an invalid escape sequence that Python warns about.
    without_punctuation = regexp_replace(column, r'[^\sa-zA-Z0-9]', '')
    return trim(lower(without_punctuation)).alias('reviewText')

# Carry the id and rating through unchanged and swap in the cleaned review text.
cleaned_review = removePunctuation(col('reviewText'))
rp_data = dm.select("IDX", "overall", cleaned_review)

## Data processing

Here we tokenize the text and remove all stop words in the default `StopWordsRemover` list (shown below). We then lemmatize the text, stem it, and finally drop every word of length 3 or less.

In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import *
from nltk import WordNetLemmatizer
import nltk
# Download the 'wordnet' corpus into the local nltk_data directory.
# NOTE(review): presumably required by the WordNetLemmatizer imported above -- confirm.
nltk.download('wordnet')

# Tokenization: turn the cleaned review text into a list of tokens.
tokenizer = Tokenizer(inputCol='reviewText', outputCol='words_token')
df_words_token = (
    tokenizer
    .transform(rp_data)
    .select('idx', 'overall', 'words_token')
)

# Stop-word removal: filter the token lists with the remover's default word list.
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_words_no_stopw = (
    remover
    .transform(df_words_token)
    .select('idx', 'overall', 'words_clean')
)

# Lemmatization: run every token of a row through the WordNet lemmatizer.
lemm = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the lemmatized form of each token, in the same order."""
    return [lemm.lemmatize(token) for token in tokens]


lemm_udf = udf(_lemmatize_tokens, ArrayType(StringType()))
df_lemm = (
    df_words_no_stopw
    .withColumn("lemmi", lemm_udf("words_clean"))
    .select('IDX', "overall", 'lemmi')
)

# Stemming: apply the English Snowball stemmer to every lemmatized token.
stemmer = SnowballStemmer(language='english')


def _stem_tokens(tokens):
    """Return the stem of each token, in the same order."""
    return [stemmer.stem(token) for token in tokens]


stemmer_udf = udf(_stem_tokens, ArrayType(StringType()))
df_stemmed = (
    df_lemm
    .withColumn("words_stemmed", stemmer_udf("lemmi"))
    .select('IDX', "overall", 'words_stemmed')
)


# Length filter: keep only the words that are longer than 3 characters.
def _keep_long_words(row):
    """Return the words of `row` whose length is greater than 3."""
    return [x for x in row if len(x) > 3]


filter_length_udf = udf(_keep_long_words, ArrayType(StringType()))
df_final_words = (
    df_stemmed
    .withColumn('words', filter_length_udf(col('words_stemmed')))
    .select('IDX', "overall", 'words')
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\salon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


## Final Dataframe

Here is the final DataFrame after cleaning; only the first 10 rows are shown.

In [38]:
# Preview the first 10 rows without truncating the word lists.
df_final_words.show(n=10, truncate=False)

+---+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|IDX|overall|words                                                                                                                                                                                                                                                                                                                                                                                                                                                          

## List of Stop Words

In [55]:
from pyspark.ml.feature import StopWordsRemover
# A remover built with no arguments carries the default stop-word list.
remover = StopWordsRemover()

# Read that list and print at most its first 200 entries.
stopwords = remover.getStopWords()
print(stopwords[:200])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no