In [2]:
#### INSTALLATION OF JAVA/SPARK
!sudo apt update
!sudo apt install openjdk-17-jdk -y
#!curl -JLO 'https://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz'
!tar xvf spark-3.3.1-bin-hadoop3.tgz
!mv spark-3.3.1-bin-hadoop3 /opt/spark

#### DIRECTORY SETTING FOR SPARK 
import os
os.environ["JAVA_HOME"] = "/usr"
os.environ["SPARK_HOME"] = "/opt/spark/spark-3.3.1-bin-hadoop3"

#### INSTALLING LIBRARIES
!pip install findspark
!pip install pyspark
!pip install pytorch-ignite transformers

Hit:1 https://deb.nodesource.com/node_16.x focal InRelease
Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:3 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease           [0m
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease   [0m[33m             
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB][33m[33m[33m[33m
Fetched 336 kB in 1s (439 kB/s)  [0m[33m[33m
Reading package lists... Done
Building dependency tree       
Reading state information... Done
103 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
openjdk-17-jdk is already the newest version (17.0.5+8-2ubuntu1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 103 not upgraded.
spark-3.3.1-bin-hadoop3/
spark-3.3.1-bin-hadoop3/LICENSE
spark-3.3.1-bin-hadoop3/NOTICE
sp

### Load Dataset

In [3]:
# !pip install gdown
# !mkdir data
# !cd data
# !gdown 1QhtF1UAbVMJyAcqVJDucYwmib0hHCP1A
# !mv 'Dataset Yelp.zip' data.zip
# !unzip data.zip
# !mv  -v /notebooks/data/Dataset\ Yelp/* /notebooks/data/
# !rm -rf /notebooks/data/Dataset\ Yelp

In [None]:
#### SPARK
import findspark
findspark.init();
from pyspark.sql import SparkSession
import pyspark.pandas as ps

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.3")\
    .getOrCreate();

spark.sparkContext.setLogLevel("OFF");

ps.set_option('compute.ops_on_diff_frames', True);

Loading the dataset

In [148]:
reviews = ps.read_json("./data/review.json", lines=True)

                                                                                

Functions for limiting number of words to the model

In [155]:
def get_word_count(str):
    if str is None:
        return 0
    ls = str.split()
    return len(ls)

def chop_500(str):
    ls = str.split()
    return ' '.join(ls[:380])

In [171]:
reviews['word_count'] = reviews['text'].apply(get_word_count)
reviews['text'] = reviews['text'].apply(chop_500)
reviews['word_count_check'] = reviews['text'].apply(get_word_count)
reviews.loc[reviews['word_count_check']>350,:].shape

                                                                                

(205086, 11)

In [167]:
reviews.describe()

                                                                                

Unnamed: 0,cool,funny,stars,useful,word_count,word_count_check
count,6990280.0,6990280.0,6990280.0,6990280.0,6990280.0,6990280.0
mean,0.4986175,0.3265596,3.748584,1.184609,103.4024,101.6503
std,2.17246,1.688729,1.478705,3.253767,90.34197,83.75869
min,-1.0,-1.0,1.0,-1.0,1.0,1.0
25%,0.0,0.0,3.0,0.0,42.0,42.0
50%,0.0,0.0,4.0,0.0,75.0,75.0
75%,0.0,0.0,5.0,1.0,133.0,133.0
max,404.0,792.0,5.0,1182.0,496.0,380.0


In [215]:
from transformers import pipeline
# if you have GPU access, you can set the device parameter to 0 to use the GPU, which will speed up model performance.
classifier = pipeline("sentiment-analysis", model="textattack/albert-base-v2-yelp-polarity")
#classifier = pipeline("sentiment-analysis", model="Seethal/sentiment_analysis_generic_dataset")

In [216]:
AllData = spark.read \
      .option("header", True) \
      .json("./data/review.json")

AllData = AllData.select("text", "review_id")

                                                                                

In [217]:
df = AllData.limit(7000)

In [218]:
from pyspark.sql.types import StructType,StructField,FloatType,Row, StringType
from pyspark.sql import functions as F

def get_label(x):
    return Row('Out1', 'Out2')(classifier(x, truncation=True).pop(0).get('label'), classifier(x,truncation=True).pop(0).get('score'))

schema = StructType([
    StructField("Label", StringType(), False),
    StructField("Score", FloatType(), False)])

example_udf = F.UserDefinedFunction(get_label, schema)

In [219]:
newDF = df.withColumn("Output", example_udf(df["text"]))

In [220]:
newDF = newDF.select("review_id", "Output.*")

In [221]:
newDF.show(truncate=15)

[Stage 465:>                                                        (0 + 1) / 1]

+---------------+-------+----------+
|      review_id|  Label|     Score|
+---------------+-------+----------+
|KU_O5udG6zpx...|LABEL_0|0.98213875|
|BiTunyQ73aT9...|LABEL_1| 0.9999385|
|saUsX_uimxRl...|LABEL_1|0.99986064|
|AqPFMleE6RsU...|LABEL_1|0.99991775|
|Sx8TMOWLNuJB...|LABEL_1|0.97646654|
|JrIxlS1TzJ-i...|LABEL_0| 0.9991316|
|6AxgBCNX_PNT...|LABEL_1|0.99992704|
|_ZeMknuYdlQc...|LABEL_1|0.99986184|
|ZKvDG2sBvHVd...|LABEL_0| 0.9952799|
|pUycOfUwM8vq...|LABEL_0| 0.9977696|
|rGQRf8UafX7O...|LABEL_1|0.99945754|
|l3Wk_mvAog6X...|LABEL_1| 0.9988385|
|XW_LfMv0fV21...|LABEL_1| 0.9995548|
|8JFGBuHMoiND...|LABEL_1|0.99915755|
|UBp0zWyH60Hm...|LABEL_1|0.99959487|
|OAhBYw8IQ6wl...|LABEL_1|0.99974436|
|oyaMhzBSwfGg...|LABEL_1|0.99964035|
|LnGZB0fjfgeV...|LABEL_1|0.99991727|
|u2vzZaOqJ2fe...|LABEL_1| 0.9991479|
|Xs8Z8lmKkosq...|LABEL_1| 0.9999299|
+---------------+-------+----------+
only showing top 20 rows



                                                                                

In [None]:
newDF.write.csv("/data/datacsv.csv", header=True)

In [192]:
newDF = newDF.select("review_id", "Output.*")
newDF.show(truncate=15)

[Stage 448:>                                                        (0 + 1) / 1]

+---------------+-------+----------+
|      review_id|  Label|     Score|
+---------------+-------+----------+
|KU_O5udG6zpx...|LABEL_0|0.98213875|
|BiTunyQ73aT9...|LABEL_1| 0.9999385|
|saUsX_uimxRl...|LABEL_1|0.99986064|
|AqPFMleE6RsU...|LABEL_1|0.99991775|
|Sx8TMOWLNuJB...|LABEL_1|0.97646654|
+---------------+-------+----------+



                                                                                

In [188]:
reviews_tiny = ps.DataFrame(reviews.loc[0:2, "text"])

                                                                                

In [213]:
classifier('I hate this place, food is awfull', truncation=True)[0]['score']


TypeError: classifier() got an unexpected keyword argument 'truncation'

In [176]:
def classify(x):
    label = classifier(x, truncation=True)[0]['label']
    score = classifier(x, truncation=True)[0]['score']
    return label, score

In [None]:
import pyspark.pandas as ps

data = {"col_1": [1,2,3], "col_2": [4,5,6]}
df = ps.DataFrame(data)

median_series = df[["col_1","col_2"]].apply(lambda x: x.median(), axis=1)
median_series.name = "median"

df = ps.merge(df, median_series, left_index=True, right_index=True, how='left')

In [189]:
reviews_tiny['label'],reviews_tiny['score'] = reviews_tiny['text'].apply(classify)

                                                                                

PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.

In [175]:
reviews_tiny.loc[reviews_tiny['todo'] == None, :].shape

(0, 2)

In [144]:
# def splitter(str):
#     ls, lx = str.split()[0], str.split()[1]
#     return ls,lx

In [178]:
reviews_tiny['label'] = reviews_tiny['todo'].apply(lambda x: str(x).split(',')[0])

                                                                                

In [182]:
str(['LABEL_0','0.9821387529373169']).split(',')[0]

"['LABEL_0'"

In [179]:
reviews_tiny.head()

                                                                                

Unnamed: 0,text,todo,label
0,"If you decide to eat here, just be aware it is...","[LABEL_0, 0.9821387529373169]",['LABEL_0' '0.9821387529373169']
1,I've taken a lot of spin classes over the year...,"[LABEL_1, 0.9999384880065918]",['LABEL_1' '0.9999384880065918']
2,Family diner. Had the buffet. Eclectic assortm...,"[LABEL_1, 0.9998606443405151]",['LABEL_1' '0.9998606443405151']
3,"Wow! Yummy, different, delicious. Our favorite...","[LABEL_1, 0.99991774559021]",['LABEL_1' '0.99991774559021']
4,Cute interior and owner (?) gave us tour of up...,"[LABEL_1, 0.9764665365219116]",['LABEL_1' '0.9764665365219116']


In [None]:
#### CHECKING CUDA SUPPORT AND VERSION
#import torch
#torch.cuda.is_available()
#!nvcc --version