## Task -1 Exploratory Data Analysis

In [1]:
"""
Installing Spark with its dependencies
Installing Spark
Install Dependencies:

Java 8
Apache Spark with hadoop and
Findspark (used to locate the spark in the system)
"""

#these operations could be done on Google colab or needed to run under linuxOS.
 

#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
#!tar xf spark-3.1.1-bin-hadoop3.2.tgz
#!pip install -q findspark

'\nInstalling Spark with its dependencies\nInstalling Spark\nInstall Dependencies:\n\nJava 8\nApache Spark with hadoop and\nFindspark (used to locate the spark in the system)\n'

In [2]:
#Set Environment Variables:

import os

current_directory = os.getcwd()


os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = os.path.join(current_directory,"spark-3.1.1-bin-hadoop3.2")

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

24/01/23 10:03:47 WARN Utils: Your hostname, codespaces-7047d6 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/01/23 10:03:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/01/23 10:03:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
#will be used sqlite3 to be able to reach .db file

import sqlite3

con = sqlite3.connect('Datasets/movielens-small.db')
cur = con.cursor()

In [5]:
#For SQLite JDBC driver, it can be downloaded via:

!curl -O https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7125k  100 7125k    0     0  53.5M      0 --:--:-- --:--:-- --:--:-- 53.1M


In [6]:
# Write a SQL query to create a dataframe with including userid, movieid, genre and rating

import csv
with open("movielens.csv", "w") as csvFile:
    fieldnames = ['userId', 'movieId', 'genre', 'rating']
    writer = csv.DictWriter(csvFile, fieldnames=fieldnames)
    writer.writeheader()

    for row in cur.execute("""SELECT ratings.userId, movies.movieId, genres, rating 
                              FROM (((movies 
                                      INNER JOIN links ON movies.movieid = links.movieid) 
                                      LEFT JOIN ratings ON movies.movieId = ratings.movieId) 
                                      LEFT JOIN tags ON movies.movieid = tags.movieid)"""):
 
        userId = row[0]
        movieId = row[1]
        genre = row[2]
        rating = row[3]

        writer.writerow({'userId': userId, 
                        'movieId':movieId,
                        'genre': genre,
                        'rating':rating}
        )

movielens_small_df = spark.read.csv("movielens.csv", header=True)

print("number of rows of our dataframe:", movielens_small_df.count())
print("\n")

movielens_small_df

                                                                                

number of rows of our dataframe: 249641




userId,movieId,genre,rating
7,1,Adventure|Animati...,5.0
7,1,Adventure|Animati...,5.0
7,1,Adventure|Animati...,5.0
10,1,Adventure|Animati...,4.0
10,1,Adventure|Animati...,4.0
10,1,Adventure|Animati...,4.0
13,1,Adventure|Animati...,4.5
13,1,Adventure|Animati...,4.5
13,1,Adventure|Animati...,4.5
16,1,Adventure|Animati...,5.0


In [7]:
# Count ratings for each movie, and list top 5 movies with the highest value

movie_rating_count = []

for row in cur.execute("""SELECT movieid, title,COUNT(rating) 
                          FROM (SELECT ratings.userId, movies.movieId, title,genres, rating 
                                FROM (((movies 
                                        INNER JOIN links ON movies.movieid = links.movieid)
                                        LEFT JOIN ratings ON movies.movieId = ratings.movieId)
                                        LEFT JOIN tags ON movies.movieid = tags.movieid))                                                    
                          GROUP BY movieid
                          ORDER BY count(rating) DESC
                          LIMIT 5
                        """):
  movie_rating_count.append(row)

schema = ["movieId", "title", "rating count"]
 
# calling function to create dataframe
df = spark.createDataFrame(movie_rating_count, schema)

df.show(truncate=False)
  

                                                                                

+-------+-------------------------+------------+
|movieId|title                    |rating count|
+-------+-------------------------+------------+
|318    |Shawshank Redemption, The|5904        |
|2571   |Matrix, The              |5300        |
|296    |Pulp Fiction             |4578        |
|2762   |Sixth Sense, The         |4080        |
|47     |Seven (a.k.a. Se7en)     |4047        |
+-------+-------------------------+------------+



In [8]:
# Find and list top 5 most rated genres

rated_genres = []

for row in cur.execute("""SELECT genres, COUNT(rating) 
                          FROM (SELECT ratings.userId, movies.movieId, genres, rating 
                                FROM (((movies 
                                        INNER JOIN links ON movies.movieid = links.movieid) 
                                        LEFT JOIN ratings ON movies.movieId = ratings.movieId) 
                                        LEFT JOIN tags ON movies.movieid = tags.movieid))
                          GROUP BY genres
                          ORDER BY COUNT(rating) DESC
                          LIMIT 5
                        """):
  rated_genres.append(row)

schema = ["genres", "rating count"]
 
# calling function to create dataframe
df = spark.createDataFrame(rated_genres, schema)

df.show(truncate=False)


+--------------------------------+------------+
|genres                          |rating count|
+--------------------------------+------------+
|Comedy                          |10162       |
|Crime|Drama                     |10107       |
|Comedy|Drama                    |7949        |
|Drama                           |7846        |
|Action|Adventure|Sci-Fi|Thriller|7182        |
+--------------------------------+------------+



In [9]:
#Find and list top 5 most rated tags

rated_tags = []

for row in cur.execute("""SELECT tag, COUNT(rating) 
                          FROM (SELECT ratings.userId, movies.movieId, genres, tag, rating 
                                FROM (((movies 
                                        INNER JOIN links ON movies.movieid = links.movieid) 
                                        LEFT JOIN ratings ON movies.movieId = ratings.movieId) 
                                        LEFT JOIN tags ON movies.movieid = tags.movieid))
                          WHERE tag is NOT NULL
                          GROUP BY tag
                          ORDER BY COUNT(rating) DESC
                          LIMIT 5
                        """):
  rated_tags.append(row)

schema = ["tags", "tags count"]
 
# calling function to create dataframe
df = spark.createDataFrame(rated_tags, schema)

df.show(truncate=False)



+------------+----------+
|tags        |tags count|
+------------+----------+
|drama       |3542      |
|sci-fi      |3035      |
|twist ending|2998      |
|psychology  |2672      |
|crime       |2570      |
+------------+----------+



In [10]:
# By using timestamp from ratings table, provide top 5 most frequent users within a week

weekly_activity = []

for row in cur.execute("""SELECT userid, strftime('%Y-%W', datetime(timestamp, 'unixepoch')) AS week_year, COUNT(strftime('%Y-%W', datetime(timestamp, 'unixepoch'))) AS weekly_activity 
                          FROM ratings
                          GROUP BY userid, week_year 
                          ORDER BY weekly_activity DESC
                          LIMIT 5
                        """):
  weekly_activity.append(row)

schema = ["userId", "week of the year", "weekly activity of user"]
 
# calling function to create dataframe
df = spark.createDataFrame(weekly_activity, schema)

df.show(truncate=False)



+------+----------------+-----------------------+
|userId|week of the year|weekly activity of user|
+------+----------------+-----------------------+
|384   |2008-42         |1107                   |
|176   |2015-03         |972                    |
|614   |2014-27         |830                    |
|330   |2008-33         |711                    |
|202   |2003-50         |676                    |
+------+----------------+-----------------------+



In [11]:
# Calculate average ratings for each genre, and plot average ratings of top 10 genres with descending order

genres_avgrating=[]

for row in cur.execute("""SELECT genres, round(avg(rating), 2) avg_rating 
                          FROM (SELECT ratings.userId, movies.movieId, genres, rating 
                                FROM (((movies 
                                        INNER JOIN links ON movies.movieid = links.movieid) 
                                        LEFT JOIN ratings ON movies.movieId = ratings.movieId) 
                                        LEFT JOIN tags ON movies.movieid = tags.movieid))
                          GROUP BY genres
                          ORDER BY avg_rating DESC
                          LIMIT 10"""):
  genres_avgrating.append(row)

schema = ["genres", "avg_rating"]
 
# calling function to create dataframe
df = spark.createDataFrame(genres_avgrating, schema)

df.show(truncate=False)



+--------------------------------------------------------+----------+
|genres                                                  |avg_rating|
+--------------------------------------------------------+----------+
|Crime|Documentary|War                                   |5.0       |
|Animation|Documentary                                   |5.0       |
|Adventure|Fantasy|Mystery                               |5.0       |
|Action|Adventure|Animation|Comedy|Fantasy|Mystery|Sci-Fi|5.0       |
|Crime|Horror|Mystery                                    |4.75      |
|Drama|Horror|War                                        |4.5       |
|Comedy|Crime|Western                                    |4.5       |
|Children|Horror|Mystery|Thriller                        |4.5       |
|Animation|Comedy|Horror|IMAX                            |4.5       |
|Animation|Children|Drama|Fantasy|IMAX                   |4.5       |
+--------------------------------------------------------+----------+



## TASK 2 - Recommender Design

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [13]:
spark = SparkSession.builder.appName("movieRecommendation").getOrCreate() 

In [14]:
# using movie rating data to probide implicit feature using ALS(Alternate Least Squares)

movie_rating=[]

for row in cur.execute("""SELECT userId, movieId, rating 
                          FROM ratings
                                      """):
  movie_rating.append(row)

schema = ["userId", "movieId", "rating"]
 
# calling function to create dataframe
movie_rating_df = spark.createDataFrame(movie_rating, schema)

movie_rating_df




userId,movieId,rating
1,6,2.0
1,22,3.0
1,32,2.0
1,50,5.0
1,110,4.0
1,164,3.0
1,198,3.0
1,260,5.0
1,296,4.0
1,303,3.0


In [15]:
# description of created schema 
movie_rating_df.printSchema()

root
 |-- userId: long (nullable = true)
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)



In [16]:
#splitting dataset to train the model as 80% for train and remaining for test data.
(train, test) = movie_rating_df.randomSplit([0.8, 0.2], seed=87)

In [17]:
# 1st Recommender model - Alternating Least Square (ALS) Matrix Factorization in Collaborative Filtering on rating (as actual values) 

als = ALS(rank=10, maxIter=15, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

model = als.fit(train)

pred = model.transform(test)

pred = pred.selectExpr("userId as userId","movieId as movieId","rating as rating","prediction as implicit")

pred.show(truncate=False)

24/01/23 10:04:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/01/23 10:04:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/01/23 10:04:11 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/01/23 10:04:11 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK

+------+-------+------+---------+
|userId|movieId|rating|implicit |
+------+-------+------+---------+
|511   |148    |3.0   |2.2868726|
|31    |471    |4.5   |4.0591545|
|159   |471    |4.0   |3.7307599|
|40    |471    |4.0   |3.8190136|
|489   |471    |5.0   |4.4431553|
|505   |471    |5.0   |5.138457 |
|114   |471    |4.0   |4.3663287|
|100   |471    |4.0   |4.5732665|
|450   |471    |2.0   |1.5212581|
|7     |471    |4.0   |4.049525 |
|214   |471    |3.0   |3.9719532|
|221   |471    |5.0   |6.061598 |
|284   |471    |1.0   |2.5042088|
|499   |471    |3.0   |4.1740155|
|153   |471    |3.0   |3.0022483|
|677   |833    |2.0   |2.3963344|
|199   |833    |5.0   |2.4504826|
|500   |1088   |3.0   |2.7542734|
|339   |1088   |4.5   |3.2113464|
|491   |1088   |2.0   |5.1001573|
+------+-------+------+---------+
only showing top 20 rows



                                                                                

In [18]:
#calculating RMSE and MAE to evaluate performance of the models. 

eval_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="implicit")
eval_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="implicit")


rmse = eval_rmse.evaluate(pred)
mae = eval_mae.evaluate(pred)


print("RMSE of ALS:", rmse)
print("MAE of ALS:", mae)






RMSE of ALS: 1.1989328763509657
MAE of ALS: 0.8902257735617987


                                                                                

In [19]:
# 2nd Recommender model - Alternating Least Square (ALS) Matrix Factorization in Collaborative Filtering on designed implicit feedback values 


(train_implicit, test_implicit) = pred.randomSplit([0.8, 0.2], seed=87)

als_implicit = ALS(rank=10, maxIter=15, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="implicit", coldStartStrategy="drop")

model_implicit = als_implicit.fit(train_implicit)

pred_implicit = model_implicit.transform(test_implicit)

pred_implicit.show(truncate=False)



+------+-------+------+---------+----------+
|userId|movieId|rating|implicit |prediction|
+------+-------+------+---------+----------+
|159   |471    |4.0   |3.7307599|3.2100675 |
|114   |471    |4.0   |4.3663287|4.9844255 |
|214   |471    |3.0   |3.9719532|3.204597  |
|221   |471    |5.0   |6.061598 |3.1016166 |
|284   |471    |1.0   |2.5042088|3.074966  |
|339   |1088   |4.5   |3.2113464|4.6573715 |
|581   |1580   |3.5   |4.4716005|2.9713326 |
|48    |1580   |5.0   |4.43153  |3.2103605 |
|37    |1580   |4.5   |4.1130466|3.913432  |
|523   |1580   |3.5   |3.7270386|3.1889482 |
|627   |1580   |3.0   |3.232979 |2.4179761 |
|529   |1580   |3.5   |3.7929277|3.6058168 |
|384   |1591   |2.5   |3.72709  |2.0936759 |
|246   |1591   |3.5   |3.660907 |1.5329692 |
|384   |1645   |3.5   |3.1507044|2.5546923 |
|455   |1645   |2.5   |3.1708877|3.338704  |
|8     |1645   |3.0   |3.7471874|3.0631862 |
|529   |1645   |3.0   |2.9021955|3.1574273 |
|416   |2142   |3.0   |4.711654 |2.63531   |
|677   |21

                                                                                

In [20]:
eval_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
eval_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")


rmse = eval_rmse.evaluate(pred_implicit)
mae = eval_mae.evaluate(pred_implicit)


print("RMSE of ALS_Implicit Feedback:", rmse)
print("MAE of ALS_Implicit Feedback:", mae)



RMSE of ALS_Implicit Feedback: 1.7223986976021965
MAE of ALS_Implicit Feedback: 1.3463243492090076


                                                                                

When compared these two models, the 1st model ( ALS on rating ) shows better performance compared to 2nd model (ALS on implicit feedback) according to error metrics such as Root Mean Square Error (RMSE) and Mean Absolute Error (MAE). 

## Task – 3 Text Analysis

In [21]:
spark = SparkSession.builder.appName("textAnalysis").getOrCreate()

In [22]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf


In [23]:
# download the data in Dataset folder.



!curl -o Datasets/aclImdb_v1.tar.gz https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  5738k      0  0:00:14  0:00:14 --:--:-- 13.2MM    0  144k    0     0  88722      0  0:15:48  0:00:01  0:15:47 88668


In [24]:
#Extract the tar file as dataset

import tarfile

def tarfile_extract(tar_file, output_dir=os.getcwd()+'/Datasets'):
    tar = tarfile.open(tar_file, 'r:gz')
    total_files = sum(1 for _ in tar)
    tar.extractall(output_dir, members=extract_progress(tar, total_files))
    tar.close()

def extract_progress(tar, total_files):
    for member in tar:
        yield member
        total_files -= 1
        print(f"Remaining files: {total_files}", end='\r')
    print("\nExtraction completed.")

wd = os.getcwd()
tarfile_extract(wd+'/Datasets/aclImdb_v1.tar.gz')


Remaining files: 000000
Extraction completed.


In [25]:

#using 'alldata' list to store all the files in the directories
alldata=[]


#collecting data in train/pos folder
for fname in os.listdir(wd+'/Datasets/aclImdb/train/pos'):
    with open(os.path.join(wd+'/Datasets/aclImdb/train/pos', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'train','pos'))

#collecting data in train/neg folder
for fname in os.listdir(wd+'/Datasets/aclImdb/train/neg'):
    with open(os.path.join(wd+'/Datasets/aclImdb/train/neg', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'train','neg'))
#collecting data in test/pos folder
for fname in os.listdir(wd+'/Datasets/aclImdb/test/pos'):
    with open(os.path.join(wd+'/Datasets/aclImdb/test/pos', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'test','pos'))
#collecting data in test/neg folder
for fname in os.listdir(wd+'/Datasets/aclImdb/test/neg'):
    with open(os.path.join(wd+'/Datasets/aclImdb/test/neg', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'test','neg'))




In [26]:

from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

appName = "list to Spark Data Frame"
master = "local"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

# List
data = alldata

# Create a schema for the dataframe
schema = StructType([
    StructField('content', StringType(), True),
    StructField('label', StringType(), True),
    StructField('sentiemtn', StringType(), True)
])

# Convert list to RDD
rdd = spark.sparkContext.parallelize(data)

# Create data frame
df = spark.createDataFrame(rdd,schema)
print(df.schema)
df.show()

StructType(List(StructField(content,StringType,true),StructField(label,StringType,true),StructField(sentiemtn,StringType,true)))
+--------------------+-----+---------+
|             content|label|sentiemtn|
+--------------------+-----+---------+
|Man, this is a ha...|train|      pos|
|The Color Purple ...|train|      pos|
|Randolph Scott is...|train|      pos|
|High energy Raoul...|train|      pos|
|One of the great ...|train|      pos|
|Although I'm grat...|train|      pos|
|It is hard to des...|train|      pos|
|- Having grown ti...|train|      pos|
|This movie is fun...|train|      pos|
|It was considered...|train|      pos|
|Other commentator...|train|      pos|
|I saw this movie ...|train|      pos|
|So i consider mys...|train|      pos|
|My mother took me...|train|      pos|
|After 21 movies a...|train|      pos|
|I have a six mont...|train|      pos|
|...On stage, TV o...|train|      pos|
|I had to see this...|train|      pos|
|Although at one p...|train|      pos|
|Hot Millions

24/01/23 10:06:38 WARN TaskSetManager: Stage 705 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


In [27]:
df.count()

24/01/23 10:06:38 WARN TaskSetManager: Stage 706 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

50000

In [28]:
# Schema of created Spark Dataframe

df.printSchema()

root
 |-- content: string (nullable = true)
 |-- label: string (nullable = true)
 |-- sentiemtn: string (nullable = true)



###Tokenization

In [29]:
#using RegexTokenizer for tokenizing contents

tokenizer = RegexTokenizer(inputCol="content", outputCol="tokenized_content", pattern="\\W") # used regexp to determine pattern as 'not word'

countTokens = udf(lambda w: len(w), IntegerType())

tokenized = tokenizer.transform(df)

tokenized.show()

24/01/23 10:06:40 WARN TaskSetManager: Stage 708 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----+---------+--------------------+
|             content|label|sentiemtn|   tokenized_content|
+--------------------+-----+---------+--------------------+
|Man, this is a ha...|train|      pos|[man, this, is, a...|
|The Color Purple ...|train|      pos|[the, color, purp...|
|Randolph Scott is...|train|      pos|[randolph, scott,...|
|High energy Raoul...|train|      pos|[high, energy, ra...|
|One of the great ...|train|      pos|[one, of, the, gr...|
|Although I'm grat...|train|      pos|[although, i, m, ...|
|It is hard to des...|train|      pos|[it, is, hard, to...|
|- Having grown ti...|train|      pos|[having, grown, t...|
|This movie is fun...|train|      pos|[this, movie, is,...|
|It was considered...|train|      pos|[it, was, conside...|
|Other commentator...|train|      pos|[other, commentat...|
|I saw this movie ...|train|      pos|[i, saw, this, mo...|
|So i consider mys...|train|      pos|[so, i, consider,...|
|My mother took me...|train|      pos|[m

### Removing Stop Words

In [30]:
type(tokenized)

pyspark.sql.dataframe.DataFrame

In [31]:
from pyspark.ml.feature import StopWordsRemover

tokenized.show()

+--------------------+-----+---------+--------------------+
|             content|label|sentiemtn|   tokenized_content|
+--------------------+-----+---------+--------------------+
|Man, this is a ha...|train|      pos|[man, this, is, a...|
|The Color Purple ...|train|      pos|[the, color, purp...|
|Randolph Scott is...|train|      pos|[randolph, scott,...|
|High energy Raoul...|train|      pos|[high, energy, ra...|
|One of the great ...|train|      pos|[one, of, the, gr...|
|Although I'm grat...|train|      pos|[although, i, m, ...|
|It is hard to des...|train|      pos|[it, is, hard, to...|
|- Having grown ti...|train|      pos|[having, grown, t...|
|This movie is fun...|train|      pos|[this, movie, is,...|
|It was considered...|train|      pos|[it, was, conside...|
|Other commentator...|train|      pos|[other, commentat...|
|I saw this movie ...|train|      pos|[i, saw, this, mo...|
|So i consider mys...|train|      pos|[so, i, consider,...|
|My mother took me...|train|      pos|[m

24/01/23 10:06:40 WARN TaskSetManager: Stage 709 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


In [32]:
df_tokenized = tokenized.select("tokenized_content").withColumn("tokenCount", countTokens(col("tokenized_content")))

In [33]:
type(df_tokenized)

pyspark.sql.dataframe.DataFrame

In [34]:
# SWR -> stop word remover
SWR  = StopWordsRemover (inputCol='tokenized_content', outputCol='SWRed')


#See the result of removal operation
SWR.transform(df_tokenized).select('SWRed').show(truncate=False)

24/01/23 10:06:41 WARN TaskSetManager: Stage 710 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------