## Task -1 Exploratory Data Analysis

In [1]:
"""
Installing Spark with its dependencies
Installing Spark
Install Dependencies:

Java 8
Apache Spark with hadoop and
Findspark (used to locate the spark in the system)
"""

!sudo ./install_spark.py

Running command: sudo apt-get update -qq
Running command: sudo apt install -y openjdk-8-jdk-headless
Reading package lists...

Building dependency tree...
Reading state information...

openjdk-8-jdk-headless is already the newest version (8u392-ga-1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Running command: sudo wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
Running command: sudo tar xf spark-3.1.1-bin-hadoop3.2.tgz
Running command: sudo pip install -q findspark


In [2]:
#Set Environment Variables:

import os

current_directory = os.getcwd()


os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = os.path.join(current_directory,"spark-3.1.1-bin-hadoop3.2")

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

#Create a SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("MovieLens") \
    .config("spark.jars", "spark-3.1.1-bin-hadoop3.2/jars/sqlite-jdbc-3.34.0.jar") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

24/01/23 14:59:20 WARN Utils: Your hostname, codespaces-7047d6 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/01/23 14:59:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/01/23 14:59:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
#will be used sqlite3 to be able to reach .db file

import sqlite3

con = sqlite3.connect('Datasets/movielens-small.db')
cur = con.cursor()

In [5]:
# Get the field information of each table in the database

# Define a function to print table information in a readable way
def print_table_info(table_name, table_info):
    print(f"\nTable: {table_name}")
    for row in table_info:
        print(row[0], row[1], row[2],sep=": ")

# Get and print information for the 'movies' table
cur.execute("PRAGMA table_info(movies);")
movies_info = cur.fetchall()
print_table_info("movies", movies_info)

# Get and print information for the 'ratings' table
cur.execute("PRAGMA table_info(ratings);")
ratings_info = cur.fetchall()
print_table_info("ratings", ratings_info)

# Get and print information for the 'links' table
cur.execute("PRAGMA table_info(links);")
links_info = cur.fetchall()
print_table_info("links", links_info)

# Get and print information for the 'tags' table
cur.execute("PRAGMA table_info(tags);")
tags_info = cur.fetchall()
print_table_info("tags", tags_info)


Table: movies
0: movieId: INT
1: title: TEXT
2: year: INT
3: genres: TEXT

Table: ratings
0: userId: INT
1: movieId: INT
2: rating: REAL
3: timestamp: INT

Table: links
0: movieId: INT
1: imdbId: TEXT
2: tmdbId: TEXT

Table: tags
0: userId: INT
1: movieId: INT
2: tag: TEXT
3: timestamp: NUM


In [6]:
#For SQLite JDBC driver, it can be downloaded via:

!curl -O https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7125k  100 7125k    0     0  51.9M      0 --:--:-- --:--:-- --:--:-- 51.9M


In [7]:
# install sqlite-jdbc-3.34.0.jar to the spark/jars directory

!sudo cp sqlite-jdbc-3.34.0.jar spark-3.1.1-bin-hadoop3.2/jars/

#check the jar file is in the spark/jars directory

!ls spark-3.1.1-bin-hadoop3.2/jars/

HikariCP-2.5.1.jar
JLargeArrays-1.5.jar
JTransforms-3.1.jar
RoaringBitmap-0.9.0.jar
ST4-4.0.4.jar
accessors-smart-1.2.jar
activation-1.1.1.jar
aircompressor-0.10.jar
algebra_2.12-2.0.0-M2.jar
antlr-runtime-3.5.2.jar
antlr4-runtime-4.8-1.jar
aopalliance-1.0.jar
aopalliance-repackaged-2.6.1.jar
arpack_combined_all-0.1.jar
arrow-format-2.0.0.jar
arrow-memory-core-2.0.0.jar
arrow-memory-netty-2.0.0.jar
arrow-vector-2.0.0.jar
audience-annotations-0.5.0.jar
automaton-1.11-8.jar
avro-1.8.2.jar
avro-ipc-1.8.2.jar
avro-mapred-1.8.2-hadoop2.jar
bonecp-0.8.0.RELEASE.jar
breeze-macros_2.12-1.0.jar
breeze_2.12-1.0.jar
cats-kernel_2.12-2.0.0-M4.jar
chill-java-0.9.5.jar
chill_2.12-0.9.5.jar
commons-beanutils-1.9.4.jar
commons-cli-1.2.jar
commons-codec-1.10.jar
commons-collections-3.2.2.jar
commons-compiler-3.0.16.jar
commons-compress-1.20.jar
commons-configuration2-2.1.1.jar
commons-crypto-1.1.0.jar
commons-daemon-1.0.13.jar
commons-dbcp-1.4.jar
commons-httpclient-3.1.jar
commons-io-2.5.jar
commons-l

In [8]:
# create a dataframe with including userid, movieid, genre and rating via pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Read the ratings table into a dataframe
ratings_df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:sqlite:Datasets/movielens-small.db") \
    .option("dbtable", "ratings") \
    .load()

# Read the movies table into a dataframe
movies_df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:sqlite:Datasets/movielens-small.db") \
    .option("dbtable", "movies") \
    .load()

# Read the links table into a dataframe
links_df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:sqlite:Datasets/movielens-small.db") \
    .option("dbtable", "links") \
    .load()

# Read the tags table into a dataframe
tags_df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:sqlite:Datasets/movielens-small.db") \
    .option("dbtable", "tags") \
    .load()

In [9]:
# Write a query to create a dataframe with including userid, movieid, genre and rating


# join ratings and movies dataframes on movieid column and create a new dataframe called ratings_movies_df via pyspark dataframe
ratings_movies_df = ratings_df.join(movies_df, on="movieId", how="left")

# show the userid, movieid, genre and rating columns of ratings_movies_df dataframe
ratings_movies_df.select("userId", "movieId", "genres", "rating").show(10)

#count the number of rows in ratings_movies_df dataframe
ratings_movies_df.count()


                                                                                

+------+-------+------+------+
|userId|movieId|genres|rating|
+------+-------+------+------+
|    14|     26| Drama|   4.0|
|    31|     26| Drama|   2.0|
|    51|     26| Drama|   4.0|
|    79|     26| Drama|   4.0|
|   156|     26| Drama|   4.0|
|   161|     26| Drama|   3.0|
|   203|     26| Drama|   4.0|
|   219|     26| Drama|   3.0|
|   220|     26| Drama|   2.5|
|   228|     26| Drama|   4.0|
+------+-------+------+------+
only showing top 10 rows



                                                                                

100023

In [10]:
# Count ratings for each movie, and list top 5 movies with the highest value

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Calculate the count of ratings for each movie
movie_rating_count_df = ratings_movies_df.groupBy("movieId").agg(
    F.count("rating").alias("rating_count")
)

# Join the top_movies_df with the movies_df dataframe to get the movie titles
top_movies_df = movie_rating_count_df.join(movies_df, on="movieId", how="left")

top_movies_df.select("movieId", "title", "rating_count").sort("rating_count", ascending=False).show(5, truncate=False)





+-------+-------------------------+------------+
|movieId|title                    |rating_count|
+-------+-------------------------+------------+
|593    |Silence of the Lambs, The|337         |
|318    |Shawshank Redemption, The|328         |
|296    |Pulp Fiction             |327         |
|480    |Jurassic Park            |324         |
|356    |Forrest Gump             |318         |
+-------+-------------------------+------------+
only showing top 5 rows



                                                                                

In [11]:
# Find and list top 5 most rated genres

genre_rating_count_df = ratings_movies_df.groupBy("genres").agg(
    F.count("rating").alias("rating_count")
)

genre_rating_count_df.sort("rating_count", ascending=False).show(5, truncate=False)




+--------------+------------+
|genres        |rating_count|
+--------------+------------+
|Drama         |7008        |
|Comedy        |6396        |
|Comedy|Romance|3877        |
|Drama|Romance |3121        |
|Comedy|Drama  |3000        |
+--------------+------------+
only showing top 5 rows



                                                                                

In [12]:
#Find and list top 5 most rated tags except null values

tags_rating_count_df = ratings_movies_df.join(tags_df, on="movieId", how="left").groupBy("tag").agg(
    F.count("rating").alias("rating_count")).filter(tags_df.tag.isNotNull())

tags_rating_count_df.sort("rating_count", ascending=False).show(5, truncate=False)







+------------+------------+
|tag         |rating_count|
+------------+------------+
|drama       |3542        |
|sci-fi      |3035        |
|twist ending|2998        |
|psychology  |2672        |
|crime       |2570        |
+------------+------------+
only showing top 5 rows



                                                                                

In [13]:
# By using timestamp from ratings table, provide top 5 most frequent users within a week

genre_rating_count_df = ratings_movies_df.groupBy("userId").agg(
    F.count("timestamp").alias("timestamp_count")
)

genre_rating_count_df.sort("timestamp_count", ascending=False).show(5, truncate=False)





+------+---------------+
|userId|timestamp_count|
+------+---------------+
|516   |2268           |
|384   |1412           |
|187   |1338           |
|31    |1283           |
|377   |1241           |
+------+---------------+
only showing top 5 rows



                                                                                

In [14]:
# Calculate average ratings for each genre, and plot average ratings of top 10 genres with descending order

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Calculate the average rating for each genre
genre_rating_avg_df = ratings_movies_df.groupBy("genres").agg(
    F.avg("rating").alias("rating_avg")
)

# Sort the dataframe by rating_avg column
genre_rating_avg_df.sort("rating_avg", ascending=False).show(10, truncate=False)




+--------------------------------------------------------+----------+
|genres                                                  |rating_avg|
+--------------------------------------------------------+----------+
|Animation|Documentary                                   |5.0       |
|Action|Adventure|Animation|Comedy|Fantasy|Mystery|Sci-Fi|5.0       |
|Crime|Documentary|War                                   |5.0       |
|Adventure|Fantasy|Mystery                               |5.0       |
|Crime|Horror|Mystery                                    |4.75      |
|Adventure|Comedy|Fantasy|Musical                        |4.5       |
|Animation|Comedy|Horror|IMAX                            |4.5       |
|Adventure|Crime|Drama|Horror|Mystery                    |4.5       |
|Adventure|Comedy|Crime|Drama|Romance                    |4.5       |
|Action|Animation|Crime|Sci-Fi|Thriller                  |4.5       |
+--------------------------------------------------------+----------+
only showing top 10 

                                                                                

## TASK 2 - Recommender Design

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [16]:
spark = SparkSession.builder.appName("movieRecommendation").getOrCreate() 

In [17]:
# using movie rating data to probide implicit feature using ALS(Alternate Least Squares)

movie_rating_df = ratings_df




In [18]:
# description of created schema 
movie_rating_df.printSchema()

root
 |-- userId: long (nullable = true)
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)



In [19]:
#splitting dataset to train the model as 80% for train and remaining for test data.
(train, test) = movie_rating_df.randomSplit([0.8, 0.2], seed=87)

In [20]:
# 1st Recommender model - Alternating Least Square (ALS) Matrix Factorization in Collaborative Filtering on rating (as actual values) 

als = ALS(rank=10, maxIter=15, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

model = als.fit(train)

pred = model.transform(test)

pred = pred.selectExpr("userId as userId","movieId as movieId","rating as rating","prediction as implicit")

pred.show(truncate=False)

24/01/23 15:00:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/01/23 15:00:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/01/23 15:00:07 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/01/23 15:00:07 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK

+------+-------+------+---------+
|userId|movieId|rating|implicit |
+------+-------+------+---------+
|31    |471    |4.5   |1.8935201|
|159   |471    |4.0   |3.7890081|
|348   |471    |4.0   |2.2790236|
|40    |471    |4.0   |3.997721 |
|447   |471    |5.0   |3.0354795|
|489   |471    |5.0   |3.5236747|
|114   |471    |4.0   |3.1626875|
|100   |471    |4.0   |3.3876393|
|450   |471    |2.0   |2.8211172|
|555   |471    |3.5   |4.635881 |
|7     |471    |4.0   |3.3934188|
|214   |471    |3.0   |2.947355 |
|221   |471    |5.0   |3.4761927|
|284   |471    |1.0   |3.36733  |
|153   |471    |3.0   |3.7567434|
|199   |833    |5.0   |3.5096498|
|596   |1088   |3.0   |2.5451236|
|696   |1088   |5.0   |4.0491714|
|581   |1088   |4.0   |4.068609 |
|339   |1088   |4.5   |2.9458978|
+------+-------+------+---------+
only showing top 20 rows



                                                                                

In [21]:
#calculating RMSE and MAE to evaluate performance of the models. 

eval_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="implicit")
eval_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="implicit")


rmse = eval_rmse.evaluate(pred)
mae = eval_mae.evaluate(pred)


print("RMSE of ALS:", rmse)
print("MAE of ALS:", mae)






RMSE of ALS: 1.2222768603660332
MAE of ALS: 0.90745209519291


                                                                                

In [22]:
# 2nd Recommender model - Alternating Least Square (ALS) Matrix Factorization in Collaborative Filtering on designed implicit feedback values 


(train_implicit, test_implicit) = pred.randomSplit([0.8, 0.2], seed=87)

als_implicit = ALS(rank=10, maxIter=15, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="implicit", coldStartStrategy="drop")

model_implicit = als_implicit.fit(train_implicit)

pred_implicit = model_implicit.transform(test_implicit)

pred_implicit.show(truncate=False)



+------+-------+------+---------+----------+
|userId|movieId|rating|implicit |prediction|
+------+-------+------+---------+----------+
|159   |471    |4.0   |3.7890081|2.794365  |
|489   |471    |5.0   |3.5236747|4.941455  |
|114   |471    |4.0   |3.1626875|2.0920568 |
|214   |471    |3.0   |2.947355 |3.9414387 |
|221   |471    |5.0   |3.4761927|3.3370953 |
|284   |471    |1.0   |3.36733  |3.0591292 |
|596   |1088   |3.0   |2.5451236|1.120658  |
|339   |1088   |4.5   |2.9458978|1.9720254 |
|416   |1580   |5.0   |4.1063194|2.7542517 |
|48    |1580   |5.0   |4.592147 |3.13164   |
|37    |1580   |4.5   |2.7149873|4.3887815 |
|438   |1580   |4.0   |3.104217 |3.0729005 |
|384   |1591   |2.5   |2.1793556|1.8413986 |
|246   |1591   |3.5   |3.771912 |1.8602619 |
|384   |1645   |3.5   |3.6046662|4.448451  |
|8     |1645   |3.0   |4.1210723|4.0557103 |
|354   |1645   |4.0   |4.369527 |0.9570819 |
|665   |3175   |5.0   |4.204756 |4.3620677 |
|320   |3175   |3.0   |2.672224 |2.650245  |
|221   |31

                                                                                

In [23]:
eval_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
eval_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")


rmse = eval_rmse.evaluate(pred_implicit)
mae = eval_mae.evaluate(pred_implicit)


print("RMSE of ALS_Implicit Feedback:", rmse)
print("MAE of ALS_Implicit Feedback:", mae)



RMSE of ALS_Implicit Feedback: 1.7470276604705446
MAE of ALS_Implicit Feedback: 1.3695158293566574


                                                                                

When compared these two models, the 1st model ( ALS on rating ) shows better performance compared to 2nd model (ALS on implicit feedback) according to error metrics such as Root Mean Square Error (RMSE) and Mean Absolute Error (MAE). 

## Task – 3 Text Analysis

In [24]:
spark = SparkSession.builder.appName("textAnalysis").getOrCreate()

In [25]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf


In [26]:
# download the data in Dataset folder.



!curl -o Datasets/aclImdb_v1.tar.gz https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  7287k      0  0:00:11  0:00:11 --:--:-- 15.0M


In [27]:
#Extract the tar file as dataset

import tarfile

def tarfile_extract(tar_file, output_dir=os.getcwd()+'/Datasets'):
    tar = tarfile.open(tar_file, 'r:gz')
    total_files = sum(1 for _ in tar)
    tar.extractall(output_dir, members=extract_progress(tar, total_files))
    tar.close()

def extract_progress(tar, total_files):
    for member in tar:
        yield member
        total_files -= 1
        print(f"Remaining files: {total_files}", end='\r')
    print("\nExtraction completed.")

wd = os.getcwd()
tarfile_extract(wd+'/Datasets/aclImdb_v1.tar.gz')


Remaining files: 000000
Extraction completed.


In [28]:

#using 'alldata' list to store all the files in the directories
alldata=[]


#collecting data in train/pos folder
for fname in os.listdir(wd+'/Datasets/aclImdb/train/pos'):
    with open(os.path.join(wd+'/Datasets/aclImdb/train/pos', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'train','pos'))

#collecting data in train/neg folder
for fname in os.listdir(wd+'/Datasets/aclImdb/train/neg'):
    with open(os.path.join(wd+'/Datasets/aclImdb/train/neg', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'train','neg'))
#collecting data in test/pos folder
for fname in os.listdir(wd+'/Datasets/aclImdb/test/pos'):
    with open(os.path.join(wd+'/Datasets/aclImdb/test/pos', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'test','pos'))
#collecting data in test/neg folder
for fname in os.listdir(wd+'/Datasets/aclImdb/test/neg'):
    with open(os.path.join(wd+'/Datasets/aclImdb/test/neg', fname), encoding = 'utf-8') as infile:
        for line in infile:
            alldata.append((line,'test','neg'))




In [29]:

from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

appName = "list to Spark Data Frame"
master = "local"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

# List
data = alldata

# Create a schema for the dataframe
schema = StructType([
    StructField('content', StringType(), True),
    StructField('label', StringType(), True),
    StructField('sentiemtn', StringType(), True)
])

# Convert list to RDD
rdd = spark.sparkContext.parallelize(data)

# Create data frame
df = spark.createDataFrame(rdd,schema)
print(df.schema)
df.show()

StructType(List(StructField(content,StringType,true),StructField(label,StringType,true),StructField(sentiemtn,StringType,true)))


24/01/23 15:02:36 WARN TaskSetManager: Stage 716 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.
[Stage 716:>                                                        (0 + 1) / 1]

+--------------------+-----+---------+
|             content|label|sentiemtn|
+--------------------+-----+---------+
|Man, this is a ha...|train|      pos|
|The Color Purple ...|train|      pos|
|Randolph Scott is...|train|      pos|
|High energy Raoul...|train|      pos|
|One of the great ...|train|      pos|
|Although I'm grat...|train|      pos|
|It is hard to des...|train|      pos|
|- Having grown ti...|train|      pos|
|This movie is fun...|train|      pos|
|It was considered...|train|      pos|
|Other commentator...|train|      pos|
|I saw this movie ...|train|      pos|
|So i consider mys...|train|      pos|
|My mother took me...|train|      pos|
|After 21 movies a...|train|      pos|
|I have a six mont...|train|      pos|
|...On stage, TV o...|train|      pos|
|I had to see this...|train|      pos|
|Although at one p...|train|      pos|
|Hot Millions is a...|train|      pos|
+--------------------+-----+---------+
only showing top 20 rows



                                                                                

In [30]:
df.count()

24/01/23 15:02:38 WARN TaskSetManager: Stage 717 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

50000

In [31]:
# Schema of created Spark Dataframe

df.printSchema()

root
 |-- content: string (nullable = true)
 |-- label: string (nullable = true)
 |-- sentiemtn: string (nullable = true)



###Tokenization

In [32]:
#using RegexTokenizer for tokenizing contents

tokenizer = RegexTokenizer(inputCol="content", outputCol="tokenized_content", pattern="\\W") # used regexp to determine pattern as 'not word'

countTokens = udf(lambda w: len(w), IntegerType())

tokenized = tokenizer.transform(df)

tokenized.show()

24/01/23 15:02:39 WARN TaskSetManager: Stage 719 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+-----+---------+--------------------+
|             content|label|sentiemtn|   tokenized_content|
+--------------------+-----+---------+--------------------+
|Man, this is a ha...|train|      pos|[man, this, is, a...|
|The Color Purple ...|train|      pos|[the, color, purp...|
|Randolph Scott is...|train|      pos|[randolph, scott,...|
|High energy Raoul...|train|      pos|[high, energy, ra...|
|One of the great ...|train|      pos|[one, of, the, gr...|
|Although I'm grat...|train|      pos|[although, i, m, ...|
|It is hard to des...|train|      pos|[it, is, hard, to...|
|- Having grown ti...|train|      pos|[having, grown, t...|
|This movie is fun...|train|      pos|[this, movie, is,...|
|It was considered...|train|      pos|[it, was, conside...|
|Other commentator...|train|      pos|[other, commentat...|
|I saw this movie ...|train|      pos|[i, saw, this, mo...|
|So i consider mys...|train|      pos|[so, i, consider,...|
|My mother took me...|train|      pos|[m

### Removing Stop Words

In [33]:
type(tokenized)

pyspark.sql.dataframe.DataFrame

In [34]:
from pyspark.ml.feature import StopWordsRemover

tokenized.show()

+--------------------+-----+---------+--------------------+
|             content|label|sentiemtn|   tokenized_content|
+--------------------+-----+---------+--------------------+
|Man, this is a ha...|train|      pos|[man, this, is, a...|
|The Color Purple ...|train|      pos|[the, color, purp...|
|Randolph Scott is...|train|      pos|[randolph, scott,...|
|High energy Raoul...|train|      pos|[high, energy, ra...|
|One of the great ...|train|      pos|[one, of, the, gr...|
|Although I'm grat...|train|      pos|[although, i, m, ...|
|It is hard to des...|train|      pos|[it, is, hard, to...|
|- Having grown ti...|train|      pos|[having, grown, t...|
|This movie is fun...|train|      pos|[this, movie, is,...|
|It was considered...|train|      pos|[it, was, conside...|
|Other commentator...|train|      pos|[other, commentat...|
|I saw this movie ...|train|      pos|[i, saw, this, mo...|
|So i consider mys...|train|      pos|[so, i, consider,...|
|My mother took me...|train|      pos|[m

24/01/23 15:02:40 WARN TaskSetManager: Stage 720 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


In [35]:
df_tokenized = tokenized.select("tokenized_content").withColumn("tokenCount", countTokens(col("tokenized_content")))

In [36]:
type(df_tokenized)

pyspark.sql.dataframe.DataFrame

In [37]:
# SWR -> stop word remover
SWR  = StopWordsRemover (inputCol='tokenized_content', outputCol='SWRed')


#See the result of removal operation
SWR.transform(df_tokenized).select('SWRed').show(truncate=False)

24/01/23 15:02:40 WARN TaskSetManager: Stage 721 contains a task of very large size (32123 KiB). The maximum recommended task size is 1000 KiB.


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------