In [95]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime, col, udf
from pyspark.sql.types import StringType
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from pyspark.sql import SparkSession

# Create a SparkSession.
# The Python interpreter handed to Spark is the one running this notebook
# (sys.executable) instead of a hard-coded, machine-specific absolute path,
# so the notebook is not tied to a single user's install location.
import sys

spark = (
    SparkSession.builder
    .appName("YourApp")
    .config("spark.executorEnv.PYSPARK_PYTHON", sys.executable)
    .config("spark.executorEnv.PYSPARK_DRIVER_PYTHON", sys.executable)
    .getOrCreate()
)

In [96]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))
# def preprocess_text_udf(text):
#     tokens = nltk.word_tokenize(text.lower())
#     tokens = [word for word in tokens if word.isalnum()]
#     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     return ' '.join(tokens)

# # Register the UDF with Spark
# preprocess_text_spark_udf = udf(preprocess_text_udf, StringType())

In [97]:
# Load the raw Reddit posts (JSON) into a Spark DataFrame.
df = spark.read.format("json").load("EDA/reddit-posts.json")

In [98]:
# Columns of the raw posts that are carried forward; everything else is dropped.
columns_to_keep = [
    'author_fullname',
    'title',
    'hide_score',
    'name',
    'link_flair_text_color',
    'upvote_ratio',
    'ups',
    'total_awards_received',
    'link_flair_text',
    'author_premium',
    'edited',
    'created',
    'link_flair_type',
    'no_follow',
    'over_18',
    'link_flair_background_color',
    'id',
    'num_comments',
    'send_replies',
    'url',
    'link_flair_template_id',
    'author_cakeday',
]

# Project the raw frame down to the selected columns, in the order listed above.
filtered_df = df.select(*columns_to_keep)

In [99]:
# Convert "created" from its epoch representation to a timestamp, in place.
# Note: this reassigns filtered_df, so re-running the cell feeds the already
# converted column back into from_unixtime.
created_as_timestamp = from_unixtime(col("created"))
filtered_df = filtered_df.withColumn("created", created_as_timestamp)

In [100]:
# Create a copy of the original title in a new "original_title" column.
# withColumn already gives the new column its name, so the expression does not
# need an additional alias.
filtered_df_copy_title = filtered_df.withColumn("original_title", col("title"))

In [101]:
# Preview the first 20 rows, with long cell values truncated for display.
filtered_df_copy_title.show(n=20, truncate=True)

+---------------+--------------------+----------+----------+---------------------+------------+---+---------------------+--------------------+--------------+------+-------------------+---------------+---------+-------+---------------------------+-------+------------+------------+--------------------+----------------------+--------------+--------------------+
|author_fullname|               title|hide_score|      name|link_flair_text_color|upvote_ratio|ups|total_awards_received|     link_flair_text|author_premium|edited|            created|link_flair_type|no_follow|over_18|link_flair_background_color|     id|num_comments|send_replies|                 url|link_flair_template_id|author_cakeday|      original_title|
+---------------+--------------------+----------+----------+---------------------+------------+---+---------------------+--------------------+--------------+------+-------------------+---------------+---------+-------+---------------------------+-------+------------+-----------

In [102]:
# Preprocess the title column to improve the model's performance

# filtered_df_copy_title = filtered_df_copy_title.withColumn("title", preprocess_text_spark_udf(col("original_title")))

In [103]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# The SparkSession created in the first cell is reused here: a second
# SparkSession.builder...getOrCreate() call would only return that same
# running session, so it is not repeated.

# Work only with the "title" column of filtered_df_copy_title.
title_data = filtered_df_copy_title.select("title")

# Define a UDF for preprocessing

# Lemmatizer and stop-word set shared across calls to the UDF. They are built
# lazily on first use rather than once per row.
# NOTE(review): presumably the cache lives for as long as a Python worker keeps
# its deserialized copy of the function — confirm against the Spark version in use.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (lemmatizer, stop_words) pair, building it on first call."""
    if not _NLP_RESOURCES:
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        # Populate in one step so a failure above never leaves a half-filled cache.
        _NLP_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _NLP_RESOURCES["lemmatizer"], _NLP_RESOURCES["stop_words"]


@udf(StringType())
def preprocess_text_udf(text):
    """Normalize one title: lowercase, tokenize, filter, and lemmatize.

    Args:
        text: Raw title string, or None for a null column value.

    Returns:
        The lemmatized tokens that are alphanumeric and not English stop words,
        joined by single spaces. A null input yields an empty string, the same
        value produced when every token of a title is filtered out.
    """
    if text is None:
        # A null title would otherwise raise AttributeError on .lower().
        return ''
    lemmatizer, stop_words = _get_nlp_resources()
    tokens = nltk.word_tokenize(text.lower())
    # Drop punctuation and other non-alphanumeric tokens.
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stop words, then reduce the remaining words to their lemma.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Step 1: run every title through the preprocessing UDF; the cleaned text is
# stored in a new "preprocessed_title" column.
cleaned_title = preprocess_text_udf(title_data["title"])
title_data = title_data.withColumn("preprocessed_title", cleaned_title)

# Step 2: split "preprocessed_title" into a "words" array column.
tokenizer = Tokenizer(
    inputCol="preprocessed_title",
    outputCol="words",
)
words_data = tokenizer.transform(title_data)

# Step 3: hash the words into a 1500-dimensional term-frequency vector.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=1500,
)
featurized_data = hashingTF.transform(words_data)

# Step 4: fit IDF weights on the term frequencies and rescale them into
# "tfidf_features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="tfidf_features",
)
idf_model = idf.fit(featurized_data)
tfidf_vectors = idf_model.transform(featurized_data)

In [104]:
# Preview the first 20 rows of the TF-IDF output, with long values truncated.
tfidf_vectors.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|  preprocessed_title|               words|         rawFeatures|      tfidf_features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|What is 70 Years ...|       70 year young|   [70, year, young]|(1500,[399,857,13...|(1500,[399,857,13...|
|What was being a ...|        teen 80 like|    [teen, 80, like]|(1500,[330,352,14...|(1500,[330,352,14...|
|How do you know m...|   know mermaid real|[know, mermaid, r...|(1500,[924,1191,1...|(1500,[924,1191,1...|
|What's the bigges...|biggest challenge...|[biggest, challen...|(1500,[38,408,598...|(1500,[38,408,598...|
|What is the most ...|unexpected thing ...|[unexpected, thin...|(1500,[61,336,357...|(1500,[61,336,357...|
|What age should p...|age people allowe...|[age, people, all...|(1500,[292,434,58...|(1500,[292,434,58...|
|teachers of reddi...|teacher reddit 

In [105]:
# import joblib
# trained_model = joblib.load("trained_model.joblib")

In [118]:
from pyspark.ml.linalg import DenseVector
import numpy as np

# Bring the "tfidf_features" vectors back to the driver, one vector per row.
tfidf_vectors_dense = [
    row[0] for row in tfidf_vectors.select("tfidf_features").collect()
]

# Expand each collected vector into a NumPy array.
tfidf_vectors_numpy = [np.array(vector.toArray()) for vector in tfidf_vectors_dense]

# Predict the category one row at a time; each array is reshaped into a
# single-sample 2-D input before being passed to the model.
# NOTE(review): trained_model is not assigned in any active cell shown here (its
# joblib.load call is commented out) — confirm it is loaded before this cell runs.
predictions = [
    trained_model.predict(features.reshape(1, -1)) for features in tfidf_vectors_numpy
]



In [117]:
# filtered_df_copy_title_category.select(col("original_title"),col("predicted_category")).show(100,truncate=False)


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|original_title                                                                                                                                                                                                                                                              |predicted_category|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|What is 70 Years too young for ?                                                                                                 