<a href="https://colab.research.google.com/github/MurtuzaQuantumCoder/Murtuza_U2972802/blob/main/Murtuza_U2972802_MachineLearning_LabWork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [2]:
!pip install nltk



In [3]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [6]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [7]:
def stem_words(words):
    return [stemmer.stem(word) for word in words]

In [8]:
def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

In [9]:
spark = SparkSession.builder.appName("DocumentClassificationTFIDF").getOrCreate()

In [18]:
data = [(0,"Cloud computing is becoming increasingly important for businesses","Technology"),
(1, "Basketball players are preparing for the next tournament", "Sports"),
(2,"Machine learning has revolutionized the way data is processed","Technology"),
(3, "Political campaigns are gearing up for the upcoming elections","Politics"),
(4, "The football team has been training hard for the upcoming season", "Sports"),
(5,"International relations are being discussed in diplomatic meetings","Politics"),
(6, "The president addressed the nation in a live broadcast", "Politics"),
(7, "Tennis players are practicing for the grand slam matches", "Sports"),
(8,"Machine learning has revolutionized the way data is processed","Technology"),
(9, "Stock trading has become a popular way to build wealth", "Finance"),
(10, "5G technology is expected to significantly improve communication speeds","Technology"),
(11,"Political campaigns are gearing up for the upcoming elections","Politics"),
(12,
 "Machine learning has revolutionized the way data is processed",
 "Technology"),
(13,
 "Investors are looking for high-yield bonds in the current market",
 "Finance"),
(14,
 "Quantum computing holds promise for solving complex problems",
 "Technology"),
(15,
 "Cloud computing is becoming increasingly important for businesses",
 "Technology"),
(16, "The Formula 1 race track is set to host the next grand prix", "Sports"),
(17,
 "Financial experts are advising on diversifying investment portfolios",
 "Finance"),
(18, "Stock trading has become a popular way to build wealth", "Finance"),
(19, "Stock trading has become a popular way to build wealth", "Finance"),
(20,
 "International relations are being discussed in diplomatic meetings",
 "Politics"),
(21, "The Formula 1 race track is set to host the next grand prix", "Sports"),
(22,
 "Political campaigns are gearing up for the upcoming elections",
 "Politics")
]

columns = ["id", "text", "category"]
df = spark.createDataFrame(data, columns)
display(df.head())

Row(id=0, text='Cloud computing is becoming increasingly important for businesses', category='Technology')

In [11]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df = tokenizer.transform(df)

In [12]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

In [13]:
stem_udf = udf(stem_words, ArrayType(StringType()))

In [14]:
df = df.withColumn("stemmed_words", stem_udf(col("filtered_words")))

In [15]:
lemma_udf = udf(lemmatize_words, ArrayType(StringType()))

In [16]:
df = df.withColumn("lemmatized_words", lemma_udf(col("filtered_words")))

In [20]:
df.select("text","filtered_words","stemmed_words","lemmatized_words").show(truncate=False)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `filtered_words` cannot be resolved. Did you mean one of the following? [`category`, `id`, `text`].;
'Project [text#70, 'filtered_words, 'stemmed_words, 'lemmatized_words]
+- LogicalRDD [id#69L, text#70, category#71], false


In [21]:
df.select("text", "filtered_words", "stemmed_words", "lemmatized_words").show(truncate=False)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `filtered_words` cannot be resolved. Did you mean one of the following? [`category`, `id`, `text`].;
'Project [text#70, 'filtered_words, 'stemmed_words, 'lemmatized_words]
+- LogicalRDD [id#69L, text#70, category#71], false


# Task
The user wants to preprocess text data in a Spark DataFrame by tokenizing, removing stop words, stemming, and lemmatizing. The user has provided the code for these steps and wants to understand and fix an error in the provided code. The user also wants to display the results of the preprocessing steps.

## Re-run tokenization

### Subtask:
Re-execute the tokenization step to create the 'words' column.


**Reasoning**:
Re-executing the tokenization step is necessary to ensure the 'words' column is present before subsequent transformations.



In [22]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df = tokenizer.transform(df)

**Reasoning**:
The previous step successfully re-created the 'words' column. Now, re-execute the stop words removal step to create the 'filtered_words' column, which was missing based on the error message.



In [23]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

**Reasoning**:
Re-executing the stemming step is necessary to ensure the 'stemmed_words' column is present.



In [24]:
df = df.withColumn("stemmed_words", stem_udf(col("filtered_words")))

**Reasoning**:
Re-executing the lemmatization step is necessary to ensure the 'lemmatized_words' column is present.



In [25]:
df = df.withColumn("lemmatized_words", lemma_udf(col("filtered_words")))

**Reasoning**:
All the preprocessing steps have been re-executed. Now, display the relevant columns to verify the output and complete the subtask.



In [26]:
df.select("text", "filtered_words", "stemmed_words", "lemmatized_words").show(truncate=False)

+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|text                                                                   |filtered_words                                                           |stemmed_words                                                |lemmatized_words                                                        |
+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|Cloud computing is becoming increasingly important for businesses      |[cloud, computing, becoming, increasingly, important, businesses]        |[clo

## Re-run tokenization

### Subtask:
Re-execute the tokenization step to create the 'words' column.


**Reasoning**:
Re-execute the tokenization step to create the 'words' column.



In [27]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df = tokenizer.transform(df)

IllegalArgumentException: Output column words already exists.

**Reasoning**:
The error indicates that the 'words' column already exists. Since the previous execution failed before displaying the results, the DataFrame `df` still contains the columns from the previous successful steps. I will proceed with the stop words removal step, as the 'words' column should be present.



In [28]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

IllegalArgumentException: requirement failed: Output Column filtered_words already exists.

**Reasoning**:
Similar to the previous error, the 'filtered_words' column also exists. This indicates that the previous successful executions have already created this column. I will proceed with the stemming step.



In [29]:
df = df.withColumn("stemmed_words", stem_udf(col("filtered_words")))

**Reasoning**:
The stemming step was successful, and the 'stemmed_words' column should now be updated. I will proceed with the lemmatization step.



In [30]:
df = df.withColumn("lemmatized_words", lemma_udf(col("filtered_words")))

**Reasoning**:
All the preprocessing steps (tokenization, stop words removal, stemming, and lemmatization) should now be completed and the corresponding columns should exist in the DataFrame. I will display the relevant columns to show the results of the preprocessing.



In [31]:
df.select("text", "filtered_words", "stemmed_words", "lemmatized_words").show(truncate=False)

+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|text                                                                   |filtered_words                                                           |stemmed_words                                                |lemmatized_words                                                        |
+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|Cloud computing is becoming increasingly important for businesses      |[cloud, computing, becoming, increasingly, important, businesses]        |[clo

## Display processed data

### Subtask:
Display the 'text', 'filtered_words', 'stemmed_words', and 'lemmatized_words' columns to verify the preprocessing.


**Reasoning**:
Display the specified columns from the DataFrame to verify the preprocessing steps.



In [32]:
df.select("text", "filtered_words", "stemmed_words", "lemmatized_words").show(truncate=False)

+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|text                                                                   |filtered_words                                                           |stemmed_words                                                |lemmatized_words                                                        |
+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|Cloud computing is becoming increasingly important for businesses      |[cloud, computing, becoming, increasingly, important, businesses]        |[clo

## Summary:

### Data Analysis Key Findings

*   The initial error was caused by the absence of the 'words' column, which is the output of the tokenization step.
*   Re-executing the tokenization, stop word removal, stemming, and lemmatization steps in the correct sequence resolved the issue.
*   The final DataFrame contains the original text, tokenized words (with stop words removed), stemmed words, and lemmatized words in separate columns.

### Insights or Next Steps

*   Ensure that all necessary intermediate columns are created in the correct order during sequential data processing steps.
*   Verify the output of each major preprocessing step to catch errors early in the process.


In [33]:
df.select("text", "filtered_words", "stemmed_words", "lemmatized_words").show(truncate=False)

+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|text                                                                   |filtered_words                                                           |stemmed_words                                                |lemmatized_words                                                        |
+-----------------------------------------------------------------------+-------------------------------------------------------------------------+-------------------------------------------------------------+------------------------------------------------------------------------+
|Cloud computing is becoming increasingly important for businesses      |[cloud, computing, becoming, increasingly, important, businesses]        |[clo

In [34]:
# Compute TF (Term Frequency) using HashingTF
hashingTF = HashingTF(inputCol="lemmatized_words", outputCol="raw_features", numFeatures=500)
df = hashingTF.transform(df)

# Compute IDF (Inverse Document Frequency)
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(df)
df = idf_model.transform(df)

# Show TF-IDF Features
df.select("text", "features").show(truncate=False)

+-----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                   |features                                                                                                                                                                                                             |
+-----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Cloud computing is becoming increasingly important for businesses      |(500,[13,287,345,374,467,480],[2.0794415416798357,2.07944154167

In [35]:
indexer = StringIndexer(inputCol="category", outputCol="label")
df = indexer.fit(df).transform(df)
df.select("category", "label").distinct().show()
# Split Data
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Train Logistic Regression Model
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)

# Predictions
predictions = lr_model.transform(test_data)
predictions.select("text", "category", "prediction").show(truncate=False)

# Evaluate Model Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_tf_idf = evaluator.evaluate(predictions)
print(f"TF-IDF Model Accuracy: {accuracy_tf_idf:.2f}")

+----------+-----+
|  category|label|
+----------+-----+
|Technology|  0.0|
|    Sports|  3.0|
|   Finance|  2.0|
|  Politics|  1.0|
+----------+-----+

+--------------------------------------------------------------------+----------+----------+
|text                                                                |category  |prediction|
+--------------------------------------------------------------------+----------+----------+
|Machine learning has revolutionized the way data is processed       |Technology|0.0       |
|The president addressed the nation in a live broadcast              |Politics  |2.0       |
|Machine learning has revolutionized the way data is processed       |Technology|0.0       |
|Political campaigns are gearing up for the upcoming elections       |Politics  |1.0       |
|The Formula 1 race track is set to host the next grand prix         |Sports    |3.0       |
|Financial experts are advising on diversifying investment portfolios|Finance   |1.0       |
+---------

In [36]:
df.select("category", "label").distinct().show()

+----------+-----+
|  category|label|
+----------+-----+
|Technology|  0.0|
|    Sports|  3.0|
|   Finance|  2.0|
|  Politics|  1.0|
+----------+-----+



In [37]:
from pyspark.ml.feature import Word2Vec # Import Word2Vec here
word2Vec = Word2Vec(vectorSize=100, minCount=1, inputCol="lemmatized_words", outputCol="featuresW2Vector")
word2Vec_model = word2Vec.fit(df)
df = word2Vec_model.transform(df)

df.select("text", "featuresW2Vector").show(truncate=False)

+-----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [38]:
train_data_w2v, test_data_w2v = df.randomSplit([0.8, 0.2], seed=42)

# Train Model
lr_w2v = LogisticRegression(featuresCol="featuresW2Vector", labelCol="label")
lr_w2v_model = lr_w2v.fit(train_data_w2v)

# Predictions
predictions_w2v = lr_w2v_model.transform(test_data_w2v)
predictions_w2v.select("text", "category", "prediction").show(truncate=False)

# Evaluate Model Accuracy
accuracy_w2v = evaluator.evaluate(predictions_w2v)
print(f"Word2Vec Model Accuracy: {accuracy_w2v:.2f}")

+--------------------------------------------------------------------+----------+----------+
|text                                                                |category  |prediction|
+--------------------------------------------------------------------+----------+----------+
|Machine learning has revolutionized the way data is processed       |Technology|0.0       |
|The president addressed the nation in a live broadcast              |Politics  |0.0       |
|Machine learning has revolutionized the way data is processed       |Technology|0.0       |
|Political campaigns are gearing up for the upcoming elections       |Politics  |1.0       |
|The Formula 1 race track is set to host the next grand prix         |Sports    |3.0       |
|Financial experts are advising on diversifying investment portfolios|Finance   |3.0       |
+--------------------------------------------------------------------+----------+----------+

Word2Vec Model Accuracy: 0.67
