In [73]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
!pip install pyspark



In [75]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import FloatType, IntegerType
from pyspark.sql.functions import col, when
from pyspark.sql.functions import udf
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

In [76]:
spark = SparkSession.builder.appName("Model").config("spark.executor.memory","4g").getOrCreate()

In [77]:
import html
schema = " free_text string, label_id int"
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

IN_PATH_RAW = "/content/drive/MyDrive/yt_comment_sentiment/train.csv"
IN_PATH_TEST = "/content/drive/MyDrive/yt_comment_sentiment/test.csv"

spark_reader = spark.read.schema(schema)

user_regex = r"(@\w{1,15})"
hashtag_regex = "(#\w{1,})"
url_regex=r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www.)+([-\w+&@#/%=~|$?!:,.]*)"
email_regex=r"[\w.-]+@[\w.-]+\.[a-zA-Z]{1,}"


@f.udf
def html_unescape(s: str):
    if isinstance(s, str):
        return html.unescape(s)
    return s


def clean_data(df):
    df = (
        df
        .withColumn("original_text", f.col("free_text"))
        # Remove numbers and characters from text
        .withColumn("free_text",f.regexp_replace(f.col("free_text"), "[^a-zA-ZÀ-ỹà-ỹ']", " ",))
        .withColumn("free_text", f.regexp_replace(f.col("free_text"), "'", ""))
        # Remove white space
        .withColumn("free_text",f.regexp_replace(f.col("free_text"), " +", " "))
        .withColumn("free_text",f.trim(f.col("free_text")))
        # Lowercase
        .withColumn("free_text",f.lower(f.col("free_text")))
        .withColumn("free_text", f.regexp_replace(f.col("free_text"), url_regex, ""))
        .withColumn("free_text", f.regexp_replace(f.col("free_text"), email_regex, ""))
        .withColumn("free_text", f.regexp_replace(f.col("free_text"), user_regex, ""))
        .withColumn("free_text", f.regexp_replace(f.col("free_text"), "#", " "))
        .withColumn("free_text", html_unescape(f.col("free_text")))
        .filter("free_text != ''")
    )
    return df

df_train_raw = spark_reader.csv(IN_PATH_RAW)
df_train_clean = clean_data(df_train_raw)
df_train_clean = df_train_clean.na.drop()
df_test_raw = spark_reader.csv(IN_PATH_TEST)
df_test_clean = clean_data(df_test_raw)
df_test_clean = df_test_clean.na.drop()

In [117]:
%%time
from pyspark.ml.feature import (
    StopWordsRemover,
    Tokenizer,
    HashingTF,
    IDF,
    CountVectorizer,
)
from pyspark.sql.functions import udf

with open('/content/drive/MyDrive/yt_comment_sentiment/Stopword.txt', 'r', encoding='utf-8') as file:
    vietnamese_stopwords = file.read().splitlines()

tokenizer = Tokenizer(inputCol="free_text", outputCol="words1") # chuyển sang vector
vietnamese_stopwords_remover = StopWordsRemover(inputCol="words1", outputCol="words2", stopWords=vietnamese_stopwords)

# hashing_tf = HashingTF(
#     inputCol="words2",
#     outputCol="term_frequency",
# )

vectorizer = CountVectorizer(inputCol="words2", outputCol="term_frequency")

idf = IDF(
    inputCol="term_frequency",
    outputCol="features",
    minDocFreq=5,
)
(training_data, validation_data) = df_train_clean.randomSplit([0.8, 0.2], seed=42)

CPU times: user 11.8 ms, sys: 8.8 ms, total: 20.6 ms
Wall time: 53.2 ms


# **LogisticRegression**

In [139]:
lr = LogisticRegression(labelCol='label_id')
semantic_analysis_pipeline = Pipeline(
    stages=[tokenizer, vietnamese_stopwords_remover, vectorizer, idf, lr]
)
semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

In [140]:
trained_df = semantic_analysis_model.transform(training_data)
val_df = semantic_analysis_model.transform(validation_data)
test_df = semantic_analysis_model.transform(df_test_clean)

In [141]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label_id", metricName="accuracy")

accuracy_train = evaluator.evaluate(trained_df)
accuracy_val = evaluator.evaluate(val_df)
accuracy_test = evaluator.evaluate(test_df)

print("\nTesting Training Data:")
print(f"Accuracy: {accuracy_train*100:.5f}%")
print("\nTesting Val Data:")
print(f"Accuracy: {accuracy_val*100:.5f}%")
print("\nTesting Data:")
print(f"Accuracy: {accuracy_test*100:.5f}%")


Testing Training Data:
Accuracy: 95.89654%

Testing Val Data:
Accuracy: 81.73178%

Testing Data:
Accuracy: 81.09913%


# **DecisionTreeClassifier**

In [143]:
# dt = DecisionTreeClassifier(labelCol='label_id')

In [144]:
# semantic_analysis_pipeline = Pipeline(
#     stages=[tokenizer, vietnamese_stopwords_remover, vectorizer, idf, dt]
# )
# semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

In [145]:
# trained_df = semantic_analysis_model.transform(training_data)
# val_df = semantic_analysis_model.transform(validation_data)
# test_df = semantic_analysis_model.transform(df_test_clean)

In [146]:
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# evaluator = MulticlassClassificationEvaluator(labelCol="label_id", metricName="accuracy")


# accuracy_val = evaluator.evaluate(val_df)
# accuracy_test = evaluator.evaluate(test_df)

# print("\nTesting Val Data:")
# print(f"Accuracy: {accuracy_val*100:.5f}%")
# print("\nTesting Data:")
# print(f"Accuracy: {accuracy_test*100:.5f}%")


Testing Val Data:
Accuracy: 84.07603%

Testing Data:
Accuracy: 83.74070%


# **RandomForestClassifier**

In [135]:
# rf = RandomForestClassifier(labelCol='label_id')

In [136]:
# semantic_analysis_pipeline = Pipeline(
#     stages=[tokenizer, vietnamese_stopwords_remover, vectorizer,idf, rf]
# )
# semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

In [137]:
# trained_df = semantic_analysis_model.transform(training_data)
# val_df = semantic_analysis_model.transform(validation_data)
# test_df = semantic_analysis_model.transform(df_test_clean)

In [138]:
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# evaluator = MulticlassClassificationEvaluator(labelCol="label_id", metricName="accuracy")

# accuracy_val = evaluator.evaluate(val_df)
# accuracy_test = evaluator.evaluate(test_df)

# print("\nTesting Val Data:")
# print(f"Accuracy: {accuracy_val*100:.5f}%")
# print("\nTesting Data:")
# print(f"Accuracy: {accuracy_test*100:.5f}%")


Testing Val Data:
Accuracy: 82.85111%

Testing Data:
Accuracy: 82.92090%


# save model

In [147]:
semantic_analysis_model.save('/content/drive/MyDrive/yt_comment_sentiment/sentimentModel')