In [1]:
!pip install --ignore-installed -q pyspark==3.2.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Mount Google Drive inside the Colab VM; the CSV inputs are read from
# /content/drive/MyDrive in a later cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [42]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.sql.functions import col, split, array, collect_list,when,explode
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, split
from pyspark.ml.feature import StringIndexer, VectorAssembler, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [37]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Movie Rating Prediction with Logistic Regression Using MovieLens Dataset")
    .getOrCreate()
)

# Read the ratings and movies CSV files from the mounted Drive.
# Both reads treat the first row as a header and let Spark infer column types.
csv_options = {"header": True, "inferSchema": True}
ratings = spark.read.csv("/content/drive/MyDrive/ratingsmall.csv", **csv_options)
movies = spark.read.csv("/content/drive/MyDrive/moviesmall.csv", **csv_options)
# The tags dataset is currently disabled:
# tags = spark.read.csv("/content/drive/MyDrive/tags.csv", header=True, inferSchema=True)

# Print each inferred schema under its own heading.
for heading, frame in (("Ratings Schema:", ratings), ("Movies Schema:", movies)):
    print(heading)
    frame.printSchema()


Ratings Schema:
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

Movies Schema:
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [38]:
# Preview the leading rows of the ratings frame, then the movies frame.
for preview_frame in (ratings, movies):
    preview_frame.show()
# tags.show()  # disabled together with the tags load

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
| 

In [39]:
# Turn the rating into a binary target: label = 1 when rating >= 4.0,
# label = 0 when rating < 4.0.
is_positive_rating = col("rating") >= 4
ratings = ratings.withColumn("label", is_positive_rating.cast("integer"))

# Inspect the result: schema first, then a sample of rows.
ratings.printSchema()
ratings.show()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- label: integer (nullable = true)

+------+-------+------+---------+-----+
|userId|movieId|rating|timestamp|label|
+------+-------+------+---------+-----+
|     1|      1|   4.0|964982703|    1|
|     1|      3|   4.0|964981247|    1|
|     1|      6|   4.0|964982224|    1|
|     1|     47|   5.0|964983815|    1|
|     1|     50|   5.0|964982931|    1|
|     1|     70|   3.0|964982400|    0|
|     1|    101|   5.0|964980868|    1|
|     1|    110|   4.0|964982176|    1|
|     1|    151|   5.0|964984041|    1|
|     1|    157|   5.0|964984100|    1|
|     1|    163|   5.0|964983650|    1|
|     1|    216|   5.0|964981208|    1|
|     1|    223|   3.0|964980985|    0|
|     1|    231|   5.0|964981179|    1|
|     1|    235|   4.0|964980908|    1|
|     1|    260|   5.0|964981680|    1|
|     1|    296|   3.0|964982967|    

In [40]:
# Encode userId and movieId as numeric indices with StringIndexer.
#
# `ratings` is reassigned below, so on a second execution of this cell it
# already carries userIndex/movieIndex and transform() would fail because the
# output column exists. Dropping any stale index columns first (a no-op on the
# first run) makes the cell safe to re-run.
ratings = ratings.drop("userIndex", "movieIndex")

user_indexer = StringIndexer(inputCol="userId", outputCol="userIndex").fit(ratings)
movie_indexer = StringIndexer(inputCol="movieId", outputCol="movieIndex").fit(ratings)

# Append both index columns to the ratings frame.
ratings = user_indexer.transform(ratings)
ratings = movie_indexer.transform(ratings)


In [44]:
# 3. Prepare the movies' genres column.
# Drop columns produced by a previous execution of this cell (a no-op on the
# first run) so the cell can be re-run without "column already exists" errors.
movies = movies.drop("genreTokens", "rawGenreFeatures", "genreFeatures")

# Split the '|'-separated genres string into an array of genre tokens.
# HashingTF only accepts an array column, so feeding it the raw string column
# fails on a fresh kernel. The tokens go into a NEW column so the original
# `genres` string stays intact, and the regex is a raw string so `\|` is not
# treated as an (invalid) Python escape sequence.
movies = movies.withColumn("genreTokens", split(col("genres"), r"\|"))

# 4. Build TF-IDF features from the genre tokens.
# Term-frequency vector via feature hashing; numFeatures is tunable.
hashingTF = HashingTF(inputCol="genreTokens", outputCol="rawGenreFeatures", numFeatures=20)
movies = hashingTF.transform(movies)

# Re-weight the term frequencies by inverse document frequency.
idf = IDF(inputCol="rawGenreFeatures", outputCol="genreFeatures")
idf_model = idf.fit(movies)
movies = idf_model.transform(movies)

In [45]:
# 4. Attach each movie's TF-IDF genre vector to its rating rows.
# The join is a left join on movieId, so every rating row is kept.
movie_genre_vectors = movies.select("movieId", "genreFeatures")
ratings_with_genres = ratings.join(movie_genre_vectors, on="movieId", how="left")

In [46]:
# 5. Combine all model inputs into a single `features` vector column.
# NOTE(review): userIndex and movieIndex are StringIndexer ordinals and
# timestamp is a raw integer; all three enter the vector as plain numeric
# values. Consider one-hot encoding / scaling — confirm with the author.
feature_columns = ["userIndex", "movieIndex", "timestamp", "genreFeatures"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# `ratings_with_genres` is reassigned here, so on a re-run it already contains
# `features` and transform() would fail on the existing output column.
# Dropping it first is a no-op on the first run.
ratings_with_genres = assembler.transform(ratings_with_genres.drop("features"))


In [47]:
# 6. Randomly split the assembled data into training and test sets
# using 0.8 / 0.2 weights and a fixed seed.
split_weights = [0.8, 0.2]
split_seed = 1234
train, test = ratings_with_genres.randomSplit(split_weights, seed=split_seed)

In [48]:
# 7. Logistic regression
# Configure the estimator, fit it on the training split and score the test split.
lr = LogisticRegression(maxIter=10, labelCol="label", featuresCol="features")
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)

# Accuracy evaluator; the SVM cell further down reuses this same object.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression Accuracy with Binary Labels: {lr_accuracy:.4f}")

Logistic Regression Accuracy with Binary Labels: 0.5961


In [50]:
# 9. Confusion matrix for the logistic regression model.
lr_conf_matrix = lr_predictions.groupBy("label", "prediction").count()
lr_conf_matrix.show()

# Collect the (at most four) aggregated rows once and index them by
# (label, prediction). A combination that never occurs -- e.g. when the model
# predicts a single class -- has no row at all, so it is read with a default
# of 0 instead of indexing into an empty collect() result.
lr_cell_counts = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in lr_conf_matrix.collect()
    if row["label"] is not None
}
tp = lr_cell_counts.get((1, 1), 0)
tn = lr_cell_counts.get((0, 0), 0)
fp = lr_cell_counts.get((0, 1), 0)
fn = lr_cell_counts.get((1, 0), 0)

# TPR and FPR; reported as NaN when the test set holds no actual
# positives / negatives rather than raising ZeroDivisionError.
tpr = tp / (tp + fn) if (tp + fn) else float("nan")  # True Positive Rate (Sensitivity/Recall)
fpr = fp / (fp + tn) if (fp + tn) else float("nan")  # False Positive Rate

print(f"Confusion Matrix for Logistic Regression:\nTP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
# Report TPR and FPR.
print(f"True Positive Rate (TPR): {tpr:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 4371|
|    0|       0.0| 6767|
|    1|       1.0| 5148|
|    0|       1.0| 3703|
+-----+----------+-----+

Confusion Matrix for Logistic Regression:
TP: 5148, TN: 6767, FP: 3703, FN: 4371
True Positive Rate (TPR): 0.5408
False Positive Rate (FPR): 0.3537


In [51]:
# 8. Support vector machine (linear SVC)
# Configure the estimator, fit it on the training split and score the test split.
svm = LinearSVC(maxIter=10, labelCol="label", featuresCol="features")
svm_model = svm.fit(train)
svm_predictions = svm_model.transform(test)

# Accuracy is measured with the `evaluator` created in the logistic
# regression cell.
svm_accuracy = evaluator.evaluate(svm_predictions)
print(f"SVM Accuracy with Binary Labels: {svm_accuracy:.4f}")

SVM Accuracy with Binary Labels: 0.5915


In [52]:
# Confusion matrix for the SVM model.
svm_conf_matrix = svm_predictions.groupBy("label", "prediction").count()
svm_conf_matrix.show()

# Collect the (at most four) aggregated rows once and index them by
# (label, prediction). A combination that never occurs -- e.g. when the model
# predicts a single class -- has no row at all, so it is read with a default
# of 0 instead of indexing into an empty collect() result.
svm_cell_counts = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in svm_conf_matrix.collect()
    if row["label"] is not None
}
tp_svm = svm_cell_counts.get((1, 1), 0)
tn_svm = svm_cell_counts.get((0, 0), 0)
fp_svm = svm_cell_counts.get((0, 1), 0)
fn_svm = svm_cell_counts.get((1, 0), 0)

# Print the SVM confusion matrix.
print(f"Confusion Matrix for SVM:\nTP: {tp_svm}, TN: {tn_svm}, FP: {fp_svm}, FN: {fn_svm}")

# TPR and FPR; reported as NaN when the test set holds no actual
# positives / negatives rather than raising ZeroDivisionError.
tpr_svm = tp_svm / (tp_svm + fn_svm) if (tp_svm + fn_svm) else float("nan")  # True Positive Rate (Sensitivity/Recall)
fpr_svm = fp_svm / (fp_svm + tn_svm) if (fp_svm + tn_svm) else float("nan")  # False Positive Rate

# Report the SVM's TPR and FPR.
print(f"SVM - True Positive Rate (TPR): {tpr_svm:.4f}")
print(f"SVM - False Positive Rate (FPR): {fpr_svm:.4f}")

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 4599|
|    0|       0.0| 6904|
|    1|       1.0| 4920|
|    0|       1.0| 3566|
+-----+----------+-----+

Confusion Matrix for SVM:
TP: 4920, TN: 6904, FP: 3566, FN: 4599
SVM - True Positive Rate (TPR): 0.5169
SVM - False Positive Rate (FPR): 0.3406


In [None]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()