<a href="https://colab.research.google.com/github/io-uty/2024-spark/blob/main/io-uty/movieDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import Counter

In [2]:
ls

[0m[01;34msample_data[0m/


In [3]:
pwd

'/content'

In [4]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [5]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

In [6]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [48]:
dfUser = spark.read.csv('user.csv', header = True, sep = ",")
dfRatings = spark.read.csv('ratings.csv', header = True, sep = ",")
dfMovie = spark.read.csv('movie.csv', header=True, sep=",")
dfMovieGenres = spark.read.csv('movie_genres.csv', header = True, sep = ",")\
        .dropDuplicates(['movieId', 'genre'])
dfMovieGenres.show()

+--------+-------+----------+
|mgenreId|movieId|     genre|
+--------+-------+----------+
|     389|      1| Animation|
|     431|      1|Children's|
|     553|      1|    Comedy|
|    1223|     10|     Drama|
|    2796|     10|       War|
|    1070|    100|     Crime|
|    1264|    100|     Drama|
|    2568|    100|  Thriller|
|     855|   1000|    Comedy|
|    2887|   1000|   Western|
|     856|   1001|    Comedy|
|     857|   1002|    Comedy|
|     519|   1003|Children's|
|     858|   1003|    Comedy|
|    2183|   1003|   Mystery|
|    1602|   1004|     Drama|
|    2888|   1004|   Western|
|    1603|   1005|     Drama|
|    1604|   1006|     Drama|
|    2530|   1006|    Sci-Fi|
+--------+-------+----------+
only showing top 20 rows



# **분석1: 특정 사용자 그룹의 영화 선호도 분석**



In [49]:
from pyspark.sql.functions import col
FemaleUsers=dfUser.filter(col('gender') == 'F')\
        .drop('occupation')\
        .drop('zip code')\
        .filter(col('age')>=20)\
        .filter(col('age')<=30)\
        .withColumnRenamed('user id', 'userId')
FemaleUsers.show()

+------+---+------+
|userId|age|gender|
+------+---+------+
|    12| 28|     F|
|    23| 30|     F|
|    24| 21|     F|
|    32| 28|     F|
|    35| 20|     F|
|    38| 28|     F|
|    43| 29|     F|
|    46| 27|     F|
|    49| 23|     F|
|    62| 27|     F|
|    96| 25|     F|
|   126| 28|     F|
|   128| 24|     F|
|   140| 30|     F|
|   150| 20|     F|
|   159| 23|     F|
|   165| 20|     F|
|   174| 30|     F|
|   175| 26|     F|
|   180| 22|     F|
+------+---+------+
only showing top 20 rows



In [77]:
userRating=FemaleUsers.join(dfRatings, FemaleUsers.userId == dfRatings.userId, 'inner')\
        .select(FemaleUsers['userId'],dfRatings['movieId'],dfRatings['rating'])\
        .drop('userId')
# Perform a join operation to bring the 'genre' column into the 'userRating' DataFrame
userRatingWithGenre = userRating.join(dfMovieGenres, userRating.movieId == dfMovieGenres.movieId, 'inner') \
    .select(userRating['movieId'], dfMovieGenres['genre'], userRating['rating'])\
    .drop('movieId')

userRatingWithGenre.show()

+----------+------+
|     genre|rating|
+----------+------+
|Children's|     5|
| Adventure|     5|
|   Romance|     5|
|    Comedy|     2|
| Adventure|     2|
|    Action|     2|
|    Sci-Fi|     2|
| Animation|     5|
|    Comedy|     5|
|Children's|     5|
|   Musical|     5|
|       War|     2|
|     Drama|     2|
|    Action|     4|
|   Romance|     4|
|       War|     4|
| Adventure|     4|
|     Drama|     3|
|    Comedy|     3|
|    Comedy|     3|
+----------+------+
only showing top 20 rows



In [101]:
count = dfMovieGenres.groupBy('genre').count()
movierate=userRatingWithGenre
genrecount = movierate.groupBy('genre','rating').count()\
      .orderBy('genre','rating')


In [102]:
from pyspark.sql.functions import col, sum as _sum
movieCount = genrecount.groupBy('genre').agg(_sum('count').alias('movieCount'))

genrecount = genrecount.withColumn('ratings', col('rating')*col('count'))\
        .join(movieCount, 'genre', 'inner')
ratingCount = genrecount.groupBy('genre').agg(_sum('ratings').alias('ratingCount'))
genrecount = genrecount.join(ratingCount, 'genre', 'inner')\
        .withColumn('averageRating', col('ratingCount')/col('movieCount'))

In [104]:
genrecount = genrecount.drop('rating')\
        .drop('count')\
        .drop('ratings')\
        .drop('ratingCount')\
        .dropDuplicates(['genre'])\
        .orderBy('averageRating', ascending = False)

In [105]:
genrecount.show()

+-----------+----------+------------------+
|      genre|movieCount|     averageRating|
+-----------+----------+------------------+
|        War|       850|3.6941176470588237|
|    Romance|      2439|3.5953259532595325|
|    Musical|       638|3.5940438871473352|
|  Film-Noir|       134| 3.582089552238806|
|      Drama|      4392| 3.566256830601093|
|  Animation|       497| 3.545271629778672|
|      Crime|       776| 3.506443298969072|
|Documentary|        69| 3.463768115942029|
|    Mystery|       489|3.4519427402862988|
|  Adventure|      1402| 3.412981455064194|
|   Thriller|      2232|3.4094982078853047|
|     Sci-Fi|      1204|3.3903654485049834|
| Children's|      1019| 3.379784102060844|
|     Action|      2452| 3.372756933115824|
|     Comedy|      3523|3.3624751632131704|
|    Western|       156| 3.217948717948718|
|     Horror|       571|3.1453590192644483|
|    Fantasy|       165|3.0424242424242425|
|    unknown|         1|               3.0|
+-----------+----------+--------

# **분석2: 고평점 영화와 저평점 영화의 특징 비교**


In [136]:
from pyspark.sql.functions import *
dfRatings = spark.read.csv('ratings.csv', header = True, sep = ",")
dfMovieGenres = spark.read.csv('movie_genres.csv', header = True, sep = ",")\
        .dropDuplicates(['movieId', 'genre'])

movieRatings = dfRatings.groupBy('movieId')\
        .agg(avg("rating").alias("avgRating"))

movieRatings.join(dfMovieGenres, movieRatings.movieId == dfMovieGenres.movieId)\
        .groupBy('genre')\
        .agg(sum(when(col('avgRating')>=4.0,1).otherwise(0)).alias('highRatedCount'),
                 sum(when(col('avgRating')<2.0,1).otherwise(0)).alias('lowRatedCount'))\
        .orderBy(col('highRatedCount'), ascending = False)\
        .orderBy(col('lowRatedCount'), ascending = False)

genre,highRatedCount,lowRatedCount
Drama,84,49
Comedy,36,37
Action,16,18
Thriller,33,13
Horror,3,13
Romance,27,9
Children's,2,8
Documentary,12,6
Sci-Fi,8,5
Adventure,15,4


# **분석3: 사용자 활동 분석**

In [168]:
dfRatings = spark.read.csv('ratings.csv', header = True, sep = ",")
ratingCount = dfRatings.groupBy('userId').agg(sum(when(col('rating')>=1,1).otherwise(0)).alias('ratingCount'))
dfRatings.groupBy('userId')\
  .agg(avg('rating').alias('averageRating'))\
  .join(ratingCount, 'userId', 'inner')\
  .orderBy('ratingCount', ascending = False)\
  .show()

+------+------------------+-----------+
|userId|     averageRating|ratingCount|
+------+------------------+-----------+
|   405|1.8344640434192674|        737|
|   655| 2.908029197080292|        685|
|    13|  3.09748427672956|        636|
|   450|3.8648148148148147|        540|
|   276| 3.465250965250965|        518|
|   416| 3.845841784989858|        493|
|   537|2.8653061224489798|        490|
|   303| 3.365702479338843|        484|
|   234| 3.122916666666667|        480|
|   393|3.3370535714285716|        448|
|   181|1.4919540229885058|        435|
|   279|3.2672811059907834|        434|
|   429| 3.393719806763285|        414|
|   846| 3.740740740740741|        405|
|     7| 3.965260545905707|        403|
|    94|            3.6575|        400|
|   682| 3.137844611528822|        399|
|   308|3.7581863979848866|        397|
|   293|3.0309278350515463|        388|
|    92|3.2448453608247423|        388|
+------+------------------+-----------+
only showing top 20 rows



# **분석4: 특정 장르의 영화들에 대한 평점과 사용자 연령대 분석**

In [170]:
dfRatings = spark.read.csv('ratings.csv', header = True, sep = ",")
dfUser = spark.read.csv('user.csv', header = True, sep = ",")
dfMovieGenres = spark.read.csv('movie_genres.csv', header = True, sep = ",")\
        .dropDuplicates(['movieId', 'genre'])\
        .filter(col('genre')=='Action')

In [180]:
from pyspark.sql.functions import avg
dfUser = dfUser.withColumnRenamed('user id', 'userId')

dfUser = dfUser.drop('gender')\
        .drop('occupation')\
        .drop('zip code')\
        .join(dfRatings, dfUser.userId == dfRatings.userId)\
        .select(dfUser['userId'], dfUser['age'], dfRatings['movieId'], dfRatings['rating'])\
        .drop('userId')\
        .join(dfMovieGenres, dfRatings.movieId == dfMovieGenres.movieId, 'inner')\
        .select(dfUser['age'], dfMovieGenres['genre'], dfRatings['rating'])\
        .drop('movieId')\
        .drop('genre')\

dfUser.groupBy('age')\
        .agg(avg('rating').alias('averageRating'))\
        .orderBy('age', ascending = True)\
        .show()

+---+------------------+
|age|     averageRating|
+---+------------------+
| 10|               4.0|
| 11| 3.076923076923077|
| 13|3.4934210526315788|
| 14| 3.593220338983051|
| 15|               3.5|
| 16|3.5217391304347827|
| 17| 3.651006711409396|
| 18|3.7417677642980935|
| 19| 3.307101727447217|
| 20| 3.696747967479675|
| 21|3.4532760472610096|
| 22|3.2726432532347505|
| 23| 3.193243243243243|
| 24|3.4625199362041466|
| 25|3.5377952755905513|
| 26| 3.243414120126449|
| 27|3.4154676258992804|
| 28|3.4402907580477673|
| 29| 3.422773393461105|
| 30| 3.348643006263048|
+---+------------------+
only showing top 20 rows

