In [23]:
# Display the installed PySpark version.
# NOTE(review): this cell is placed above the cell that runs `import pyspark`,
# and its execution count (23) shows it was run after the cells below. On a
# fresh top-to-bottom run it raises NameError; it belongs after the imports.
pyspark.__version__

'2.4.4'

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Get (or create) the Spark session used by every cell below; the application
# is registered under the name "feat-eng".
session_builder = SparkSession.builder.appName("feat-eng")
spark = session_builder.getOrCreate()

In [4]:
# Load the anime catalogue. The first line of the file is the header and the
# column types are inferred from the data; the resulting schema is printed.
anime_df = spark.read.csv(
    path='../dataset/anime.csv',
    inferSchema=True,
    header=True,
)
anime_df.printSchema()

root
 |-- anime_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- members: integer (nullable = true)



In [5]:
from pyspark.ml.feature import StringIndexer


In [6]:
# Produce one row per (anime, genre): split the comma-separated `genre`
# string, explode the pieces into separate rows, then strip the whitespace
# left around each piece.
genre_pieces = split(col('genre'), '[,]')
genre_df = (
    anime_df
    .withColumn('genre_item', explode(genre_pieces))
    .withColumn('genre_item', trim(col('genre_item')))
)

In [7]:
# Preview the first 10 exploded (anime_id, genre_item) pairs.
genre_df.select('anime_id', 'genre_item').show(10)


+--------+------------+
|anime_id|  genre_item|
+--------+------------+
|   32281|       Drama|
|   32281|     Romance|
|   32281|      School|
|   32281|Supernatural|
|    5114|      Action|
|    5114|   Adventure|
|    5114|       Drama|
|    5114|     Fantasy|
|    5114|       Magic|
|    5114|    Military|
+--------+------------+
only showing top 10 rows



In [8]:
# Assign every distinct genre string a numeric index, then cast that index to
# int so it can later be used as a position in the multi-hot array.
string_indexer = StringIndexer(outputCol='genre_index', inputCol='genre_item')
string_indexer_model = string_indexer.fit(genre_df)
genre_indexed_df = (
    string_indexer_model
    .transform(genre_df)
    .withColumn('genre_index', col('genre_index').cast('int'))
)

In [9]:
# Preview the exploded rows together with their integer `genre_index`.
genre_indexed_df.show()

+--------+--------------------+--------------------+-----+--------+------+-------+------------+-----------+
|anime_id|                name|               genre| type|episodes|rating|members|  genre_item|genre_index|
+--------+--------------------+--------------------+-----+--------+------+-------+------------+-----------+
|   32281|      Kimi no Na wa.|Drama, Romance, S...|Movie|       1|  9.37| 200630|       Drama|          5|
|   32281|      Kimi no Na wa.|Drama, Romance, S...|Movie|       1|  9.37| 200630|     Romance|          8|
|   32281|      Kimi no Na wa.|Drama, Romance, S...|Movie|       1|  9.37| 200630|      School|          9|
|   32281|      Kimi no Na wa.|Drama, Romance, S...|Movie|       1|  9.37| 200630|Supernatural|         12|
|    5114|Fullmetal Alchemi...|Action, Adventure...|   TV|      64|  9.26| 793665|      Action|          1|
|    5114|Fullmetal Alchemi...|Action, Adventure...|   TV|      64|  9.26| 793665|   Adventure|          2|
|    5114|Fullmetal Alchemi.

In [10]:
# Collapse back to one row per anime, gathering all of its genre indexes into
# a single array column named `genre_indexes`.
genre_index_list = collect_list('genre_index').alias('genre_indexes')
pre_multihot_df = genre_indexed_df.groupby('anime_id').agg(genre_index_list)

In [11]:
# Preview the per-anime arrays of genre indexes (first 10 rows).
pre_multihot_df.show(10)


+--------+--------------------+
|anime_id|       genre_indexes|
+--------+--------------------+
|     148|          [5, 8, 10]|
|     463| [1, 2, 0, 3, 6, 22]|
|     471|[0, 5, 27, 8, 9, 10]|
|     496|    [2, 5, 3, 15, 6]|
|     833|    [1, 0, 13, 4, 6]|
|    1088|[1, 13, 23, 14, 8...|
|    1238|        [2, 0, 3, 6]|
|    1342|          [1, 5, 26]|
|    1580|      [2, 28, 6, 12]|
|    1591|           [0, 8, 9]|
+--------+--------------------+
only showing top 10 rows



In [12]:
# Largest genre index present in the data; it fixes the length of the
# multi-hot vectors (max index + 1).
# NOTE: `max` here is the Spark SQL aggregate brought in by
# `from pyspark.sql.functions import *`, not Python's builtin `max`.
max_index_row = genre_indexed_df.agg(max(col('genre_index'))).head()
max_genre_index = max_index_row['max(genre_index)']

In [13]:
# Display the largest genre index computed in the previous cell.
max_genre_index


42

In [14]:
import numpy as np

@udf(returnType='array<int>')
def multihot_list(l, max_index):
    """Turn a list of category indexes into a multi-hot list of 0/1 integers.

    Args:
        l: iterable of integer indexes (the collected genre indexes of one
            row). May be None when the column value is null.
        max_index: largest index that can occur; the result has
            ``max_index + 1`` positions.

    Returns:
        A list of ints of length ``max_index + 1`` holding 1 at every index
        found in ``l`` and 0 elsewhere, or None when ``l`` or ``max_index``
        is null.
    """
    # Spark hands SQL nulls to Python UDFs as None; propagate the null
    # instead of failing the whole job on such a row.
    if l is None or max_index is None:
        return None
    fill = np.zeros(max_index + 1, dtype=np.int32)
    for i in l:
        # The array column is declared containsNull = true. A None element
        # must be skipped: `fill[None] = 1` is NumPy newaxis indexing and
        # would silently set *every* position to 1.
        if i is None:
            continue
        fill[i] = 1
    # tolist() converts NumPy scalars to plain Python ints, which is what
    # Spark expects for an array<int> result.
    return fill.tolist()

In [15]:
# Add the multi-hot encoding of each anime's genres. The vector length is
# passed to the UDF as a literal column so every row uses the same size.
max_index_col = lit(max_genre_index)
multihot_df = pre_multihot_df.withColumn(
    'genre_multihot',
    multihot_list(col('genre_indexes'), max_index_col),
)


In [16]:
# Check that `genre_multihot` was added as an array-of-integer column.
multihot_df.printSchema()


root
 |-- anime_id: integer (nullable = true)
 |-- genre_indexes: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- genre_multihot: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [17]:
# Preview the genre indexes next to their multi-hot encoding (first 10 rows).
multihot_df.show(10)


+--------+--------------------+--------------------+
|anime_id|       genre_indexes|      genre_multihot|
+--------+--------------------+--------------------+
|     148|          [5, 8, 10]|[0, 0, 0, 0, 0, 1...|
|     463| [1, 2, 0, 3, 6, 22]|[1, 1, 1, 1, 0, 0...|
|     471|[0, 5, 27, 8, 9, 10]|[1, 0, 0, 0, 0, 1...|
|     496|    [2, 5, 3, 15, 6]|[0, 0, 1, 1, 0, 1...|
|     833|    [1, 0, 13, 4, 6]|[1, 1, 0, 0, 1, 0...|
|    1088|[1, 13, 23, 14, 8...|[0, 1, 0, 0, 1, 0...|
|    1238|        [2, 0, 3, 6]|[1, 0, 1, 1, 0, 0...|
|    1342|          [1, 5, 26]|[0, 1, 0, 0, 0, 1...|
|    1580|      [2, 28, 6, 12]|[0, 0, 1, 0, 0, 0...|
|    1591|           [0, 8, 9]|[1, 0, 0, 0, 0, 0...|
+--------+--------------------+--------------------+
only showing top 10 rows



### Average Rating Min-Max Scale


In [18]:
# Load the user ratings and keep only rows whose rating is strictly positive.
# NOTE(review): presumably non-positive values mark "not rated" entries;
# confirm against the dataset description.
ratings_raw_df = spark.read.csv('../dataset/rating.csv', header=True, inferSchema=True)
rating_df = ratings_raw_df.filter(col('rating') > 0)

In [19]:
# Average the remaining user ratings per anime.
mean_rating = mean('rating').alias('ave_rating')
ave_rating_df = rating_df.groupby('anime_id').agg(mean_rating)

In [20]:
# Preview the per-anime average rating (first 10 rows).
ave_rating_df.show(10)


+--------+-----------------+
|anime_id|       ave_rating|
+--------+-----------------+
|   24171|7.386666666666667|
|    9465|8.098352214212152|
|   17679|7.293103448275862|
|    1829|7.341757827235005|
|    8086|7.939071817474721|
|   17389|8.601839684625492|
|   22097| 8.13076923076923|
|   30654|8.687342833193629|
|    5300|8.694010416666666|
|    6336|8.497902097902099|
+--------+-----------------+
only showing top 10 rows



In [21]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline

In [22]:
# MinMaxScaler operates on vector columns, so the scalar `ave_rating` is first
# wrapped into a one-element vector and that vector is then rescaled.
vec_assembler = VectorAssembler(outputCol='ave_rating_vec', inputCols=['ave_rating'])
ave_rating_scaler = MinMaxScaler(outputCol='ave_rating_scaled', inputCol='ave_rating_vec')
pipeline = Pipeline(stages=[vec_assembler, ave_rating_scaler])

# Fit the two-stage pipeline and apply it to the same frame it was fitted on.
rating_scaling_model = pipeline.fit(ave_rating_df)
rating_scaled_df = rating_scaling_model.transform(ave_rating_df)

In [23]:
# Check the columns added by the pipeline: both new columns are vectors.
rating_scaled_df.printSchema()


root
 |-- anime_id: integer (nullable = true)
 |-- ave_rating: double (nullable = true)
 |-- ave_rating_vec: vector (nullable = true)
 |-- ave_rating_scaled: vector (nullable = true)



In [24]:
# Preview the raw, assembled and min-max scaled average ratings (first 10 rows).
rating_scaled_df.show(10)


+--------+-----------------+-------------------+--------------------+
|anime_id|       ave_rating|     ave_rating_vec|   ave_rating_scaled|
+--------+-----------------+-------------------+--------------------+
|   24171|7.386666666666667|[7.386666666666667]|[0.7096296296296296]|
|    9465|8.098352214212152|[8.098352214212152]|[0.7887058015791281]|
|   17679|7.293103448275862|[7.293103448275862]|[0.6992337164750958]|
|    1829|7.341757827235005|[7.341757827235005]|[0.7046397585816673]|
|    8086|7.939071817474721|[7.939071817474721]|[0.7710079797194135]|
|   17389|8.601839684625492|[8.601839684625492]| [0.844648853847277]|
|   22097| 8.13076923076923| [8.13076923076923]|[0.7923076923076923]|
|   30654|8.687342833193629|[8.687342833193629]| [0.854149203688181]|
|    5300|8.694010416666666|[8.694010416666666]|[0.8548900462962963]|
|    6336|8.497902097902099|[8.497902097902099]|[0.8331002331002332]|
+--------+-----------------+-------------------+--------------------+
only showing top 10 

In [25]:
@udf(returnType='float')
def unwrap_list(rating):
    """Return the first component of an ML vector as a plain Python float.

    Used to turn the one-element vector column produced by the scaler back
    into a scalar column. The UDF is declared with Spark type 'float'.
    """
    first_component = rating.toArray()[0]
    # Convert the NumPy scalar to a builtin float before handing it to Spark.
    return float(first_component)

In [26]:
# Extract the scaled value from its one-element vector into a scalar column.
scaled_scalar = unwrap_list(col('ave_rating_scaled'))
rating_scaled_df = rating_scaled_df.withColumn('ave_rating_minmax', scaled_scalar)

In [27]:
# Preview the frame again, now including the scalar `ave_rating_minmax` column.
rating_scaled_df.show(10)


+--------+-----------------+-------------------+--------------------+-----------------+
|anime_id|       ave_rating|     ave_rating_vec|   ave_rating_scaled|ave_rating_minmax|
+--------+-----------------+-------------------+--------------------+-----------------+
|   24171|7.386666666666667|[7.386666666666667]|[0.7096296296296296]|       0.70962965|
|    9465|8.098352214212152|[8.098352214212152]|[0.7887058015791281]|        0.7887058|
|   17679|7.293103448275862|[7.293103448275862]|[0.6992337164750958]|        0.6992337|
|    1829|7.341757827235005|[7.341757827235005]|[0.7046397585816673]|       0.70463973|
|    8086|7.939071817474721|[7.939071817474721]|[0.7710079797194135]|       0.77100796|
|   17389|8.601839684625492|[8.601839684625492]| [0.844648853847277]|       0.84464884|
|   22097| 8.13076923076923| [8.13076923076923]|[0.7923076923076923]|        0.7923077|
|   30654|8.687342833193629|[8.687342833193629]| [0.854149203688181]|        0.8541492|
|    5300|8.694010416666666|[8.6

In [28]:
# Keep only the join key and the final scaled rating; the intermediate
# vector columns are dropped.
rating_result_df = rating_scaled_df.select('anime_id', 'ave_rating_minmax')

In [29]:
# Attach the scaled average rating to the anime catalogue. This is an inner
# join (the default), so anime without any retained rating are dropped.
result_df = anime_df.join(rating_result_df, on='anime_id', how='inner')

In [30]:
# Check that the joined frame carries the catalogue columns plus `ave_rating_minmax`.
result_df.printSchema()


root
 |-- anime_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- episodes: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- members: integer (nullable = true)
 |-- ave_rating_minmax: float (nullable = true)



In [31]:
# Preview the joined result (first 10 rows).
result_df.show(10)


+--------+--------------------+--------------------+-----+--------+------+-------+-----------------+
|anime_id|                name|               genre| type|episodes|rating|members|ave_rating_minmax|
+--------+--------------------+--------------------+-----+--------+------+-------+-----------------+
|   24171|     Mushibugyou OVA|Action, Fantasy, ...|  OVA|       3|   7.2|   3636|       0.70962965|
|    9465|Break Blade 4: Sa...|Action, Fantasy, ...|Movie|       1|  7.99|  41598|        0.7887058|
|   17679|               Gambo|  Demons, Historical|Movie|       1|  6.78|   4232|        0.6992337|
|    1829|          Gedo Senki|Adventure, Fantas...|Movie|       1|  7.18|  59243|       0.70463973|
|    8086|Densetsu no Yuush...|Action, Adventure...|   TV|      24|  7.83| 130689|       0.77100796|
|   17389|  Kingdom 2nd Season|Action, Historica...|   TV|      39|  8.57|  31234|       0.84464884|
|   22097|Magi: Sinbad no B...|Action, Adventure...|  OVA|       5|  8.06|  52351|        0

In [None]:
# Write the joined features out as CSV with a header row.
#
# Fixes relative to the never-executed draft of this cell:
#   * `true` is not a Python name (NameError); the boolean literal is `True`.
#   * an empty string is not a usable output location, so a real path is
#     supplied.
# Spark writes a *directory* of part files at this path and, with the default
# save mode, fails if the path already exists.
# TODO(review): the directory name below is a placeholder next to the input
# data; change it to the intended output location.
output_path = '../dataset/anime_features'
result_df \
    .write.format('csv') \
    .option('header', True) \
    .save(output_path)