<a href="https://colab.research.google.com/gist/Uf4i/a33a200a8043d02c031ed21844378962/fyp-enriched-data-ml-training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PySpark
!pip install -q pyspark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HybridRecommender") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

print(" Spark session started successfully!")


 Spark session started successfully!


In [3]:
spark.range(5).show()




+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
ratings_path = "/content/drive/MyDrive/MovieLens/ratings.csv"
ratings_df = spark.read.csv(ratings_path, header=True, inferSchema=True)

ratings_df.show(5)
ratings_df.printSchema()


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
+------+-------+------+---------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



Als collabrative filtering model

In [6]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    implicitPrefs=False,
    maxIter=10,
    regParam=0.1,
    rank=10
)

model = als.fit(ratings_df)

# Recommend top 5 movies per user
user_recs = model.recommendForAllUsers(5)
user_recs.show(5, truncate=False)


+------+-------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                        |
+------+-------------------------------------------------------------------------------------------------------+
|1     |[{194434, 5.5407915}, {227066, 5.4236727}, {270306, 5.418744}, {240070, 5.418744}, {240054, 5.418744}] |
|3     |[{274047, 6.990751}, {222368, 6.356542}, {226196, 6.066877}, {178501, 5.698645}, {229947, 5.5583177}]  |
|5     |[{222368, 6.263216}, {166641, 5.666271}, {225435, 5.6541667}, {205277, 5.4591346}, {209193, 5.4453998}]|
|6     |[{222368, 7.7740064}, {225435, 7.676825}, {214910, 7.401993}, {287975, 7.3870044}, {166641, 7.088755}] |
|9     |[{151989, 6.7152753}, {222368, 6.5070086}, {270306, 6.494844}, {240070, 6.494844}, {240054, 6.494844}] |
+------+----------------------------------------------------------------------------------------

convert als to pandas

In [7]:
user_recs_pd = user_recs.toPandas()


In [8]:
import pandas as pd

# Flatten ALS output
flattened_recs = []

for _, row in user_recs_pd.iterrows():
    user_id = row['userId']
    for rec in row['recommendations']:
        flattened_recs.append({
            'userId': user_id,
            'movieId': rec['movieId'],
            'predicted_rating': rec['rating']
        })

als_flat_df = pd.DataFrame(flattened_recs)
als_flat_df.head()


Unnamed: 0,userId,movieId,predicted_rating
0,1,194434,5.540792
1,1,227066,5.423673
2,1,270306,5.418744
3,1,240070,5.418744
4,1,240054,5.418744


In [9]:
movies_path = "/content/drive/MyDrive/MovieLens/movies_tmdbMetadata.csv"
movies_df = pd.read_csv(movies_path)

print(movies_df.columns)


Index(['movieId', 'title', 'genres', 'tmdb_id', 'poster_url', 'trailer_key',
       'trailer_url'],
      dtype='object')


merge als + movie metada

In [10]:
hybrid_df = als_flat_df.merge(movies_df, on="movieId", how="left")
hybrid_df.head()


Unnamed: 0,userId,movieId,predicted_rating,title,genres,tmdb_id,poster_url,trailer_key,trailer_url
0,1,194434,5.540792,Adrenaline (1990),(no genres listed),171272.0,https://image.tmdb.org/t/p/w500/1uZ1dVfutxL25s...,,
1,1,227066,5.423673,Friendly Fire (2006),Drama,529266.0,https://image.tmdb.org/t/p/w500/kABqMHdaj6mPLW...,-Aam3aUqeTo,https://www.youtube.com/watch?v=-Aam3aUqeTo
2,1,270306,5.418744,WWE: The Triumph and Tragedy of World Class Ch...,Action|Documentary,,,,
3,1,240070,5.418744,SpongeBob SquarePants: Heroes of Bikini Bottom...,Animation,299712.0,https://image.tmdb.org/t/p/w500/3dBknjxDj5KUPg...,,
4,1,240054,5.418744,SpongeBob SquarePants: Tide and Seek,Animation|Comedy,,,,


In [11]:
hybrid_df.to_csv("/content/drive/MyDrive/MovieLens/hybrid_recommendations.csv", index=False)
print("Hybrid recommendations saved to Drive.")


Hybrid recommendations saved to Drive.
