In [2]:
import findspark

In [3]:
findspark.init("/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7")

### FIRST SIMPLE PRACTICE

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recom').getOrCreate()

In [None]:
df = spark.read.csv("movielens_ratings.csv", inferSchema=True, header=True)

In [None]:
df.show()

In [None]:
df.describe().show()

In [None]:
df.printSchema()

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", ratingCol="rating", itemCol="movieId")

In [None]:
train, test = df.randomSplit([0.8, 0.2])

In [None]:
model = als.fit(train)

In [None]:
result = model.transform(test)

In [None]:
result.show()

In [None]:
regEvaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [None]:
regEvaluator.evaluate(result) 

In [None]:
user_11 = df.filter(df['userId']==11).select(['movieId', 'userId'])

In [None]:
recommendation = model.transform(user_11)

In [None]:
recommendation.orderBy('prediction', ascending=False).show()

In [None]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("Recom")
sc = SparkContext(conf = conf)

In [24]:

# This function just creates a Python "dictionary" we can later
# use to convert movie ID's to movie names while printing out
# the final results.
def loadMovieNames():
    movieNames = {}
    try:
        with open("./ml-100k/u.item") as f:
            for line in f:
                fields = line.split('|')
                movieNames[int(fields[0])] = fields[1]
    except:
        print("Problem loading some shares of data")
    return movieNames

# Take each line of u.data and convert it to (movieID, (rating, 1.0))
# This way we can then add up all the ratings for each movie, and
# the total number of ratings for each movie (which lets us compute the average)
def parseInput(line):
    fields = line.split()
    return (int(fields[1]), (float(fields[2]), 1.0))

if __name__ == "__main__":
    # The main script - create our SparkContext


    # Load up our movie ID -> movie name lookup table
    movieNames = loadMovieNames()
    print(movieNames[669])

    # Load up the raw u.data file
    lines = sc.textFile("./ml-100k/u.data")

    # Convert to (movieID, (rating, 1.0))
    movieRatings = lines.map(parseInput)
#     print(movieRatings.collect())

    # Reduce to (movieID, (sumOfRatings, totalRatings))
    ratingTotalsAndCount = movieRatings.reduceByKey(lambda movie1, movie2: ( movie1[0] + movie2[0], movie1[1] + movie2[1] ) )

    # Filter out movies rated 10 or fewer times
    popularTotalsAndCount = ratingTotalsAndCount.filter(lambda x: x[1][1] > 10)
    
    

    # Map to (rating, averageRating)
    averageRatings = popularTotalsAndCount.mapValues(lambda totalAndCount : totalAndCount[0] / totalAndCount[1])
    
    
    # Sort by average rating
    sortedMovies = averageRatings.sortBy(lambda x: x[1])

    # Take the top 10 results
    results = sortedMovies.take(10)
    print(results)

    # Print them out:
    for result in results:
        try:
            print(movieNames[result[0]], result[1])
        except:
            print("Error printing the value ...")

Problem loading some shares of data


KeyError: 669

### New More Challenging Dataset

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recom').getOrCreate()


In [None]:
from pyspark.sql import Row

In [None]:
data = spark.read.text("./ml-100k/u.data").rdd

In [None]:
df = spark.read.text("./ml-100k/u.item").rdd
df.collect()

In [None]:
def parse_date(line):
    data = line.value.split()
    return Row(userID=int(data[0]), movieID=int(data[1]), rating=float(data[2]))

In [None]:
ratingData = data.map(parse_date)

In [None]:
ratingData.collect()

In [None]:
newDF = ratingData.reduceByKey(lambda movie1, movie2: ( movie1[0] + movie2[0], movie1[1] + movie2[1] ) )

In [None]:
# Filter out movies rated 10 or fewer times
popularTotalsAndCount = newDF.filter(lambda x: x[1][1] > 10)


# Map to (rating, averageRating)
averageRatings = popularTotalsAndCount.mapValues(lambda totalAndCount : totalAndCount[0] / totalAndCount[1])

# Sort by average rating
sortedMovies = averageRatings.sortBy(lambda x: x[1])

for result in sortedMovies:
    print(result[0], result[1])

# # Take the top 10 results
# results = sortedMovies.take(10)

# Print them out:
# for result in results:
#     print(movieNames[result[0]], result[1])

In [None]:
Data_df_none_cache = spark.createDataFrame(ratingData)


In [None]:
Data_df = spark.createDataFrame(ratingData).cache()
Data_df.collect()

In [None]:
Data_df_none_cache = spark.createDataFrame(ratingData)
Data_df_none_cache.show()

In [None]:
from pyspark.ml.recommendation import ALS

In [None]:
als = ALS(maxIter=5, regParam=0.01, userCol="userID", ratingCol="rating", itemCol="movieID")

In [None]:
fittedDF = als.fit(Data_df)