<h1 align="center">BDCC project 1</h1>

<h4 align="center">By: António Almeida, Miguel Ramalho</h4>

<h5 align="center"><a href="http://www.dcc.fc.up.pt/~edrdo/aulas/bdcc">Big Data and Cloud Computing</a>, DCC/FCUP</h5>


## Code necessary to run from the command line 

In [1]:
if __name__ == "__main__" :
    # Command-line entry point: create the single local Spark session that the
    # rest of this file reaches through the `spark` and `sc` globals.
    from pyspark import SparkContext
    from pyspark.sql import SparkSession

    builder = SparkSession.builder.appName("BDCCp1").master("local[*]")
    spark = builder.getOrCreate()
    sc = spark.sparkContext
    # keep console output readable: only warnings and errors
    sc.setLogLevel("WARN")

## Provided code - auxiliary functions

__You should not need to edit these.__

#### loadMovieLensData

In [1]:
from pyspark.sql import functions as F

In [2]:
def readCSV(file, debug=False):
    """Read a CSV file with a header row into a DataFrame, inferring column types.

    Args:
        file: Path of the CSV file to read.
        debug: When true, print the path before reading.

    Returns:
        The DataFrame produced by the global `spark` session.
    """
    if debug:
        print('Reading ' + file)
    reader = spark.read
    return reader.csv(file, header=True, inferSchema=True)

def readParquet(file, debug=False):
    """Read a Parquet file into a DataFrame.

    Args:
        file: Path of the Parquet file to read.
        debug: When true, print the path before reading.

    Returns:
        The DataFrame produced by the global `spark` session.
    """
    if debug:
        print('Reading ' + file)
    reader = spark.read
    return reader.parquet(file)

def loadMovieLensData(path, format='parquet', debug=False):
    """Load the movies, ratings and tags tables of a MovieLens data set.

    Args:
        path: Directory holding `movies`, `ratings` and `tags` files.
        format: 'parquet' reads the `.parquet` files; any other value reads
            the `.csv` files.
        debug: When true, print the files read plus the schema and a preview
            of each table.

    Returns:
        A tuple (movies, ratings, tags). In `tags`, the original `tag` text is
        lower-cased and split on delimiter characters, so the table holds one
        row per token, stored in the `tag` column.
    """
    if format == 'parquet':
        movies = readParquet(path +'/movies.parquet', debug)
        ratings = readParquet(path +'/ratings.parquet', debug)
        tags = readParquet(path +'/tags.parquet', debug)
    else:
        movies = readCSV(path +'/movies.csv', debug)
        ratings = readCSV(path +'/ratings.csv', debug)
        tags = readCSV(path +'/tags.csv', debug)

    # Raw string: the backslashes are regex escapes meant for Spark, not Python
    # escapes (sequences such as '\*' are invalid in a normal Python string).
    # Same character class as before, minus a duplicated '\['.
    tag_delimiters = r'[ \*\+\&\/\%\-\$\#\'\)\(\[\],.!?;:\t\n"]+'

    # one row per lower-cased token; the token replaces the original tag text
    tags = tags.withColumn('tagl', F.explode(F.split(F.lower(F.col('tag')), tag_delimiters)))\
            .drop('tag')\
            .withColumnRenamed('tagl','tag')
    if debug:
        print('> movies')
        movies.printSchema()
        movies.show()

        print('> ratings')
        ratings.printSchema()
        ratings.show()

        print('> tags')
        tags.printSchema()
        tags.show()
    return (movies, ratings, tags)

#### writeCSV / writeParquet (use them to write a data frame to CSV or Parquet format)

In [3]:
def writeCSV(df, path):
    """Write `df` to `path` as CSV with a header row, replacing existing output."""
    writer = df.write
    writer.csv(path, mode='overwrite', header=True)

def writeParquet(df,path):
    """Write `df` to `path` in Parquet format, replacing existing output."""
    writer = df.write
    writer.parquet(path, mode='overwrite')

#### createTagListDF

In [4]:
def createTagListDF(csvTagList):
    """Turn a whitespace-separated string of tags into a one-column DataFrame.

    Args:
        csvTagList: String of tags separated by whitespace.

    Returns:
        A DataFrame with a single string column `tag`, one row per tag.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty-string "tags" are produced
    # (split(' ') yields '' for every extra space).
    rows = [(t,) for t in csvTagList.split()]
    # explicit schema: column types cannot be inferred from an empty row list
    return spark.createDataFrame(rows, 'tag string')

#### Definition of functions available only in Spark 2.4 (GCP Spark instances run Spark 2.3) 

In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType,IntegerType

# Fallbacks for F.array_intersect / F.array_union, which only exist from
# Spark 2.4 on. Each fallback is a Python UDF registered under the same name;
# both declare an array-of-integers return type.
def _array_intersect_py(x, y):
    # elements present in both arrays, without duplicates
    return list(set(x) & set(y))


def _array_union_py(x, y):
    # elements present in either array, without duplicates
    return list(set(x) | set(y))


if not hasattr(F,'array_intersect'):
    F.array_intersect = spark.udf.register(
        'array_intersect', _array_intersect_py, ArrayType(IntegerType()))

if not hasattr(F,'array_union'):
    F.array_union = spark.udf.register(
        'array_union', _array_union_py, ArrayType(IntegerType()))

## Group code - auxilliary functions

These helpers are defined before the functions that use them because Jupyter executes cells top-down.

In [6]:
def get_idf(*, data, w='w', d='d', n=None, debug=False):
    """Calculates the Inverse Document Frequency (IDF) of a DataFrame

    By default, uses the standard usage of IDF, i.e., 'w'
    is a word in a document 'd'. If 'n' is set, it also
    returns a column containing the number of documents
    in which word 'w' appears.

    IDF is computed as log2(|D| / n_w_D), where |D| is the number of
    distinct documents and n_w_D the number of distinct documents
    containing the word.

    Args:
        data: A DataFrame instance.
        w: Column name for 'words'
        d: Column name for 'documents'
        n: Name for the output column containing the
            number of documents in which 'w' appears
        debug: When true, show the per-word document counts
            and print |D|

    Returns:
        A DataFrame with 'w', [n], 'IDF' as columns
    """
    # number of distinct documents each word appears in
    n_w_D = data\
           .groupBy(w)\
           .agg(F.countDistinct(d).alias('n_w_D'))
    if debug: n_w_D.orderBy('n_w_D',ascending=False).show()

    # total number of distinct documents (count() runs eagerly)
    size_of_D = data.select(d).distinct().count()
    if debug: print("|D| = %d" % size_of_D)

    # reference the count column with the exact case of its alias so this
    # does not depend on Spark resolving column names case-insensitively
    IDF = n_w_D.withColumn('IDF', F.log2(size_of_D / F.col('n_w_D')))

    # expose the document count under the requested name, or drop it
    if n: IDF = IDF.withColumnRenamed('n_w_D', n)
    else: IDF = IDF.drop('n_w_D')
    return IDF

## Functions to define 

__This is the section that will be evaluated.__

__Include your code for the various functions required in the assignment below.__

__You may include other auxiliary functions required for computation here
but NOT test code (see below).__

> Some auxiliary functions required for computation are in the [section above](#Group-code---auxilliary-functions)

In [7]:
import string
from pyspark.sql.functions import udf

#### tfidfTags
Calculates the TF-IDF metric for tags in association to movies.

In [8]:
def tfidfTags(tags, debug=False):
    """Compute TF-IDF of every tag with respect to every movie it is applied to.

    Args:
        tags: DataFrame with (at least) columns `tag`, `movieId`, `userId`.
        debug: Forwarded to `get_idf`.

    Returns:
        A DataFrame with columns tag, movieId, f, f_max, TF, n, IDF, TF_IDF.
    """
    # f: how many times a tag was applied to a movie -> (tag, movieId, f)
    tag_counts = tags.groupBy('tag', 'movieId').agg(F.count('userId').alias('f'))

    # f_max: highest f of any tag on the same movie -> (movieId, f_max)
    movie_max = tag_counts.groupBy('movieId').agg(F.max('f').alias('f_max'))

    # IDF of each tag, with movies playing the role of documents;
    # `n` is the number of movies carrying the tag
    idf = get_idf(data=tags, w='tag', d='movieId', n='n', debug=debug)

    # TF = f / f_max
    with_tf = tag_counts.join(movie_max, 'movieId')
    with_tf = with_tf.withColumn('TF', F.col('f') / F.col('f_max'))

    # attach IDF per tag and combine
    scored = with_tf.join(idf, 'tag')
    return scored.withColumn('TF_IDF', F.col('TF') * F.col('IDF'))

#### recommendByTag
Recommends movies that have the highest TF-IDF value for a given (single) tag.

In [9]:
def recommendByTag(singleTag, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    """Recommend the movies with the highest TF-IDF for one tag.

    Args:
        singleTag: The tag to search for.
        TFIDF_tags: DataFrame as returned by `tfidfTags`.
        movies: DataFrame with `movieId` and `title`.
        min_fmax: Only rows whose `f_max` is at least this value are kept.
        numberOfResults: Maximum number of rows returned.
        debug: Unused.

    Returns:
        A DataFrame (movieId, title, TF_IDF) ordered by descending TF_IDF,
        then ascending title.
    """
    # filter first: it shrinks the data before the join
    wanted = (F.col('tag') == singleTag) & (F.col('f_max') >= min_fmax)
    matches = TFIDF_tags.where(wanted)

    # bring in the titles, rank, and keep only the output columns
    ranked = matches.join(movies, 'movieId')\
                    .orderBy(F.desc('TF_IDF'), F.asc('title'))
    return ranked.select('movieId', 'title', 'TF_IDF').limit(numberOfResults)

#### recommendByTags
Recommends movies that have the highest combined (sum of) TF-IDF value for several given tags (1 or more).

In [10]:
def recommendByTags(searchTags, TFIDF_tags, movies, min_fmax=10, numberOfResults=10, debug=False):
    """Recommend the movies with the highest summed TF-IDF over several tags.

    Args:
        searchTags: String of space-separated tags to search for.
        TFIDF_tags: DataFrame as returned by `tfidfTags`.
        movies: DataFrame with `movieId` and `title`.
        min_fmax: Only rows whose `f_max` is at least this value are kept.
        numberOfResults: Maximum number of rows returned.
        debug: When true, print and show the search-tag DataFrame.

    Returns:
        A DataFrame (movieId, title, SUM_TF_IDF) ordered by descending
        SUM_TF_IDF, then ascending title.
    """
    # distinct(): a tag repeated in the search string would otherwise duplicate
    # the joined rows and count that tag's TF-IDF more than once in the sum
    df_search_tags = createTagListDF(searchTags).distinct()
    if debug:
        print('> Search tags DF: ' + searchTags)
        df_search_tags.show()
    # filter by min_fmax
    # join df_search_tags to keep only the requested tags
    # group by movieId and aggregate on the SUM of tfidf
    # join movies to get title
    # order by descending SUM_TF_IDF + ascending lexicographic title
    # force column order in the examples
    # return results limited to numberOfResults
    return TFIDF_tags.filter(TFIDF_tags.f_max >= min_fmax)\
                     .join(df_search_tags, 'tag', 'inner')\
                     .groupBy('movieId')\
                     .agg(F.sum('TF_IDF').alias('SUM_TF_IDF'))\
                     .join(movies, 'movieId')\
                     .orderBy(['SUM_TF_IDF', 'title'], ascending=[0,1])\
                     .select(["movieId", "title", "SUM_TF_IDF"])\
                     .limit(numberOfResults)

#### jiMovieSimilarity
Calculates the Jaccard index to measure similarity between movies based on user ratings.

Liking a movie means `rating >= 4.0`

In [11]:
def jiMovieSimilarity(ratings, minRatings=10, debug=False):
    """Compute the Jaccard index between movies from the users who liked them.

    A user likes a movie when the rating is >= 4.0. Only movies liked by at
    least `minRatings` distinct users take part.

    Args:
        ratings: DataFrame with `movieId`, `userId` and `rating`.
        minRatings: Minimum number of distinct liking users per movie.
        debug: When true, show a preview of the intermediate DataFrames.

    Returns:
        A DataFrame (m1, m2, i, u, JI) with one row per pair m1 < m2, where
        `i` and `u` are the sizes of the intersection and union of the two
        sets of liking users and JI = i / u.
    """
    # keep only the "likes"; the rating value itself is no longer needed
    liked = ratings.where("rating >= 4.0").drop("rating")
    if debug: liked.show(10)

    # per movie: set of users that liked it, dropping movies with too few likes
    users_per_movie = liked.groupBy("movieId")\
                           .agg(F.collect_set("userId").alias("u1"))
    left = users_per_movie.where(F.size("u1") >= minRatings)\
                          .withColumnRenamed("movieId", "m1")
    if debug: left.show(10)

    # same data under different column names, for the self cross join
    right = left.withColumnRenamed("m1", "m2")\
                .withColumnRenamed("u1", "u2")
    if debug: right.show(10)

    # every unordered pair exactly once (m1 < m2)
    pairs = left.crossJoin(right).where(F.col("m1") < F.col("m2"))

    # intersection / union sizes and their ratio, then drop the user sets
    scored = pairs.withColumn("i", F.size(F.array_intersect("u1", "u2")))\
                  .withColumn("u", F.size(F.array_union("u1", "u2")))\
                  .withColumn("JI", F.col("i")/F.col("u"))
    return scored.drop("u1", "u2")

#### recommendBySimilarity
Given the id of a movie, recommend other movies that are similar to it.

In [12]:
# Auxiliary function
def filter_movie(movieId, jiForMovies, col_filter, col_rename):
    """Select the similarity rows of one movie on one side of the pair.

    Keeps the rows of `jiForMovies` whose `col_filter` column equals
    `movieId`, drops that column, and renames `col_rename` (the other
    movie of the pair) to `movieId`.
    """
    condition = "%s == %d" % (col_filter, movieId)
    selected = jiForMovies.where(condition)
    other_side = selected.drop(col_filter)
    return other_side.withColumnRenamed(col_rename, 'movieId')

In [13]:
def recommendBySimilarity(movieId, movies, jiForMovies, numberOfResults=10, debug=False):
    """Recommend the movies most similar to `movieId` by Jaccard index.

    Args:
        movieId: Id of the reference movie.
        movies: DataFrame with `movieId` and `title`.
        jiForMovies: DataFrame as returned by `jiMovieSimilarity`.
        numberOfResults: Maximum number of rows returned.
        debug: Unused.

    Returns:
        A DataFrame (movieId, title, JI) ordered by descending JI, then
        ascending title.
    """
    # the reference movie may sit on either side of a pair
    df_m1 = filter_movie(movieId, jiForMovies, "m1", "m2")
    df_m2 = filter_movie(movieId, jiForMovies, "m2", "m1")

    # all the movies paired with movieId
    df = df_m1.union(df_m2)

    # join movieId with movies to get title
    # get only the relevant columns
    # order by descending JI; ties are broken by ascending title, as in
    # recommendByTag / recommendByTags, so the result is deterministic
    # return results limited to numberOfResults
    return df.join(movies, 'movieId')\
             .select(["movieId", "title", "JI"])\
             .orderBy(['JI', 'title'], ascending=[0,1])\
             .limit(numberOfResults)

# Extended functionality
Here, the students have defined a set of methods, based on what the teacher has proposed, along with some custom changes and methods that were considered challenging and relevant for the course.

#### tfidf_movies
A TF-IDF function for word-based movie recommendations that accounts both for user tags and individual words in movie titles. Words in titles can also be informative when looking for movie recommendations!

In [14]:
def tfidf_movies(movies, tags):
    """Compute TF-IDF over user tags combined with the words of movie titles.

    Every word of a movie title is treated as an extra tag of that movie,
    attributed to a synthetic (negative) user id, and the combined data is
    scored with `tfidfTags`.

    Args:
        movies: DataFrame with `movieId` and `title`.
        tags: DataFrame with (at least) `movieId`, `userId` and `tag`.

    Returns:
        The DataFrame returned by `tfidfTags` for the combined data.
    """
    # strip ASCII punctuation and lower-case the title; loadMovieLensData
    # lower-cases tags, so title words must be lower-cased to match them
    cleaned_title = F.lower(F.regexp_replace(F.col('title'), r'\p{Punct}', ''))

    # goal: (movieId, userId, tag) with one row per title word
    title_words = movies\
        .withColumn('tag', F.explode(F.split(cleaned_title, r'\s+')))\
        .filter(F.col('tag') != '')\
        .withColumn('userId', -(F.monotonically_increasing_id() + 1))\
        .select('movieId', 'userId', 'tag')

    # union() matches columns by position, so align the tag columns explicitly
    user_tags = tags.select('movieId', 'userId', 'tag')
    return tfidfTags(title_words.union(user_tags))

In [15]:
# Top-level prototype of the title-words + tags TF-IDF computation.
# NOTE: `movies` and `tags` are only assigned by the data-loading cell further
# down, so running the cells top-down fails here with the NameError recorded
# in the output below.

# UDF that removes every character of string.punctuation from its input
clean_punctuation = udf(lambda r: r.translate(str.maketrans('', '', string.punctuation)))
# one row per space-separated word of the cleaned title, stored as `tag`,
# with a synthetic userId (negated monotonically increasing id)
dfm = movies.withColumn("tag", F.explode(F.split(clean_punctuation(movies.title),' ')))\
            .withColumn("userId", -F.monotonically_increasing_id())\
            .drop("title").select(["movieId", "userId", "tag"])
# NOTE(review): union() matches columns by position; this presumably expects
# `tags` to have exactly (movieId, userId, tag) in that order - verify
dfm = dfm.union(tags).orderBy(["movieId", "userId"])
dfm.show()
# score the combined words/tags with the regular tag TF-IDF
tfidfTags(dfm, debug=False).show()

NameError: name 'movies' is not defined

#### tfidf_movies_recommendation

A function that uses a word-based TF-IDF value for movie recommendations that accounts both for user tags and individual words in movie titles.

#### ji_similarity_tags_movies

Calculate the Jaccard similarity between tags based on the films they are applied to. Then also implement a function that automatically suggests n tags for a given film m, e.g., the top n similar tags in addition to the tags already associated to the film.

In [41]:
def ji_similarity_tags_movies(tags, movies, debug=False):
    """Compute the Jaccard index between tags from the movies they are applied to.

    Args:
        tags: DataFrame with `tag`, `movieId` and `userId`.
        movies: Unused.
        debug: When true, show the intermediate DataFrames.

    Returns:
        A DataFrame (t1, t2, i, u, JI) with one row per pair t1 < t2, where
        `i` and `u` are the sizes of the intersection and union of the two
        sets of movies and JI = i / u.
    """
    # per tag: the set of movies it was applied to (who applied it is irrelevant)
    movies_per_tag = tags.drop('userId')\
                         .groupBy('tag')\
                         .agg(F.collect_set('movieId').alias('m1'))
    left = movies_per_tag.withColumnRenamed('tag', 't1')
    if debug: left.show()

    # same data under different column names, for the self cross join
    right = left.withColumnRenamed('t1', 't2')\
                .withColumnRenamed('m1', 'm2')
    if debug: right.show()

    # every unordered pair of tags exactly once (t1 < t2)
    pairs = left.crossJoin(right).where(F.col('t1') < F.col('t2'))

    # intersection / union sizes and their ratio, then drop the movie sets
    scored = pairs.withColumn('i', F.size(F.array_intersect('m1','m2')))\
                  .withColumn('u', F.size(F.array_union('m1', 'm2')))\
                  .withColumn('JI', F.col('i')/F.col('u'))
    return scored.drop('m1', 'm2')

In [64]:
def recommend_tags(m, movies, ji, n=4, debug=False, tags=None):
    """Suggest additional tags for a movie based on tag-to-tag Jaccard similarity.

    Args:
        m: Id of the movie to suggest tags for.
        movies: Unused.
        ji: DataFrame (t1, t2, i, u, JI) as returned by
            `ji_similarity_tags_movies`.
        n: Maximum number of tags returned.
        debug: When true, show the intermediate DataFrames.
        tags: DataFrame with `movieId` and `tag`. When omitted, the
            module-level `tags` DataFrame is used, as before.

    Returns:
        A DataFrame (tag, i, u, JI) with at most `n` tags that the movie does
        not already have, each with its best-scoring pairing, ordered by
        descending JI, i, u and then ascending tag.
    """
    if tags is None:
        # backward compatibility: earlier versions always read the global
        tags = globals()['tags']

    # distinct tags already associated with movie 'm'
    # (the same tag can be applied by several users)
    df_tags = tags.filter(F.col('movieId') == m).select('tag').distinct()
    if debug: df_tags.show()

    # a tag of the movie can sit on either side (t1 or t2) of a pair in the
    # JI dataframe; in both cases the *other* side is the candidate tag
    df_t1 = df_tags.withColumnRenamed('tag', 't1').join(ji, 't1')\
                   .select(F.col('t2').alias('tag'), 'i', 'u', 'JI')
    df_t2 = df_tags.withColumnRenamed('tag', 't2').join(ji, 't2')\
                   .select(F.col('t1').alias('tag'), 'i', 'u', 'JI')
    if debug: df_t1.show()
    if debug: df_t2.show()

    # drop candidates the movie already has, then keep one row per candidate:
    # the one with the highest (JI, i, u)
    candidates = df_t1.union(df_t2).join(df_tags, 'tag', 'left_anti')
    best = candidates.groupBy('tag')\
                     .agg(F.max(F.struct('JI', 'i', 'u')).alias('best'))\
                     .select('tag', 'best.i', 'best.u', 'best.JI')

    # `ascending` must be a bool or a list (it used to be a string, which
    # orderBy rejects); the tag tie-break makes the order deterministic
    return best.orderBy(['JI', 'i', 'u', 'tag'], ascending=[0, 0, 0, 1])\
               .limit(n)

# Specify input data set and load it

In [16]:
# Location of the MovieLens sample to work with
bucket = 'gs://bdcc1819'
path = '/p1/data/'
dataset = 'tiny3'
fullPath = '{}{}{}'.format(bucket, path, dataset)

# Load the three tables from CSV, printing schemas and previews
movies, ratings, tags = loadMovieLensData(fullPath, debug=True, format='csv')

Reading gs://bdcc1819/p1/data/tiny3/movies.csv
Reading gs://bdcc1819/p1/data/tiny3/ratings.csv
Reading gs://bdcc1819/p1/data/tiny3/tags.csv
> movies
root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
|     16|       Casino (1995)|
|     17|Sense and Sensibi...|
|     18|   Four Rooms (1995)|
|     19|Ace Ventura: When...|
|     20|  Money Train (1995)|
+-------+--------------------+
only showing top 20 rows

##  Test code 

__Include test code below that you may need here.__

__The initial contents are only meant as an example.__

__This section will NOT be evaluated.__

In [13]:
# Get TF-IDF for tags
tfidf = tfidfTags(tags, debug=False)
# cached because the recommendation tests below reuse it
tfidf.cache()
# guarantee all columns are present, in the expected order
assert tfidf.columns == ['tag', 'movieId', 'f', 'f_max', 'TF', 'n', 'IDF', 'TF_IDF'],\
    "Columns do not match expected values for tfidfTags"
# preview the dataframe: most frequent tags first, ties broken by TF_IDF
# (descending), then movieId and tag (ascending)
tfidf.orderBy(['f','TF_IDF','movieId','tag'], ascending=[0,0,1,1]).show()

+-----------+-------+---+-----+------------------+---+-----------------+-----------------+
|        tag|movieId|  f|f_max|                TF|  n|              IDF|           TF_IDF|
+-----------+-------+---+-----+------------------+---+-----------------+-----------------+
|       time|     32|  3|    3|               1.0|  1|5.426264754702098|5.426264754702098|
|     travel|     32|  3|    3|               1.0|  1|5.426264754702098|5.426264754702098|
|      pixar|      1|  2|    2|               1.0|  1|5.426264754702098|5.426264754702098|
|       game|      2|  2|    2|               1.0|  1|5.426264754702098|5.426264754702098|
|apocalyptic|     32|  2|    3|0.6666666666666666|  1|5.426264754702098|3.617509836468065|
|       post|     32|  2|    3|0.6666666666666666|  1|5.426264754702098|3.617509836468065|
|      moldy|      3|  1|    1|               1.0|  1|5.426264754702098|5.426264754702098|
|        old|      3|  1|    1|               1.0|  1|5.426264754702098|5.426264754702098|

In [14]:
# Recommend by tag, tests for tiny3
# min_fmax=1 overrides the default of 10
recommendByTag('twist', tfidf, movies, min_fmax=1).show()
recommendByTag('killer', tfidf, movies, min_fmax=1).show()
recommendByTag('remake', tfidf, movies, min_fmax=1).show()

# the result must honour numberOfResults and expose exactly these columns
rm = recommendByTag('twist', tfidf, movies, min_fmax=1, numberOfResults=2)
assert rm.count() == 2, "numberOfResults should be 2"
assert rm.columns == ["movieId", "title", "TF_IDF"], "unexpected columns"

+-------+--------------------+-----------------+
|movieId|               title|           TF_IDF|
+-------+--------------------+-----------------+
|     47|Seven (a.k.a. Se7...|3.841302253980942|
|     50|Usual Suspects, T...|3.841302253980942|
|     32|Twelve Monkeys (a...|1.280434084660314|
+-------+--------------------+-----------------+

+-------+--------------------+-----------------+
|movieId|               title|           TF_IDF|
+-------+--------------------+-----------------+
|     22|      Copycat (1995)|4.426264754702098|
|     47|Seven (a.k.a. Se7...|4.426264754702098|
+-------+--------------------+-----------------+

+-------+--------------------+-----------------+
|movieId|               title|           TF_IDF|
+-------+--------------------+-----------------+
|      5|Father of the Bri...|3.841302253980942|
|      7|      Sabrina (1995)|3.841302253980942|
|     32|Twelve Monkeys (a...|1.280434084660314|
+-------+--------------------+-----------------+



In [15]:
# Recommend by tags, tests for tiny3
# min_fmax=1 overrides the default of 10
recommendByTags('jane austen', tfidf, movies,min_fmax=1).show()
recommendByTags('remake time twist', tfidf, movies, min_fmax=1).show()
recommendByTags('robin williams remake', tfidf, movies, min_fmax=1).show()

# the result must honour numberOfResults and expose exactly these columns
rm = recommendByTags('remake time twist', tfidf, movies, min_fmax=1, numberOfResults=4)
assert rm.count() == 4, "numberOfResults should be 4"
assert rm.columns == ["movieId", "title", "SUM_TF_IDF"], "unexpected columns"

# Tests for tiny 1 (?)
# recommendByTags('tom hanks airport', tfidf, movies, numberOfResults=20).show()
# recommendByTags('tom hanks', tfidf, movies, numberOfResults=20).show()
# recommendByTags('hitchcock birds', tfidf, movies, numberOfResults=10).show()

+-------+--------------------+-----------------+
|movieId|               title|       SUM_TF_IDF|
+-------+--------------------+-----------------+
|     39|     Clueless (1995)|7.682604507961884|
|     28|   Persuasion (1995)|7.682604507961884|
|     17|Sense and Sensibi...|7.682604507961884|
+-------+--------------------+-----------------+

+-------+--------------------+-----------------+
|movieId|               title|       SUM_TF_IDF|
+-------+--------------------+-----------------+
|     32|Twelve Monkeys (a...|7.987132924022726|
|      5|Father of the Bri...|3.841302253980942|
|      7|      Sabrina (1995)|3.841302253980942|
|     47|Seven (a.k.a. Se7...|3.841302253980942|
|     50|Usual Suspects, T...|3.841302253980942|
+-------+--------------------+-----------------+

+-------+--------------------+-----------------+
|movieId|               title|       SUM_TF_IDF|
+-------+--------------------+-----------------+
|      2|      Jumanji (1995)|5.426264754702098|
|      5|Father of

In [18]:
# most similar movie pairs first
jiMovieSimilarity(ratings).orderBy('JI', ascending=False).show()

# computed again with debug output; jiM is reused by the similarity tests
jiM = jiMovieSimilarity(ratings, debug=True)
assert jiM.columns == ["m1", "m2", "i", "u", "JI"], "unexpected columns"

+---+---+---+---+-------------------+
| m1| m2|  i|  u|                 JI|
+---+---+---+---+-------------------+
| 47| 50| 75|228|0.32894736842105265|
|  6| 16| 30| 96|             0.3125|
| 25| 36| 21| 68| 0.3088235294117647|
| 11| 62| 20| 67|0.29850746268656714|
| 47|110| 70|236| 0.2966101694915254|
| 32| 47| 54|204| 0.2647058823529412|
| 14| 52|  5| 19| 0.2631578947368421|
|110|150| 61|232| 0.2629310344827586|
| 32| 50| 57|224| 0.2544642857142857|
|  1| 34| 43|179|0.24022346368715083|
|  6| 32| 36|151|0.23841059602649006|
|  1| 32| 51|214| 0.2383177570093458|
| 50|110| 63|266|0.23684210526315788|
|  1|150| 52|222|0.23423423423423423|
|  6| 47| 39|170|0.22941176470588234|
| 17| 36| 16| 70|0.22857142857142856|
|  1| 50| 57|253|0.22529644268774704|
| 32|110| 52|232|0.22413793103448276|
| 14| 25| 10| 45| 0.2222222222222222|
| 47|111| 39|177|0.22033898305084745|
+---+---+---+---+-------------------+
only showing top 20 rows

+-------+------+
|movieId|userId|
+-------+------+
|      1|  

In [24]:
# cached because every call below reads the pairwise similarity table
jiM.cache()
recommendBySimilarity(6, movies, jiM).show()# Heat
recommendBySimilarity(14, movies, jiM).show() # Nixon
recommendBySimilarity(25, movies, jiM).show() # Leaving Las Vegas

# the result must expose exactly these columns
rm = recommendBySimilarity(6, movies, jiM)
assert rm.columns == ["movieId", "title", "JI"], "unexpected columns"

+-------+--------------------+-------------------+
|movieId|               title|                 JI|
+-------+--------------------+-------------------+
|     16|       Casino (1995)|             0.3125|
|     32|Twelve Monkeys (a...|0.23841059602649006|
|     47|Seven (a.k.a. Se7...|0.22941176470588234|
|    110|   Braveheart (1995)|0.21761658031088082|
|     25|Leaving Las Vegas...|0.21739130434782608|
|     50|Usual Suspects, T...| 0.1836734693877551|
|    111|  Taxi Driver (1976)| 0.1693548387096774|
|     70|From Dusk Till Da...|0.16666666666666666|
|     36|Dead Man Walking ...|               0.15|
|      1|    Toy Story (1995)|0.14285714285714285|
+-------+--------------------+-------------------+

+-------+--------------------+-------------------+
|movieId|               title|                 JI|
+-------+--------------------+-------------------+
|     52|Mighty Aphrodite ...| 0.2631578947368421|
|     25|Leaving Las Vegas...| 0.2222222222222222|
|     36|Dead Man Walking ...|

In [40]:
# Tests for 2nd extra function
ji_tm = ji_similarity_tags_movies(tags, movies, debug=True).orderBy('JI', ascending=False)
ji_tm.cache()
ji_tm.show()
assert ji_tm.columns == ["t1", "t2", "i", "u", "JI"], "unexpected columns"

# pass the tag similarity table computed above
# (the name `ji` used here before is never defined)
rt = recommend_tags(2, movies, ji_tm, n=4, debug=False)
rt.show()

+---------+--------+
|       t1|      m1|
+---------+--------+
|    oscar|   [110]|
|   travel|    [32]|
|    mafia|    [16]|
|   killer|[22, 47]|
|     brad|    [32]|
|    space|   [150]|
|    crime|   [101]|
|  fantasy|     [2]|
|     moon|   [150]|
|     seen|    [39]|
|      not|    [34]|
|    queue|[28, 40]|
|      mel|   [110]|
|     more|    [39]|
|pregnancy|     [5]|
|     adam|   [104]|
|    robin|     [2]|
|  netflix|[28, 40]|
|  teacher|    [31]|
| suspense|    [50]|
+---------+--------+
only showing top 20 rows

+---------+--------+
|       t2|      m2|
+---------+--------+
|    oscar|   [110]|
|   travel|    [32]|
|    mafia|    [16]|
|   killer|[22, 47]|
|     brad|    [32]|
|    space|   [150]|
|    crime|   [101]|
|  fantasy|     [2]|
|     moon|   [150]|
|     seen|    [39]|
|      not|    [34]|
|    queue|[28, 40]|
|      mel|   [110]|
|     more|    [39]|
|pregnancy|     [5]|
|     adam|   [104]|
|    robin|     [2]|
|  netflix|[28, 40]|
|  teacher|    [31]|
| suspen

## Made by the teacher
### TODO: Remove ?

In [None]:
def getTF(data, debug=False):
    """Compute the term frequency of each word `w` in each document `d`.

    TF is the number of occurrences of the word in the document divided by
    the count of the most frequent word of that document.

    Args:
        data: DataFrame with columns `w` and `d`.
        debug: When true, show the intermediate DataFrames.

    Returns:
        A DataFrame with columns d, w, TF.
    """
    # occurrences of each word in each document
    counts = data.groupBy('w','d').agg(F.count('w').alias('f_wd'))
    if debug:
        counts.orderBy('d','w').show()

    # highest word count within each document
    doc_max = counts.groupBy('d').agg(F.max('f_wd').alias('f_wd_max'))
    if debug:
        doc_max.orderBy('d').show()

    # normalise, then discard the helper columns
    joined = counts.join(doc_max, 'd')
    with_tf = joined.withColumn('TF', F.col('f_wd') / F.col('f_wd_max'))
    return with_tf.drop('f_wd','f_wd_max')

def getIDF(data, debug=False):
    """Compute the inverse document frequency of each word `w`.

    IDF = log2(|D| / n_w_D), with |D| the number of distinct documents and
    n_w_D the number of distinct documents containing the word.

    Args:
        data: DataFrame with columns `w` and `d`.
        debug: When true, show the document counts and print |D|.

    Returns:
        A DataFrame with columns w, IDF.
    """
    # distinct documents per word
    doc_counts = data.groupBy('w').agg(F.countDistinct('d').alias('n_w_D'))
    if debug:
        doc_counts.orderBy('n_w_D',ascending=False).show()

    # total number of distinct documents
    size_of_D = data.select('d').distinct().count()
    if debug:
        print("|D| = %d" % size_of_D)

    with_idf = doc_counts.withColumn('IDF', F.log2(size_of_D / F.col('n_w_D')))
    return with_idf.drop('n_w_D')
    
def getTF_IDF(data, debug=False):
    """Compute TF-IDF for each (word, document) pair of `data`.

    Args:
        data: DataFrame with columns `w` and `d`.
        debug: When true, show every intermediate DataFrame in full.

    Returns:
        The TF DataFrame joined with IDF on `w`, plus a `TF_IDF` column
        holding TF * IDF.
    """
    tf = getTF(data, debug)
    if debug:
        # show all rows, per document with the highest TF first
        tf.orderBy(['d','TF'],ascending=[1,0]).show(tf.count())

    idf = getIDF(data, debug)
    if debug:
        # show all rows, highest IDF first
        idf.orderBy(['IDF','w'], ascending=[0,1]).show(idf.count())

    combined = tf.join(idf,'w')
    result = combined.withColumn('TF_IDF',F.col('TF') * F.col('IDF'))

    if debug:
        result.orderBy(['d','TF_IDF','w'],ascending=[1,0,1]).show(result.count())
    return result
