In [47]:
# Import PySpark
from pyspark.sql import SparkSession

#Create SparkSession
spark = SparkSession.builder.appName('MyApp').getOrCreate()

In [48]:
"""
Loading the training csv.
"""

training = spark.read.option("escape", "\"").csv('../datasets/imdb/train_movies_extra_data_new.csv', header=True, inferSchema=True)

In [49]:
"""
Loading the validation csv.
"""

validation = spark.read.option("escape", "\"").csv('../datasets/imdb/validation_extra_data.csv', header=True, inferSchema=True)

In [50]:
"""
Loading the test csv.
"""

test = spark.read.option("escape", "\"").csv('../datasets/imdb/test_extra_data.csv', header=True,
    inferSchema=True)

In [70]:
from pyspark.sql.functions import col, udf, when, split, regexp_replace, min, \
    mean, lower, array_contains
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler

import re

def manage_awards(award):

    if award is not None:
        if re.search('win', award) or re.search('wins', award) or re.search('won', award):
            return 1
        else:
            return 0
    else:
        return False

def scaling_method(df, column_name, vector_column_name, scaled_column_name, placeholder):

     # Replace null values with a placeholder value, e.g. -1, using when/otherwise
    df = df.withColumn(column_name, when(col(column_name).isNull(), placeholder).otherwise(col(column_name)))

    # Create a VectorAssembler to convert the scalar column to a vector column
    assembler = VectorAssembler(inputCols=[column_name], outputCol=vector_column_name)
    df = assembler.transform(df)

    # Create the StandardScaler transformer and fit it to the data
    scaler = StandardScaler(inputCol=vector_column_name, outputCol=scaled_column_name, withMean=True, withStd=True)
    scaler_model = scaler.fit(df)

    scaled_data = scaler_model.transform(df)

    # Replace the placeholder values with null
    scaled_data = scaled_data.withColumn(scaled_column_name, when(col(column_name) == -1, None).
                                         otherwise(col(scaled_column_name)))
    scaled_data = scaled_data.withColumn(column_name, when(col(column_name) == -1, None).
                                         otherwise(col(column_name)))

    scaled_data = scaled_data.drop(vector_column_name)

    return scaled_data

my_udf = udf(manage_awards, IntegerType())
scalar_udf = udf(lambda arr: float(arr[0]), DoubleType())

def preprocessing_method(df):

    """
    This method will convert to integer all the years and will put to categorical weather
    a movie won any awards or not.
    :param df: dataframe
    :return: return a preprocessed dataframe
    """

    """
    Here casting to integer all the numerical values
    """
    df = df.withColumn('startYear', col('startYear').cast('int'))
    df = df.withColumn('endYear', col('endYear').cast('int'))
    df = df.withColumn('numVotes', col('numVotes').cast('int'))
    df = df.withColumn('runtimeMinutes', col('runtimeMinutes').cast('int'))

    """
    Preprocess boxoffice, and put the lowest value if it is null.
    """

    df = df.withColumn("boxoffice", regexp_replace(col("boxoffice"), "[$,]", "").cast("int"))
    min_boxoffice = df.agg(min('boxoffice')).collect()[0][0]
    df = df.withColumn("boxoffice", when(col("boxoffice").isNull(), min_boxoffice).otherwise(col("boxoffice")))

    """
    Preprocess runtime minutes. If there are null values substitute with the mean.
    """

    mean_runtime = df.select(mean("runtimeMinutes").cast("int")).collect()[0][0]
    df = df.withColumn("runtimeMinutes", when(col("runtimeMinutes").isNull(), mean_runtime).
                       otherwise(col("runtimeMinutes")))

    """
    Plot preprocessing.
    """

    df = df.withColumn("plot", lower(col("plot")))

    """
    Preprocess the imdb votes.
    """

    df = df.withColumn('imdb_votes', regexp_replace(col('imdb_votes'), "[,]", "").cast('int'))

    """
    Awards transformed into True or False.
    """

    df = df.withColumn('awards', my_udf(df['awards']))

    """
    Preprocessing of the numVotes.
    """

    min_imdb_votes = df.agg(min('imdb_votes')).collect()[0][0]
    df = df.withColumn("imdb_votes", when(col("imdb_votes").isNull(), min_imdb_votes).otherwise(col("imdb_votes")))

    df = scaling_method(df, "numVotes", "vector_column_votes", "scaled_votes", -1)
    df = scaling_method(df, "imdb_votes", "vector_column_imbd_votes", "scaled_imdb_votes", -1)

    df = df.withColumn('scaled_votes', when(col('scaled_votes').isNull(), col('scaled_imdb_votes')).
                       otherwise(col('scaled_votes')))

    df = df.withColumn('numVotes', scalar_udf(df['scaled_votes']))

    """
    Preprocessing startYear. If it is null, we substitute with the year from imdb.
    """

    df = df.withColumn('startYear', when(col('startYear').isNull(), col('imdb_year')).
                       otherwise(col('startYear')))

    """
    Transformations from string to lists, removing the spaced and then putting everything to lower and then
    splitting.
    """

    df = df.withColumn('country', split(lower(regexp_replace(col('country'), ', ', ',')), ','))
    df = df.withColumn('genre', split(lower(regexp_replace(col('genre'), ', ', ',')), ','))
    df = df.withColumn('actors', split(lower(regexp_replace(col('actors'), ', ', ',')), ','))

    """
    Doing one hot encoding for the genre column
    """

    unique_values = [str(row[0]) for row in df.selectExpr("explode(array_distinct(genre))").distinct().collect()]
    for value in unique_values:
        df = df.withColumn(value, array_contains('genre', value).cast('int'))

    """
    Dropping the columns.
    """

    df = df.drop('_c0', 'primaryTitle', 'originalTitle', 'endYear', 'language', 'rating', 'entry_type', 'production', 'imdb_year', 'genre', 'actors', 'imdb_votes', 'scaled_imdb_votes', 'scaled_votes')

    print(df.columns)

    return df

In [71]:
"""
Preprocessing the training dataset.
"""

training_prep = preprocessing_method(training)

['tconst', 'startYear', 'runtimeMinutes', 'numVotes', 'label', 'plot', 'country', 'awards', 'boxoffice', 'biography', 'crime', 'fantasy', 'documentary', 'news', 'action', 'animation', 'mystery', 'sport', 'family', 'horror', 'adult', 'film-noir', 'history', 'music', 'musical', 'adventure', 'drama', 'western', 'war', 'short', 'romance', 'thriller', 'sci-fi', 'comedy']


In [72]:
"""
Saving to csv the training dataset.
"""

training_prep.toPandas().to_csv('../datasets/models_dataset/train.csv', index=False)

                                                                                

In [73]:
"""
Preprocessing the validation dataset.
"""

validation_prep = preprocessing_method(validation)

['tconst', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'country', 'awards', 'boxoffice', 'biography', 'crime', 'fantasy', 'documentary', 'news', 'action', 'animation', 'mystery', 'sport', 'family', 'horror', 'film-noir', 'history', 'music', 'musical', 'adventure', 'drama', 'western', 'war', 'romance', 'thriller', 'sci-fi', 'comedy']


In [74]:
"""
Saving to csv the validation dataset.
"""

validation_prep.toPandas().to_csv('../datasets/models_dataset/validation.csv', index=False)


                                                                                

In [75]:
"""
Preprocessing the test dataset.
"""

test_prep = preprocessing_method(test)

['tconst', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'country', 'awards', 'boxoffice', 'biography', 'crime', 'fantasy', 'documentary', 'action', 'animation', 'mystery', 'sport', 'family', 'horror', 'film-noir', 'history', 'music', 'musical', 'adventure', 'drama', 'western', 'war', 'romance', 'thriller', 'sci-fi', 'comedy']


In [76]:
"""
Saving to csv the test dataset.
"""

test_prep.toPandas().to_csv('../datasets/models_dataset/test.csv', index=False)