In [1]:
# Import PySpark
from pyspark.sql import SparkSession

#Create SparkSession
spark = SparkSession.builder.appName('MyApp').getOrCreate()

23/03/17 08:54:23 WARN Utils: Your hostname, MacBook-Air-di-Emanuele.local resolves to a loopback address: 127.0.0.1; using 10.0.0.197 instead (on interface en0)
23/03/17 08:54:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/17 08:54:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
"""
Loading the training csv.
"""

training = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/train_plots_awards_genre.csv', header=True, inferSchema=True)

                                                                                

In [72]:
training.select('plot').show(300, False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|plot                                                                                                                                                                                                                                                                                                                    

In [6]:
"""
Loading the validation csv.
"""

validation = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/validation_plots_awards_genre.csv', header=True, inferSchema=True)

In [7]:
"""
Loading the test csv.
"""

test = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/test_plots_awards_genre.csv',
    header=True, inferSchema=True)

In [146]:
from pyspark.sql.functions import col, udf, when, split, regexp_replace, min, \
    mean, lower, array_contains, lit
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler

import re

def manage_awards(award):

    if award is not None:
        if re.search('win', award) or re.search('wins', award) or re.search('won', award):
            return 1
        else:
            return 0
    else:
        return False

def scaling_method(df, column_name, vector_column_name, scaled_column_name, placeholder):

     # Replace null values with a placeholder value, e.g. -1, using when/otherwise
    df = df.withColumn(column_name, when(col(column_name).isNull(), placeholder).otherwise(col(column_name)))

    # Create a VectorAssembler to convert the scalar column to a vector column
    assembler = VectorAssembler(inputCols=[column_name], outputCol=vector_column_name)
    df = assembler.transform(df)

    # Create the StandardScaler transformer and fit it to the data
    scaler = StandardScaler(inputCol=vector_column_name, outputCol=scaled_column_name, withMean=True, withStd=True)
    scaler_model = scaler.fit(df)

    scaled_data = scaler_model.transform(df)

    # Replace the placeholder values with null
    scaled_data = scaled_data.withColumn(scaled_column_name, when(col(column_name) == -1, None).
                                         otherwise(col(scaled_column_name)))
    scaled_data = scaled_data.withColumn(column_name, when(col(column_name) == -1, None).
                                         otherwise(col(column_name)))

    scaled_data = scaled_data.drop(vector_column_name)

    return scaled_data

def interval_check(r, year):
    limit0, limit1 = r.split('-')
    limit0 = int(limit0)
    limit1 = int(limit1)

    print("Limit0: ", limit0, "Limit1: ", limit1)
    print("Year: ", year)

    if limit0 <= year <= limit1:
        return 1
    else:
        return 0

my_udf = udf(manage_awards, IntegerType())
scalar_udf = udf(lambda arr: float(arr[0]), DoubleType())
interval_check_udf = udf(interval_check, IntegerType())

def preprocessing_method(df):

    """
    This method will convert to integer all the years and will put to categorical weather
    a movie won any awards or not.
    :param df: dataframe
    :return: return a preprocessed dataframe
    """

    """
    Bringing to lower case the titles
    """

    df = df.withColumn('primaryTitle', lower(col('primaryTitle')))
    df = df.withColumn('originalTitle', lower(col('originalTitle')))

    """
    Here casting to integer all the numerical values
    """
    df = df.withColumn('startYear', col('startYear').cast('int'))
    df = df.withColumn('endYear', col('endYear').cast('int'))
    df = df.withColumn('numVotes', col('numVotes').cast('int'))
    df = df.withColumn('runtimeMinutes', col('runtimeMinutes').cast('int'))

    """
    Preprocessing years. We add range of years and then for each row we check in which range it is
    """

    ranges = ['1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989',
              '1990-1999', '2000-2009', '2010-2019', '2020-2023']

    for r in ranges:
        limit0, limit1 = r.split('-')
        limit0 = int(limit0)
        limit1 = int(limit1)
        df = df.withColumn(r, when((col('startYear') >= limit0) & (col('startYear') <= limit1), 1).otherwise(0))

    """
    Preprocess runtime minutes. If there are null values substitute with the mean.
    """

    mean_runtime = df.select(mean("runtimeMinutes").cast("int")).collect()[0][0]
    df = df.withColumn("runtimeMinutes", when(col("runtimeMinutes").isNull(), mean_runtime).
                       otherwise(col("runtimeMinutes")))

    """
    Plot preprocessing.
    """

    df = df.withColumn("plot", lower(col("plot")))

    """
    Preprocessing of the numVotes.
    """

    min_numVotes = df.select(min('numVotes')).collect()[0][0]
    df = df.withColumn('numVotes', when(col('numVotes').isNull(), min_numVotes).
                       otherwise(col('numVotes')))

    df = scaling_method(df, "numVotes", "vector_column_votes", "scaled_votes", -1)
    df = df.withColumn('numVotes', scalar_udf(df['scaled_votes']))


    """
    Preprocessing of genre. Create also a list of genres.
    """

    df = df.withColumn('Genre', when(col('Genre').isNull(), 'unknown').otherwise(col('Genre')))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), '/', ', '))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ' ,', ','))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ', ', ','))

    # Split the columns
    df = df.withColumn('Genre', split(lower(col('Genre')), ','))

    # Creating the one hot encoding
    unique_values = [str(row[0]) for row in df.selectExpr("explode(array_distinct(Genre))").distinct().collect()]
    for value in unique_values:
        df = df.withColumn(value, array_contains('Genre', value).cast('int'))

    """
    Dropping the columns.
    """

    df = df.drop('originalTitle', 'endYear', 'Genre', 'scaled_votes')

    print(df.columns)

    return df

In [147]:
"""
Preprocessing the training dataset.
"""

training_prep = preprocessing_method(training)

['_c0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'label', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'disaster film', 'dark fantasy', 'buddy comedy', 'disaster', 'romantic-comedy', 'biography', ' fantasy', 'romantic', 'family comedy', 'crime comedy', 'erotic', ' action', 'fantasy adventure comedy', 'crime', 'world war i drama', 'anime action adventure', 'sf', 'experimental', 'comedy horror', 'fantasy', 'parody', 'action-comedy', 'comedy drama anime', 'prison drama', 'war film', 'fantasy adventure', 'documentary', 'romance thriller', 'gangster film', 'supernatural horror', 'animated film', 'comedy ', 'romance ', 'computer animated', 'concert', 'action comedy', 'monster', 'suspense', 'literary adaptation', 'unknown', 'melodrama', "drama adapted from wajdi mouawad's play of the same name", 'sexploitation', 'action', 'love', 'suspense th

In [148]:
"""
Saving to csv the training dataset.
"""

training_prep.toPandas().to_csv('../datasets/gpt_data/definitive/train.csv', index=False)

                                                                                

In [149]:
"""
Preprocessing the validation dataset.
"""

validation_prep = preprocessing_method(validation)

['Unnamed: 0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'historical horror', 'biography', ' action', 'crime', 'fantasy', 'srikanth', 'war comedy', 'documentary', 'supernatural horror', 'animated film', 'suspense', 'unknown', 'melodrama', 'socio-fantasy', 'action', 'animation', 'musical comedy', ' thriller', 'animated', 'dance', 'splatter', 'science fiction', 'world war ii', 'family', 'horror', 'war drama', 'historical drama', 'bhavana', 'police drama', 'slasher', 'superhero', 'romantic drama', 'animation martial arts action-comedy', 'film noir', 'science-fiction', 'musical', 'adventure', 'epic western', 'wuxia', 'drama', 'romantic comedy', 'dramedy', 'crime thriller', 'sci-fi comedy', ' adventure', 'nadeem-shravan', 'western', 'comedy-drama', 'docudrama', 'satire', 'biopic', 'war', 'mockumen

In [150]:
"""
Saving to csv the validation dataset.
"""

validation_prep.toPandas().to_csv('../datasets/gpt_data/definitive/validation.csv', index=False)


In [151]:
"""
Preprocessing the test dataset.
"""

test_prep = preprocessing_method(test)

['Unnamed: 0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'biography', 'romantic', 'epic drama set 4', 'crime comedy', 'crime', 'fantasy', 'documentary', 'animated film', 'superheroes', 'romance ', '000 years ago in the canadian arctic', 'action comedy', 'suspense', 'unknown', 'melodrama', 'period comedy', 'action', 'animation', 'musical comedy', 'animated', 'prehistoric adventure', 'mystery', 'literary drama', 'science fiction', 'world war ii', 'family', 'horror', 'campus drama', 'war drama', 'comedy[not in citation given]', 'historical drama', 'action - comedy', 'adult', 'family drama', 'charlie chan', 'music', 'superhero', 'science fantasy', 'romantic drama', 'film noir', 'musical', 'adventure', 'spy', 'wuxia', 'spy comedy', 'drama', 'social ', 'romantic comedy', 'dramedy', ' western', 'sci

In [152]:
"""
Saving to csv the test dataset.
"""

test_prep.toPandas().to_csv('../datasets/gpt_data/definitive/test.csv', index=False)