In [1]:
# Import PySpark
from pyspark.sql import SparkSession

#Create SparkSession
spark = SparkSession.builder.appName('MyApp').getOrCreate()

23/03/18 11:08:00 WARN Utils: Your hostname, MacBook-Air-di-Emanuele.local resolves to a loopback address: 127.0.0.1; using 10.0.0.197 instead (on interface en0)
23/03/18 11:08:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/18 11:08:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
"""
Loading the training csv.
"""

training = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/train_plots_awards_genre_no_duplicates.csv', header=True, inferSchema=True)

                                                                                

In [24]:
training.select('Genre').distinct().count()

446

In [25]:
training.select('Genre').distinct().show(446, truncate=False)

+-------------------------------------------------------------------------------+
|Genre                                                                          |
+-------------------------------------------------------------------------------+
|anime                                                                          |
|biography, drama                                                               |
|disaster film                                                                  |
|biography, crime, drama                                                        |
|disaster                                                                       |
|romantic-comedy                                                                |
|biography                                                                      |
|action, adventure                                                              |
|family comedy                                                                  |
|animation, fami

In [3]:
"""
Loading the validation csv.
"""

validation = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/validation_plots_awards_genre_no_duplicates.csv', header=True, inferSchema=True)

In [4]:
"""
Loading the test csv.
"""

test = spark.read.option("escape", "\"").csv('../datasets/gpt_data/final/test_plots_awards_genre_no_duplicates.csv',
    header=True, inferSchema=True)

In [5]:
from pyspark.sql.functions import col, udf, when, split, regexp_replace, min, \
    mean, lower, array_contains, trim
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler

import re

def manage_awards(award):

    if award is not None:
        if re.search('win', award) or re.search('wins', award) or re.search('won', award):
            return 1
        else:
            return 0
    else:
        return False

def scaling_method(df, column_name, vector_column_name, scaled_column_name, placeholder):

     # Replace null values with a placeholder value, e.g. -1, using when/otherwise
    df = df.withColumn(column_name, when(col(column_name).isNull(), placeholder).otherwise(col(column_name)))

    # Create a VectorAssembler to convert the scalar column to a vector column
    assembler = VectorAssembler(inputCols=[column_name], outputCol=vector_column_name)
    df = assembler.transform(df)

    # Create the StandardScaler transformer and fit it to the data
    scaler = StandardScaler(inputCol=vector_column_name, outputCol=scaled_column_name, withMean=True, withStd=True)
    scaler_model = scaler.fit(df)

    scaled_data = scaler_model.transform(df)

    # Replace the placeholder values with null
    scaled_data = scaled_data.withColumn(scaled_column_name, when(col(column_name) == -1, None).
                                         otherwise(col(scaled_column_name)))
    scaled_data = scaled_data.withColumn(column_name, when(col(column_name) == -1, None).
                                         otherwise(col(column_name)))

    scaled_data = scaled_data.drop(vector_column_name)

    return scaled_data

def interval_check(r, year):
    limit0, limit1 = r.split('-')
    limit0 = int(limit0)
    limit1 = int(limit1)

    print("Limit0: ", limit0, "Limit1: ", limit1)
    print("Year: ", year)

    if limit0 <= year <= limit1:
        return 1
    else:
        return 0

def genre_preprocessing(df):

    # Everything to lower case
    df = df.withColumn('Genre', lower(col('Genre')))

    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'film ', ''))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ' in ', ' '))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ' of ', ' '))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), '\.', ''))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'drama based on the novel by russell banks;', 'drama'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'drama adapted from wajdi mouawad\'s play of the same name', 'drama'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'world war ii', 'war'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'biopic', 'biography'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'drama\[not in citation given\]', 'drama'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'kung fu', 'kungfu'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'historical romance based on colm tóibín\'s novel of the same name', 'romantic'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), '3-d', '3d'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'neo-noir', 'noir'))

    df = df.withColumn('Genre', when(col('Genre').isNull(), 'unknown').otherwise(col('Genre')))

    df = df.withColumn('Genre', regexp_replace(col('Genre'), '/', ', '))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ' ,', ','))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ', ', ','))

    df = df.withColumn('Genre', regexp_replace(col('Genre'), '-', ' '))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'sci fi', 'sci-fi'))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), ' ', ','))
    df = df.withColumn('Genre', regexp_replace(col('Genre'), 'martial,arts', 'martial arts'))

    # Split the columns
    df = df.withColumn('Genre', split(lower(col('Genre')), ','))

    print(df.columns)

    # Creating the one hot encoding
    unique_values = [str(row[0]) for row in df.selectExpr("explode(array_distinct(Genre))").distinct().collect()]
    for value in unique_values:
        df = df.withColumn(value, array_contains('Genre', value).cast('int'))

    df = df.drop('&', ' ', 'on', 'sf', 'i', 'jidaigeki', '', 'the', 'mouawad\'s', 'tóibín\'s', 'drama[not', 'given]', 'wajdi')

    return df

my_udf = udf(manage_awards, IntegerType())
scalar_udf = udf(lambda arr: float(arr[0]), DoubleType())
interval_check_udf = udf(interval_check, IntegerType())

def preprocessing_method(df):

    """
    This method will convert to integer all the years and will put to categorical weather
    a movie won any awards or not.
    :param df: dataframe
    :return: return a preprocessed dataframe
    """

    """
    Bringing to lower case the titles
    """

    df = df.withColumn('primaryTitle', lower(col('primaryTitle')))
    df = df.withColumn('originalTitle', lower(col('originalTitle')))

    """
    Here casting to integer all the numerical values
    """
    df = df.withColumn('startYear', col('startYear').cast('int'))
    df = df.withColumn('endYear', col('endYear').cast('int'))
    df = df.withColumn('numVotes', col('numVotes').cast('int'))
    df = df.withColumn('runtimeMinutes', col('runtimeMinutes').cast('int'))

    """
    Preprocessing years. We add range of years and then for each row we check in which range it is
    """

    ranges = ['1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989',
              '1990-1999', '2000-2009', '2010-2019', '2020-2023']

    for r in ranges:
        limit0, limit1 = r.split('-')
        limit0 = int(limit0)
        limit1 = int(limit1)
        df = df.withColumn(r, when((col('startYear') >= limit0) & (col('startYear') <= limit1), 1).otherwise(0))

    """
    Preprocess runtime minutes. If there are null values substitute with the mean.
    """

    mean_runtime = df.select(mean("runtimeMinutes").cast("int")).collect()[0][0]
    df = df.withColumn("runtimeMinutes", when(col("runtimeMinutes").isNull(), mean_runtime).
                       otherwise(col("runtimeMinutes")))

    """
    Plot preprocessing.
    """

    df = df.withColumn("plot", lower(col("plot")))

    """
    Preprocessing of the numVotes.
    """

    min_numVotes = df.select(min('numVotes')).collect()[0][0]
    df = df.withColumn('numVotes', when(col('numVotes').isNull(), min_numVotes).
                       otherwise(col('numVotes')))

    df = scaling_method(df, "numVotes", "vector_column_votes", "scaled_votes", -1)
    df = df.withColumn('numVotes', scalar_udf(df['scaled_votes']))


    """
    Preprocessing of genre. Create also a list of genres.
    """

    df = genre_preprocessing(df)

    """
    Dropping the columns.
    """

    df = df.drop('originalTitle', 'endYear', 'scaled_votes', 'Genre')

    print(df.columns)

    return df

In [6]:
"""
Preprocessing the training dataset.
"""

training_prep = preprocessing_method(training)

                                                                                

['_c0', 'tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'label', 'plot', 'awards', 'Genre', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'scaled_votes']


                                                                                

['_c0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'label', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'art', 'disaster', 'romantic', 'biography', 'film', 'bros', 'erotic', 'crime', '3d', 'fantasy', 'parody', 'name', 'paramount', 'documentary', 'based', 'concert', 'prison', 'monster', 'suspense', 'unknown', 'melodrama', 'sexploitation', 'action', 'love', 'animation', 'anthology', 'animated', 'samurai', 'mystery', 'same', 'biker', 'supernatural', 'silent', 'psychological', 'family', 'biographical', 'horror', 'nature', 'zombie', 'avant', 'bio', 'fiction', 'adult', 'exploitation', 'live', 'history', 'science', 'dark', 'biblical', 'novel', 'youth', 'colm', 'social', 'slasher', 'superhero', 'revenge', 'garde', 'national', 'citation', 'screwball', 'detective', 'age', 'musical', 'coming', 'adventure', 'spy', 'cult', 'political', 'outlaw', 'fr

In [7]:
"""
Saving to csv the training dataset.
"""

training_prep.toPandas().to_csv('../datasets/gpt_data/definitive/train.csv', index=False)

23/03/18 11:09:47 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [8]:
"""
Preprocessing the validation dataset.
"""

validation_prep = preprocessing_method(validation)

['Unnamed: 0', 'tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', 'Genre', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'scaled_votes']
['Unnamed: 0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'socio', 'biography', 'romantic', 'film', 'crime', 'fantasy', 'srikanth', 'documentary', 'suspense', 'unknown', 'melodrama', 'action', 'animation', 'animated', 'dance', 'splatter', 'supernatural', 'psychological', 'family', 'horror', 'bhavana', 'fiction', 'science', 'slasher', 'superhero', 'screwball', 'police', 'musical', 'adventure', 'wuxia', 'noir', 'drama', 'shravan', 'dramedy', 'western', 'docudrama', 'satire', 'nadeem', 'war', 'historical

In [9]:
"""
Saving to csv the validation dataset.
"""

validation_prep.toPandas().to_csv('../datasets/gpt_data/definitive/validation.csv', index=False)


                                                                                

In [10]:
"""
Preprocessing the test dataset.
"""

test_prep = preprocessing_method(test)

['Unnamed: 0', 'tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', 'Genre', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'scaled_votes']
['Unnamed: 0', 'tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'numVotes', 'plot', 'awards', '1915-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009', '2010-2019', '2020-2023', 'anime', 'romantic', 'biography', 'film', 'crime', 'fantasy', 'set', 'documentary', 'superheroes', 'suspense', 'unknown', 'melodrama', 'action', 'animation', 'chan', 'samurai', 'animated', 'mystery', 'campus', 'years', 'psychological', 'arctic', 'family', 'horror', 'canadian', 'fiction', 'adult', 'live', 'science', 'prehistoric', 'music', 'social', 'superhero', 'charlie', 'citation', 'musical', 'adventure', 'spy', 'comedy[not', 'wuxia', 'noir', 'drama',

In [11]:
"""
Saving to csv the test dataset.
"""

test_prep.toPandas().to_csv('../datasets/gpt_data/definitive/test.csv', index=False)