In [184]:
# Import PySpark
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook: getOrCreate() reuses an
# already-running session or creates a new one named 'MyApp'.
spark = (
    SparkSession.builder
    .appName('MyApp')
    .getOrCreate()
)

In [185]:
"""
Loading the training csv.
"""

# Read with a header row and schema inference; '"' is configured as the
# CSV escape character.
training = (
    spark.read
    .option("escape", "\"")
    .csv(
        '../datasets/imdb/train_movies_extra_data_new.csv',
        header=True,
        inferSchema=True,
    )
)

In [186]:
"""
Loading the validation csv.
"""

# Read with a header row and schema inference; '"' is configured as the
# CSV escape character.
validation = (
    spark.read
    .option("escape", "\"")
    .csv(
        '../datasets/imdb/validation_extra_data.csv',
        header=True,
        inferSchema=True,
    )
)

In [187]:
"""
Loading the test csv.
"""

# Read with a header row and schema inference; '"' is configured as the
# CSV escape character.
test = (
    spark.read
    .option("escape", "\"")
    .csv(
        '../datasets/imdb/test_extra_data.csv',
        header=True,
        inferSchema=True,
    )
)

In [188]:
from pyspark.sql.functions import col, udf, when, split, regexp_replace
from pyspark.sql.types import BooleanType

import re

def manage_awards(award):
    """
    Tell whether an awards description mentions at least one win.

    :param award: awards text for one movie; may be None when the value is
        missing (Spark passes SQL NULLs to Python UDFs as None)
    :return: True if the text contains the substring 'win', False otherwise
        (including when the value is missing)
    """
    # A missing value means no recorded win. Without this guard re.search()
    # raises TypeError on None, which would fail the whole Spark job.
    if award is None:
        return False
    # 'win' also matches 'wins', so one case-sensitive search is enough.
    return re.search('win', award) is not None

# Spark UDF wrapping manage_awards; declared to return a boolean column.
my_udf = udf(f=manage_awards, returnType=BooleanType())

def preprocessing_method(df):
    """
    Cast the numeric columns to integers, turn ``awards`` into a boolean
    flag telling whether a movie won any awards, fill missing ``numVotes``
    from ``imdb_votes`` and split the comma-separated text columns into lists.

    :param df: dataframe containing the columns startYear, endYear, numVotes,
        boxoffice, imdb_votes, awards, genre, actors and language
    :return: a new, preprocessed dataframe
    """

    # Cast the year and vote-count columns to integer.
    for name in ('startYear', 'endYear', 'numVotes'):
        df = df.withColumn(name, col(name).cast('int'))

    # Strip the currency symbol / thousands separators before casting.
    df = df.withColumn('boxoffice', regexp_replace(col('boxoffice'), "[$,]", "").cast('int'))
    df = df.withColumn('imdb_votes', regexp_replace(col('imdb_votes'), "[,]", "").cast('int'))

    # Awards text -> True/False.
    df = df.withColumn('awards', my_udf(col('awards')))

    # Fall back to imdb_votes where numVotes is null. This must run after
    # imdb_votes has been cast above so both branches are integers.
    df = df.withColumn(
        'numVotes',
        when(col('numVotes').isNull(), col('imdb_votes')).otherwise(col('numVotes')),
    )

    # String -> list. The pattern is a regex: splitting on the comma together
    # with any surrounding whitespace makes "Action, Drama" yield
    # ['Action', 'Drama'] instead of ['Action', ' Drama'].
    for name in ('genre', 'actors', 'language'):
        df = df.withColumn(name, split(col(name), r'\s*,\s*'))

    return df

In [189]:
# Show the raw training columns and their inferred types.
training.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'label', 'genre', 'plot', 'actors',
    'language', 'country', 'awards', 'boxoffice', 'rating', 'entry_type',
    'production', 'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: string, endYear: string, runtimeMinutes: string, numVotes: double, label: boolean, genre: string, plot: string, actors: string, language: string, country: string, awards: string, boxoffice: string, rating: string, entry_type: string, production: string, imdb_votes: string, imdb_year: int]

In [190]:
"""
Preprocessing the training dataset
"""

# Apply the shared preprocessing, then show the resulting column types.
training_prep = preprocessing_method(df=training)
training_prep.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'label', 'genre', 'plot', 'actors',
    'language', 'country', 'awards', 'boxoffice', 'rating', 'entry_type',
    'production', 'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: int, endYear: int, runtimeMinutes: string, numVotes: int, label: boolean, genre: array<string>, plot: string, actors: array<string>, language: array<string>, country: string, awards: boolean, boxoffice: int, rating: string, entry_type: string, production: string, imdb_votes: int, imdb_year: int]

In [191]:
# Show the raw validation columns and their inferred types.
validation.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'genre', 'plot', 'actors', 'language',
    'country', 'awards', 'boxoffice', 'rating', 'entry_type', 'production',
    'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: string, endYear: string, runtimeMinutes: string, numVotes: double, genre: string, plot: string, actors: string, language: string, country: string, awards: string, boxoffice: string, rating: string, entry_type: string, production: string, imdb_votes: string, imdb_year: int]

In [192]:
"""
Preprocessing the validation dataset
"""

# Apply the shared preprocessing, then show the resulting column types.
validation_prep = preprocessing_method(df=validation)
validation_prep.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'genre', 'plot', 'actors', 'language',
    'country', 'awards', 'boxoffice', 'rating', 'entry_type', 'production',
    'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: int, endYear: int, runtimeMinutes: string, numVotes: int, genre: array<string>, plot: string, actors: array<string>, language: array<string>, country: string, awards: boolean, boxoffice: int, rating: string, entry_type: string, production: string, imdb_votes: int, imdb_year: int]

In [194]:
# Show the raw test columns and their inferred types.
test.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'genre', 'plot', 'actors', 'language',
    'country', 'awards', 'boxoffice', 'rating', 'entry_type', 'production',
    'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: string, endYear: string, runtimeMinutes: string, numVotes: double, genre: string, plot: string, actors: string, language: string, country: string, awards: string, boxoffice: string, rating: string, entry_type: string, production: string, imdb_votes: string, imdb_year: int]

In [196]:
"""
Preprocessing the testing dataset
"""

# Apply the shared preprocessing, then show the resulting column types.
test_prep = preprocessing_method(df=test)
test_prep.select(
    'primaryTitle', 'originalTitle', 'startYear', 'endYear',
    'runtimeMinutes', 'numVotes', 'genre', 'plot', 'actors', 'language',
    'country', 'awards', 'boxoffice', 'rating', 'entry_type', 'production',
    'imdb_votes', 'imdb_year',
)

DataFrame[primaryTitle: string, originalTitle: string, startYear: int, endYear: int, runtimeMinutes: string, numVotes: int, genre: array<string>, plot: string, actors: array<string>, language: array<string>, country: string, awards: boolean, boxoffice: int, rating: string, entry_type: string, production: string, imdb_votes: int, imdb_year: int]

In [201]:
# Convert the preprocessed training split to a pandas DataFrame and write it
# as CSV without the index column.
training_prep.toPandas().to_csv(
    '../datasets/models_dataset/train.csv',
    index=False,
)

In [None]:
# Convert the preprocessed validation split to a pandas DataFrame and write it
# as CSV without the index column.
validation_prep.toPandas().to_csv(
    '../datasets/models_dataset/validation.csv',
    index=False,
)

In [200]:
# Convert the preprocessed test split to a pandas DataFrame and write it
# as CSV without the index column.
test_prep.toPandas().to_csv(
    '../datasets/models_dataset/test.csv',
    index=False,
)