In [44]:
# Import PySpark
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook via getOrCreate().
spark = (
    SparkSession.builder
    .appName('MyApp')
    .getOrCreate()
)

In [45]:
# Load the training CSV with a header row and inferred column types;
# the double quote is configured as the escape character.
movies = (
    spark.read
    .option("escape", "\"")
    .csv(
        '../datasets/imdb/train_movies_extra_data_new.csv',
        header=True,
        inferSchema=True,
    )
)

In [46]:
from pyspark.sql.functions import col, udf, when, split, regexp_replace
from pyspark.sql.types import IntegerType, BooleanType, FloatType, StringType

import re

def manage_awards(award):
    """
    Tell whether an awards description mentions at least one win.

    :param award: awards text, or None when the value is missing
    :return: True if the text contains the substring 'win' (this also covers
             'wins'), False otherwise, including when award is None
    """
    # A missing value reaches the function as None; re.search would raise
    # TypeError on it, so treat "no information" as "no win".
    if award is None:
        return False
    # Any string containing 'wins' also contains 'win', so one search suffices.
    return re.search('win', award) is not None

# Expose manage_awards as a Spark UDF whose declared return type is boolean.
my_udf = udf(manage_awards, returnType=BooleanType())

def preprocessing_method(df):
    """
    Preprocess the movies dataframe.

    The steps are: cast the year and vote columns to integers, strip the
    formatting characters from boxoffice and imdb_votes before casting them,
    turn the awards text into a boolean flag telling whether the movie won
    any award, fill missing numVotes with imdb_votes, and split the
    comma-separated genre, actors and language strings into lists.

    :param df: dataframe containing the columns startYear, endYear, numVotes,
               boxoffice, imdb_votes, awards, genre, actors and language
    :return: the preprocessed dataframe
    """
    # Cast the plain numerical columns to integer.
    for name in ('startYear', 'endYear', 'numVotes'):
        df = df.withColumn(name, col(name).cast('int'))

    # Remove the currency sign / thousands separators before casting.
    df = df.withColumn("boxoffice", regexp_replace(col("boxoffice"), "[$,]", "").cast("int"))
    df = df.withColumn('imdb_votes', regexp_replace(col('imdb_votes'), "[,]", "").cast('int'))

    # Awards text transformed into True or False.
    df = df.withColumn('awards', my_udf(df['awards']))

    # Substitute numVotes with the value of imdb_votes when it is null.
    df = df.withColumn('numVotes', when(col('numVotes').isNull(), col('imdb_votes')).
                       otherwise(col('numVotes')))

    # Transform the comma-separated strings into lists. The separator pattern
    # also consumes the whitespace around each comma, so "a, b" yields
    # ["a", "b"] rather than ["a", " b"]; input without spaces is split
    # exactly as with a plain ','.
    list_separator = r'\s*,\s*'
    for name in ('genre', 'actors', 'language'):
        df = df.withColumn(name, split(col(name), list_separator))

    return df

In [47]:
# Display the first rows of the imdb_votes column.
movies.select('imdb_votes').show()

+----------+
|imdb_votes|
+----------+
|     2,183|
|     5,649|
|     6,266|
|    10,288|
|    18,984|
|     3,539|
|     2,415|
|     1,255|
|     1,456|
|    36,952|
|    86,913|
|     2,209|
|     3,535|
|    14,024|
|    54,135|
|     3,304|
|    10,962|
|     5,352|
|    14,688|
|     4,032|
+----------+
only showing top 20 rows



In [48]:
# Apply the preprocessing and rebind `movies` to the result.
# NOTE(review): because the name is reused, re-running this cell without first
# re-running the load cell passes already-transformed columns back into
# preprocessing_method — confirm whether that is acceptable.
movies = preprocessing_method(movies)

In [51]:
# Display numVotes next to imdb_votes for the first rows.
movies.select('numVotes', 'imdb_votes').show()

+--------+----------+
|numVotes|imdb_votes|
+--------+----------+
|    1898|      2183|
|    5376|      5649|
|    5842|      6266|
|    9652|     10288|
|   17887|     18984|
|    3285|      3539|
|    2275|      2415|
|    1255|      1255|
|    1456|      1456|
|   33562|     36952|
|   83177|     86913|
|    2038|      2209|
|    3535|      3535|
|   14024|     14024|
|   50707|     54135|
|    2967|      3304|
|   10311|     10962|
|    4904|      5352|
|   13595|     14688|
|    3600|      4032|
+--------+----------+
only showing top 20 rows

