# Data Processing

In [None]:
import pandas as pd
import numpy as np
import databricks.koalas as ks
import matplotlib.pyplot as plt
import plotly as px
import seaborn as sns

from ast import literal_eval

import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Koalas session option. Judging by its name it permits operations that combine
# columns coming from different Koalas frames -- TODO confirm the exact semantics
# and performance cost against the installed Koalas version.
ks.set_option('compute.ops_on_diff_frames', True)

# Suppress all Python warnings for the remainder of the session so that cell
# outputs stay readable. Be aware this also hides deprecation notices.
# NOTE(review): both calls below install a blanket "ignore" filter; one of them
# is likely redundant.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

In [None]:
# Let findspark locate the Spark installation before a session is requested.
findspark.init()

# Build (or reuse) a local session; "local[*]" is the master URL passed to Spark.
local_builder = SparkSession.builder.master("local[*]")
spark = local_builder.getOrCreate()

# Restrict Spark's log output to ERROR level to keep cell output readable.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

### Import Data

In [None]:
# Read the full ratings CSV with Spark (header row, inferred column types,
# double-quote quoting/escaping, multi-line records allowed).
ratings_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/ratings.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
ratings_sdf.printSchema()
print(ratings_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
ratings = ratings_sdf.to_koalas()

                                                                                

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



                                                                                

26024289


In [None]:
# Read the reduced ratings CSV with Spark (header row, inferred column types,
# double-quote quoting/escaping, multi-line records allowed).
ratings_small_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/ratings_small.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
ratings_small_sdf.printSchema()
print(ratings_small_sdf.count())

# Convert to Koalas under the `ratings_small` name, like every other loader cell.
# The original assigned the result to `movies`, which left `ratings_small` as a
# Spark DataFrame and temporarily bound `movies` to ratings data.
ratings_small = ratings_small_sdf.to_koalas()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

100004


In [None]:
# Read the movie metadata CSV with Spark (header row, inferred column types,
# double-quote quoting/escaping, multi-line records allowed).
movies_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/movies_metadata.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
movies_sdf.printSchema()
print(movies_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
movies = movies_sdf.to_koalas()

                                                                                

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: double (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: boolean (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nu

In [None]:
# --- Disabled alternative loader, retained for reference --------------------
# It rewrites ../data/raw/credits.csv into ../data/interim/credits.csv with '|'
# as the field separator: every comma in the header line is replaced, and on
# the remaining lines only the commas matched by the regex are replaced
# (apparently those outside [...] lists -- verify before re-enabling).
# The interim file is then read with sep='|'. The active code below reads the
# raw CSV directly instead.
#
# import re
# from pathlib import Path

# p = "../data/raw/credits.csv"
# p2 = "../data/interim/credits.csv"

# with open(p, 'r') as f:
#     with open(p2, 'w') as f2:
#         for cnt, line in enumerate(f):
#             if cnt == 0:
#                 line = line.replace(',', '|')
#             else:
#                 line = re.sub(r',(?=(((?!\]).)*\[)|[^\[\]]*$)', '|', line)
#             f2.write(line)

# credits = spark.read.options(header='True', sep='|')\
#                    .csv("../data/interim/credits.csv")
# credits.printSchema()
# print(credits.count())
# credits = credits.to_koalas()
# -----------------------------------------------------------------------------

# Read the credits CSV with Spark (header row, inferred column types,
# double-quote quoting/escaping, multi-line records allowed).
credits_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/credits.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
credits_sdf.printSchema()
print(credits_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
credits = credits_sdf.to_koalas()

                                                                                

root
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- id: integer (nullable = true)

45476


In [None]:
# Read the keywords CSV with Spark (header row, inferred column types,
# double-quote quoting/escaping, multi-line records allowed).
keywords_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/keywords.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
keywords_sdf.printSchema()
print(keywords_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
keywords = keywords_sdf.to_koalas()

root
 |-- id: integer (nullable = true)
 |-- keywords: string (nullable = true)

46419


In [None]:
# Read the links CSV (movieId / imdbId / tmdbId columns) with Spark: header
# row, inferred column types, double-quote quoting/escaping, multi-line records.
links_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/links.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
links_sdf.printSchema()
print(links_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
links = links_sdf.to_koalas()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

45843


In [None]:
# Read the reduced links CSV (movieId / imdbId / tmdbId columns) with Spark:
# header row, inferred column types, double-quote quoting/escaping, multi-line
# records.
links_small_sdf = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/links_small.csv")
)

# Show the inferred schema and the row count as a quick sanity check.
links_small_sdf.printSchema()
print(links_small_sdf.count())

# Expose the data as a Koalas frame for the rest of the notebook.
links_small = links_small_sdf.to_koalas()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

9125


## Clean Data

In [None]:
# Keep a single row per distinct `id` value, then display the resulting
# (rows, columns) shape as the cell output.
movie_key_columns = ['id']
movies = movies.drop_duplicates(subset=movie_key_columns)
movies.shape

(45505, 24)

## Feature Engineering

Before getting started, we need to:

- Choose a metric to score (rate) each movie.
- Calculate the score for every movie.
- Sort the scores and recommend the best-rated movies to the users.

We could use a movie's average rating as its score, but that would not be fair: a movie with an 8.9 average rating from only 3 votes cannot be considered better than a movie with a 7.8 average rating from 40 votes.

In [None]:
# Inputs to the IMDB-style weighted rating:
#   c -- mean `vote_average` across all movies
#   m -- 90th-percentile `vote_count`, used as the minimum number of votes a
#        movie needs to qualify for the chart
c = movies['vote_average'].mean()
m = movies['vote_count'].quantile(0.9)

def weighted_rating(x, m=m, C=c):
    """Return the IMDB weighted rating for `x`.

    Computes ``v/(v+m) * r + m/(m+v) * C`` where ``v = x['vote_count']`` and
    ``r = x['vote_average']``. Only column lookups and arithmetic are used, so
    `x` may be a single row or a whole frame (the result is then one score per
    row).

    Parameters
    ----------
    x : row or frame exposing 'vote_count' and 'vote_average'
    m : vote-count cutoff; the default is the value of the global `m` at the
        time this function was defined
    C : mean vote average; the default is the value of the global `c` at the
        time this function was defined

    Note: the defaults are bound at definition time, so re-run this cell if
    `m` or `c` are recomputed.
    """
    v = x['vote_count']
    r = x['vote_average']

    return (v/(v+m) * r) + (m/(m+v) * C)                            # Calculation based on the IMDB formula

# Keep only the movies that reach the vote-count cutoff. Filtering first and
# copying afterwards means the boolean mask and the frame it indexes are the
# same object.
q_movies = movies.loc[movies['vote_count'] >= m].copy()

# Score all qualifying movies with one column-wise evaluation of the formula
# instead of a row-by-row apply(axis=1); the arithmetic is unchanged.
q_movies['score'] = weighted_rating(q_movies)

# Rank by score, best first, and show the top 10.
q_movies = q_movies.sort_values('score', ascending=False)
q_movies[['title', 'popularity', 'vote_count', 'vote_average', 'score']].head(10)

                                                                                

Unnamed: 0,title,popularity,vote_count,vote_average,score
314,The Shawshank Redemption,51.645403,8358,8.5,8.445869
834,The Godfather,41.109264,6024,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,34.457024,661,9.1,8.421453
12481,The Dark Knight,123.167259,12269,8.3,8.265477
2843,Fight Club,63.869599,9678,8.3,8.256385
292,Pulp Fiction,140.950236,8670,8.3,8.251406
522,Schindler's List,41.725123,4436,8.3,8.206639
23673,Whiplash,64.29999,4376,8.3,8.205404
5481,Spirited Away,41.048867,3968,8.3,8.196055
2211,Life Is Beautiful,39.39497,3643,8.3,8.187171


### Genre 