In [None]:
import pandas as pd

In [None]:
#Loading Movielens Movies dataset
movies_df = pd.read_csv('./data/movielens/movies.csv')

#Extracting the release year from the trailing "(YYYY)" of the title into a new "year" column.
#One pattern with a single capture group returns the four digits directly, and \s*$ also
#accepts titles that carry stray whitespace after the closing parenthesis.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)

#Checking and counting the number of 'NA' values in the 'year' column
year_is_missing = movies_df['year'].isnull()
print("Is NA values present: {}".format(year_is_missing.values.any())) #Find if NA is present in the column
print("NA values count: {}".format(year_is_missing.values.sum())) #Count the number of NA in the column

#Replacing 'NA' values with '0' (sentinel for "year unknown").
#The result is assigned back to the column: calling fillna(..., inplace=True) on the
#selection movies_df['year'] is chained assignment, which does not update the frame
#once pandas Copy-on-Write is active and would leave NaN behind for the int cast below.
movies_df['year'] = movies_df['year'].fillna('0')

#Converting the datatype of 'year' column from string to int
movies_df = movies_df.astype({'year': int})

#Sorting the movies in ascending order by year
movies_df = movies_df.sort_values(by='year')

#Creating a new dataframe with movies starting from 2009
#(rows with the sentinel year 0 are excluded by this filter)
new_movies_df = movies_df.loc[movies_df['year'] >= 2009]
print("Movies count {}".format(len(new_movies_df.index)))
new_movies_df.head(20)

In [None]:
#Movielens Links dataset
links_df = pd.read_csv('./data/movielens/external_links.csv')

#Checking and counting the number of 'NA' values in the 'tmdbid' and 'imdbid' columns
print("Does 'tmdbid' column contain NA values ?: {}".format(links_df['tmdbId'].isnull().values.any()))
print("NA Values count: {}".format(links_df['tmdbId'].isnull().values.sum()))
print("Does 'imdbid' column contain NA values ?: {}".format(links_df['imdbId'].isnull().values.any()))
print("NA Values count: {}".format(links_df['imdbId'].isnull().values.sum()))

#Replacing 'NA' values with 0 in 'tmdbId' only.
#The filled column is assigned back: fillna(..., inplace=True) on the selection
#links_df['tmdbId'] is chained assignment and does not update the frame once pandas
#Copy-on-Write is active, which would make the int cast below fail on NaN.
#NOTE(review): 'imdbId' is only reported above, never filled - confirm it has no NA values.
links_df['tmdbId'] = links_df['tmdbId'].fillna(0)

#Checking the column datatypes (printed before the cast below)
print("\n",links_df.dtypes)

#Converting the datatype of 'tmdbId' column to int
links_df = links_df.astype({'tmdbId': int})
links_df.head(5)

In [None]:
#Inner join of the filtered movies with their external links, keyed on movieId;
#only movies present in both frames are kept
merged_movies_df = new_movies_df.merge(links_df, on='movieId', how='inner')
merged_movies_df.head()

In [None]:
from DataManager import RestClient

In [None]:
#Hand the merged movies/links frame to the project's RestClient and call fetch_data() on it.
#NOTE(review): what is requested and where the result is stored is defined in DataManager,
#not in this notebook - confirm before re-running this cell.
RestClient(merged_movies_df).fetch_data()

In [None]:
from WebScraper import Oscars

In [None]:
#Run the project's Oscars scraper.
#NOTE(review): the scraped source and the output destination are defined in WebScraper,
#not in this notebook - confirm before re-running this cell.
Oscars().scrape()

In [None]:
from PostgreSqlHelper import Connection

In [None]:
#Cursor used by the table-creation cells that follow (each calls cursor.execute).
#NOTE(review): "motion_pictures" is presumably the database name - confirm in PostgreSqlHelper.
#NOTE(review): no commit is visible in this section of the notebook; verify that the helper
#commits (or uses autocommit) so the created tables persist.
cursor = Connection("motion_pictures").create_cursor()

In [None]:
#Movie tables along with its multivalued attributes
#budget and bo are BIGINT rather than INT: PostgreSQL INT is 32-bit (max 2,147,483,647)
#and monetary totals can exceed that, which fails the insert with "integer out of range".
#(bo is presumably the box-office gross - confirm against the loader.)
#CREATE TABLE IF NOT EXISTS leaves an already existing Movie table untouched; such a
#database needs ALTER TABLE Movie ALTER COLUMN budget TYPE BIGINT (and likewise for bo).
cursor.execute("""CREATE TABLE IF NOT EXISTS Movie (
                    movieid INT PRIMARY KEY, 
                    title VARCHAR NOT NULL,
                    tagline VARCHAR NOT NULL,
                    plot VARCHAR NOT NULL,
                    released_date VARCHAR NOT NULL,
                    certificate VARCHAR NOT NULL,
                    runtime INT,
                    award_wins INT,
                    award_nominations INT,
                    oscar_win BOOLEAN,
                    oscar_nomination BOOLEAN,
                    budget BIGINT,
                    bo BIGINT,
                    popularity REAL,
                    is_adult BOOLEAN,
                    imdb_rating REAL,
                    metascore INT,
                    rotten_tomatoes INT,
                    tmdb_rating REAL) """)

#One row per (movie, language); movie_id references Movie(movieid)
cursor.execute(""" CREATE TABLE IF NOT EXISTS Language (movie_id INT, lang VARCHAR NOT NULL,
                    PRIMARY KEY (movie_id, lang),
                    CONSTRAINT fk_movie_lan FOREIGN KEY (movie_id) REFERENCES Movie(movieid))""")

#One row per (movie, genre type); movie_id references Movie(movieid)
cursor.execute(""" CREATE TABLE IF NOT EXISTS Genre (movie_id INT, type VARCHAR NOT NULL,
                    PRIMARY KEY (movie_id, type),
                    CONSTRAINT fk_movie_genre FOREIGN KEY (movie_id) REFERENCES Movie(movieid))""")

#One row per (movie, producer name); movie_id references Movie(movieid)
cursor.execute(""" CREATE TABLE IF NOT EXISTS Producer (movie_id INT, name VARCHAR NOT NULL,
                    PRIMARY KEY (movie_id, name),
                    CONSTRAINT fk_movie_prod FOREIGN KEY (movie_id) REFERENCES Movie(movieid))""")



In [None]:
#Director entity table and the Directed_by link table (director <-> movie)
director_table_sql = """ CREATE TABLE IF NOT EXISTS Director (id SERIAL PRIMARY KEY, 
                    name VARCHAR NOT NULL,
                    oscar_nominations INT,
                    oscar_wins INT) """

directed_by_table_sql = """ CREATE TABLE IF NOT EXISTS Directed_by (dir_id INT,
                    movie_id INT,
                    PRIMARY KEY (dir_id, movie_id),
                    CONSTRAINT fk_movie_rel FOREIGN KEY (movie_id) REFERENCES Movie(movieid),
                    CONSTRAINT fk_dir_rel FOREIGN KEY (dir_id) REFERENCES Director(id))"""

#Director is created first because Directed_by has a foreign key onto Director(id)
cursor.execute(director_table_sql)
cursor.execute(directed_by_table_sql)

In [None]:
#Writer entity table and the Written_by link table (writer <-> movie)
writer_table_sql = """ CREATE TABLE IF NOT EXISTS Writer (id SERIAL PRIMARY KEY, 
                    name VARCHAR NOT NULL,
                    oscar_nominations INT,
                    oscar_wins INT) """

written_by_table_sql = """ CREATE TABLE IF NOT EXISTS Written_by (writ_id INT,
                    movie_id INT,
                    PRIMARY KEY (writ_id, movie_id),
                    CONSTRAINT fk_movie_writ_rel FOREIGN KEY (movie_id) REFERENCES Movie(movieid),
                    CONSTRAINT fk_writ_rel FOREIGN KEY (writ_id) REFERENCES Writer(id))"""

#Writer is created first because Written_by has a foreign key onto Writer(id)
cursor.execute(writer_table_sql)
cursor.execute(written_by_table_sql)

In [None]:
#Actor entity table and the Acting_by link table (actor <-> movie)
actor_table_sql = """ CREATE TABLE IF NOT EXISTS Actor (id SERIAL PRIMARY KEY, 
                    name VARCHAR NOT NULL,
                    oscar_nominations INT,
                    oscar_wins INT) """

acting_by_table_sql = """ CREATE TABLE IF NOT EXISTS Acting_by (act_id INT,
                    movie_id INT,
                    PRIMARY KEY (act_id, movie_id),
                    CONSTRAINT fk_movie_act_rel FOREIGN KEY (movie_id) REFERENCES Movie(movieid),
                    CONSTRAINT fk_act_rel FOREIGN KEY (act_id) REFERENCES Actor(id))"""

#Actor is created first because Acting_by has a foreign key onto Actor(id)
cursor.execute(actor_table_sql)
cursor.execute(acting_by_table_sql)

In [None]:
#Parse data from MongoDB