# ETL Project - Transform

In [55]:
import pandas as pd

## Import the following files for Transformation

###  Netflix
- netflix_final.csv
- movies_revenue.csv

### Budget
- movies_num_final.csv
- movies_budget_df.csv
- movies_budget_df_revenue.csv

## Step I:  Merge

### Netflix
- Merge netflix_final and movies_revenue on title: **netflix_revenu_df**


### Budget
- Merge movies_budget_df and movies_budget_df_revenue on id: **movies_budget_revenue_df**

In [56]:
netflix = pd.read_csv('netflix_final.csv')
netflix.head()

Unnamed: 0,title,country,release_year,rating
0,Norm of the North: King Sized Adventure,"United States, India, South Korea, China",2019,TV-PG
1,Jandino: Whatever it Takes,United Kingdom,2016,TV-MA
2,#realityhigh,United States,2017,TV-14
3,Automata,"Bulgaria, United States, Spain, Canada",2014,R
4,Fabrizio Copano: Solo pienso en mi,Chile,2017,TV-MA


In [84]:
new_netflix = netflix[['title']]
new_netflix.head()

Unnamed: 0,title
0,Norm of the North: King Sized Adventure
1,Jandino: Whatever it Takes
2,#realityhigh
3,Automata
4,Fabrizio Copano: Solo pienso en mi


In [89]:
movies_revenue = pd.read_csv('movie_revenue.csv')
new_movie_revenue = movies_revenue[['title','revenue','vote_average','popularity']]
new_movie_revenue.head()

Unnamed: 0,title,revenue,vote_average,popularity
0,Norm of the North: King Sized Adventure,1442504,7.6,8.42
1,Jandino: Whatever it Takes,0,5.0,0.6
2,Automata,0,5.8,25.466
3,Fabrizio Copano: solo pienso en mí,0,5.3,1.126
4,Good People,0,5.5,28.195


In [99]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect
from pandas.io import sql
from sqlalchemy import Integer, ForeignKey, String, Column
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

In [81]:
rds_connection_string = "postgres:kyk32211@localhost:5432/ETL_DB"
engine = create_engine(f'postgresql://{rds_connection_string}')
engine

Engine(postgresql://postgres:***@localhost:5432/ETL_DB)

In [124]:
engine.table_names()

['new_movie_revenue', 'new_netflix']

In [90]:
new_movie_revenue.to_sql(name='new_movie_revenue', con=engine, if_exists='append', index=False)

In [92]:
new_netflix.to_sql(name='new_netflix', con=engine, if_exists='append', index=False)

In [118]:
movie_revenue_df = pd.read_sql_query('select * from new_movie_revenue', con=engine)
movie_revenue_df.head()

Unnamed: 0,title,revenue,vote_average,popularity
0,Norm of the North: King Sized Adventure,1442504,8,8
1,Jandino: Whatever it Takes,0,5,1
2,Automata,0,6,25
3,Fabrizio Copano: solo pienso en mí,0,5,1
4,Good People,0,6,28


In [117]:
netflix_df = pd.read_sql_query('select * from new_netflix', con=engine)
netflix_df.head()

Unnamed: 0,title
0,Norm of the North: King Sized Adventure
1,Jandino: Whatever it Takes
2,#realityhigh
3,Automata
4,Fabrizio Copano: Solo pienso en mi


In [102]:
Base = declarative_base()

In [106]:
class MOVIE(Base):
    __tablename__ = 'movie'
    id = Column(Integer, primary_key=True)
    name = Column(String)

    new_netflix_title = Column(String, ForeignKey("new_netflix.title"))
    new_movie_revenue_title = Column(String, ForeignKey("new_movie_revenue.title"))

    new_netflix_address = relationship("new_netflix", foreign_keys=[new_netflix.title])
    new_movie_revenue_address = relationship("new_netflix", foreign_keys=[new_movie_revenue.title])

In [120]:
session = Session(engine)

In [123]:
session.query(netflix_df.join(movie_revenue_df, netflix_df.title == movie_revenue_df.title, isouter=True)) 

ValueError: Can only compare identically-labeled Series objects

In [133]:
movie_num = pd.read_csv('movies_num_final.csv')

In [134]:
movie_num = movie_num.rename(columns={"Movie":"title"})
movie_num.head()

Unnamed: 0,ReleaseDate,title,ProductionBudget,DomesticGross,WorldwideGross
0,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,797,800,564"
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,063,875","$1,045,663,875"
2,"Apr 22, 2015",Avengers: Age of Ultron,"$365,000,000","$459,005,868","$1,396,099,202"
3,"Dec 16, 2015",Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,065,478,084"
4,"Apr 25, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,044,540,523"


In [126]:
movie_budget = pd.read_csv('movies_budget_df.csv')
movie_budget.head()

Unnamed: 0,id,title,vote_count,vote_average
0,299534,Avengers: Endgame,15589,8.3
1,1865,Pirates of the Caribbean: On Stranger Tides,10375,6.5
2,99861,Avengers: Age of Ultron,16755,7.3
3,299536,Avengers: Infinity War,20088,8.3
4,285,Pirates of the Caribbean: At World's End,10329,7.2


In [128]:
movie_budget_revenue = pd.read_csv('movies_budget_df_revenue.csv')
movie_budget_revenue.head()

Unnamed: 0,id,revenue,runtime,popularity
0,299534,2797800564,181.0,193.659
1,1865,1045713802,136.0,79.193
2,99861,1405403694,141.0,85.183
3,299536,2046239637,149.0,229.85
4,285,961000000,169.0,77.611


In [130]:
merge_revenue = pd.merge(movie_budget, movie_budget_revenue, on="id")
merge_revenue

Unnamed: 0,id,title,vote_count,vote_average,revenue,runtime,popularity
0,299534,Avengers: Endgame,15589,8.3,2797800564,181.0,193.659
1,1865,Pirates of the Caribbean: On Stranger Tides,10375,6.5,1045713802,136.0,79.193
2,99861,Avengers: Age of Ultron,16755,7.3,1405403694,141.0,85.183
3,299536,Avengers: Infinity War,20088,8.3,2046239637,149.0,229.85
4,299536,Avengers: Infinity War,20088,8.3,2046239637,149.0,229.85


In [135]:
merge_df = pd.merge(merge_revenue, movie_num, on="title")
merge_df.head()

Unnamed: 0,id,title,vote_count,vote_average,revenue,runtime,popularity,ReleaseDate,ProductionBudget,DomesticGross,WorldwideGross
0,299534,Avengers: Endgame,15589,8.3,2797800564,181.0,193.659,"Apr 23, 2019","$400,000,000","$858,373,000","$2,797,800,564"
1,1865,Pirates of the Caribbean: On Stranger Tides,10375,6.5,1045713802,136.0,79.193,"May 20, 2011","$379,000,000","$241,063,875","$1,045,663,875"
2,99861,Avengers: Age of Ultron,16755,7.3,1405403694,141.0,85.183,"Apr 22, 2015","$365,000,000","$459,005,868","$1,396,099,202"
3,299536,Avengers: Infinity War,20088,8.3,2046239637,149.0,229.85,"Apr 25, 2018","$300,000,000","$678,815,482","$2,044,540,523"
4,299536,Avengers: Infinity War,20088,8.3,2046239637,149.0,229.85,"Apr 25, 2018","$300,000,000","$678,815,482","$2,044,540,523"


## Step II:  Clean and Wrangle


###  **netflix_revenu_df**

- Inspect dataframe
- Check for missing values
- Drop Nan values and not zero values !IMPORTANT
- Convert numeric columns to numeric if necessary
- Save to a new dataframe and export as csv

###  **movies_budget_revenue_df**

- Inspect dataframe
- Check for missing values
- Drop Nan values and not zero values !IMPORTANT
- Remove $ and (,) in numeric columns
- Convert numeric columns to numeric if necessary
- Save to a new dataframe and export as csv