In [10]:
import pandas as pd
from sqlalchemy import create_engine
import configparser
import io
import boto3
import psycopg2
from functools import reduce

In [11]:
config = configparser.ConfigParser()

# read the configuration file
config.read('multi_config.ini')

# get all the connections
config.sections()

['postgresql', 'aws_s3', 'csv', 'stmplib']

In [12]:
'''
Authenticate the Postgres and S3 database by getting the credentials from the config file
'''
database = config.get('postgresql', 'database')
user = config.get('postgresql', 'user')
password = config.get('postgresql', 'password')
host = config.get('postgresql', 'host')
port = config.get('postgresql', 'port')

# AWS Credentials
service_name = config.get('aws_s3', 'service_name')
region_name = config.get('aws_s3', 'region_name')
aws_access_key_id = config.get('aws_s3', 'aws_access_key_id')
aws_secret_access_key = config.get('aws_s3', 'aws_secret_access_key')
s3_bucket = config.get('aws_s3', 's3_bucket')

# check creditials
print("Authentication successful \n")
print(f'The database is "{database}" and the service_name is "{service_name}"')

Authentication successful 

The database is "film_data" and the service_name is "s3"


In [17]:
# load local csv into dataframe
extracted_local_df = pd.read_csv('IMDB-Movie-Data-Local.csv') 
#load csv to dataframe to be loaded into postgres sql
to_load_warehouse_df = pd.read_csv('IMDB-Movie-Data-Postgres.csv') 

In [12]:
to_load_warehouse_df

Unnamed: 0,Title,Actors,Year,Runtime_Minutes
0,Guardians of the Galaxy,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121
1,Prometheus,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124
2,Split,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117
3,Sing,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108
4,Suicide Squad,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123
...,...,...,...,...
995,Secret in Their Eyes,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111
996,Hostel: Part II,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94
997,Step Up 2: The Streets,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98
998,Search Party,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93


In [20]:
## load to postgres
# determine table name
table_name = 'IMDB_movie_data'

# Create an engine instance
alchemyEngine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}/{database}', pool_recycle=3600)
# Connect to PostgreSQL server
dbConnection = alchemyEngine.connect()
# Upload data to sql database
to_load_warehouse_df.to_sql(table_name, dbConnection, if_exists='fail')
print(f'PostgreSQL Table, "{table_name}", has been created successfully.')

dbConnection.close()

ValueError: Table 'IMDB_movie_data' already exists.

In [18]:
## extract from s3
s3_resource = boto3.resource(
    service_name = service_name,
    region_name = region_name, 
    aws_access_key_id = aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

string_io = io.BytesIO()
s3_resource.Object(s3_bucket, "IMDB-Movie-Data-S3.csv").download_fileobj(string_io)
s3_contents = string_io.getvalue()

extracted_datalake_df = pd.read_csv(io.BytesIO(s3_contents))
extracted_datalake_df

Unnamed: 0,Title,Genre,Description,Director
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn
1,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott
2,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan
3,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet
4,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer
...,...,...,...,...
995,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray
996,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth
997,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu
998,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong


In [21]:
## extract from postgres sql
# Create an engine instance
alchemyEngine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}/{database}', pool_recycle=3600);

# Connect to PostgreSQL server
dbConnection = alchemyEngine.connect();

# Read data from PostgreSQL database table and load into a DataFrame instance
sql = f"select * from \"{table_name}\""
extracted_warehouse_df = pd.read_sql(sql, dbConnection);
pd.set_option('display.expand_frame_repr', False);

if dbConnection:
    dbConnection.close()
print("PostgreSQL connection is closed")

PostgreSQL connection is closed


In [22]:
# need to set index in order to join 
extracted_warehouse_df = extracted_warehouse_df.set_index('index')

In [23]:
# use merge function to join all three dataframes on index 
movie_data_df = pd.merge(pd.merge(extracted_datalake_df, extracted_local_df, left_index= True, right_index= True), extracted_warehouse_df, left_index= True, right_index= True)

In [24]:
# check joined dataframe- can see all three titles
movie_data_df.head()

Unnamed: 0,Title_x,Genre,Description,Director,Title_y,Rating,Votes,Revenue_Millions,Metascore,Title,Actors,Year,Runtime_Minutes
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,Guardians of the Galaxy,8.1,757074,333.13,76.0,Guardians of the Galaxy,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121
1,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,Prometheus,7.0,485820,126.46,65.0,Prometheus,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124
2,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,Split,7.3,157606,138.12,62.0,Split,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117
3,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,Sing,7.2,60545,270.32,59.0,Sing,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108
4,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,Suicide Squad,6.2,393727,325.02,40.0,Suicide Squad,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123


In [25]:
#compare if the join was successful if the titles match 
if movie_data_df['Title_x'].equals(movie_data_df['Title_y']) & movie_data_df['Title_x'].equals(movie_data_df['Title']) & movie_data_df['Title_y'].equals(movie_data_df['Title']):
    print('The join was successful')
else: 
    print('The join was unsuccessful')

The join was successful


In [26]:
# check and remove unnecessary title columns 
if movie_data_df['Title_x'].equals(movie_data_df['Title_y']) & movie_data_df['Title_x'].equals(movie_data_df['Title']) & movie_data_df['Title_y'].equals(movie_data_df['Title']):
    movie_data_df = movie_data_df.drop(['Title_y', 'Title_x'], axis= 1)
else: 
    print('The join was unsuccessful')

In [27]:
#check new df
movie_data_df.head()

Unnamed: 0,Genre,Description,Director,Rating,Votes,Revenue_Millions,Metascore,Title,Actors,Year,Runtime_Minutes
0,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,8.1,757074,333.13,76.0,Guardians of the Galaxy,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121
1,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,7.0,485820,126.46,65.0,Prometheus,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124
2,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,7.3,157606,138.12,62.0,Split,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117
3,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,7.2,60545,270.32,59.0,Sing,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108
4,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,6.2,393727,325.02,40.0,Suicide Squad,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123


In [22]:
check_df = pd.read_csv('IMDB-Movie-Data.csv')

check_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [111]:
movie_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Genre             1000 non-null   object 
 1   Description       1000 non-null   object 
 2   Director          1000 non-null   object 
 3   Rating            1000 non-null   float64
 4   Votes             1000 non-null   int64  
 5   Revenue_Millions  872 non-null    float64
 6   Metascore         936 non-null    float64
 7   Title             1000 non-null   object 
 8   Actors            1000 non-null   object 
 9   Year              1000 non-null   int64  
 10  Runtime_Minutes   1000 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 86.1+ KB


In [28]:
sum_report_df = movie_data_df.groupby(['Genre']).sum().sort_values(by = 'Revenue_Millions', ascending = False)
mean_report_df = movie_data_df.groupby(['Genre']).mean().sort_values(by = 'Revenue_Millions', ascending = False)

  sum_report_df = movie_data_df.groupby(['Genre']).sum().sort_values(by = 'Revenue_Millions', ascending = False)
  mean_report_df = movie_data_df.groupby(['Genre']).mean().sort_values(by = 'Revenue_Millions', ascending = False)


In [122]:
mean_report_df

Unnamed: 0_level_0,Rating,Votes,Revenue_Millions,Metascore,Year,Runtime_Minutes
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adventure,Drama,Fantasy",6.150000,274020.000000,276.008000,64.200000,2012.000000,117.333333
"Adventure,Fantasy",7.733333,522664.333333,272.156667,61.000000,2013.000000,158.000000
"Action,Sci-Fi",7.300000,484529.333333,232.780000,64.000000,2012.333333,117.333333
"Action,Adventure",6.400000,334459.000000,223.740000,57.333333,2011.000000,124.333333
"Animation,Adventure,Comedy",7.200000,219002.407407,221.336538,68.730769,2013.037037,97.000000
...,...,...,...,...,...,...
"Fantasy,Horror,Thriller",3.200000,173.000000,,57.000000,2016.000000,133.000000
"Fantasy,Mystery,Thriller",6.900000,72533.000000,,66.000000,2009.000000,99.000000
"Mystery,Thriller,Western",7.100000,13004.000000,,44.000000,2016.000000,148.000000
"Romance,Sci-Fi",6.100000,512.000000,,65.000000,2016.000000,118.000000


In [125]:
test_sum_report_df = check_df.groupby(['Genre']).sum().sort_values(by = 'Revenue (Millions)', ascending = False)
test_mean_report_df = check_df.groupby(['Genre']).mean().sort_values(by = 'Revenue (Millions)', ascending = False)

  test_sum_report_df = check_df.groupby(['Genre']).sum().sort_values(by = 'Revenue (Millions)', ascending = False)
  test_mean_report_df = check_df.groupby(['Genre']).mean().sort_values(by = 'Revenue (Millions)', ascending = False)


In [126]:
test_sum_report_df

Unnamed: 0_level_0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Action,Adventure,Sci-Fi",16615,100623,6368,336.2,18582076,10461.51,2756.0
"Animation,Adventure,Comedy",11320,54352,2619,194.4,5913065,5754.75,1787.0
"Action,Adventure,Fantasy",10501,54317,3190,170.9,7816851,5248.29,1300.0
"Adventure,Family,Fantasy",5838,28165,1715,93.8,2640649,2201.47,787.0
Comedy,16291,64440,3229,193.0,3685529,1941.81,1458.0
...,...,...,...,...,...,...,...
"Animation,Drama,Romance",862,2016,129,8.4,2421,0.00,80.0
"Comedy,Sci-Fi",399,2015,85,6.0,26587,0.00,31.0
"Comedy,Western",643,2015,119,4.8,31149,0.00,18.0
"Drama,Family",696,2009,93,8.1,177602,0.00,61.0


In [24]:
sum_report_df

Unnamed: 0_level_0,Rating,Votes,Revenue_Millions,Metascore,Year,Runtime_Minutes
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Action,Adventure,Sci-Fi",336.2,18582076,10461.51,2756.0,100623,6368
"Animation,Adventure,Comedy",194.4,5913065,5754.75,1787.0,54352,2619
"Action,Adventure,Fantasy",170.9,7816851,5248.29,1300.0,54317,3190
"Adventure,Family,Fantasy",93.8,2640649,2201.47,787.0,28165,1715
Comedy,193.0,3685529,1941.81,1458.0,64440,3229
...,...,...,...,...,...,...
"Animation,Drama,Romance",8.4,2421,0.00,80.0,2016,129
"Comedy,Sci-Fi",6.0,26587,0.00,31.0,2015,85
"Comedy,Western",4.8,31149,0.00,18.0,2015,119
"Drama,Family",8.1,177602,0.00,61.0,2009,93


In [5]:

## testing connection to send email
import smtplib, ssl

smtp_server = 'smtp.gmail.com'
port = 465

sender = 'jreay.data.eng@gmail.com'
password_s = 'omnkaiawatqvatua'

context = ssl.create_default_context()

with smtplib.SMTP_SSL(smtp_server, port, context= context) as server:
    server.login(sender, password_s)
    print('It worked!')

It worked!


In [15]:
## send tester email without attachment 

import smtplib, ssl
smtp_port = config.get('stmplib', 'smtp_port')
smtp_server = config.get('stmplib', 'smtp_server')
sender_email = config.get('stmplib', 'smtp_sender_email')
receiver_email = config.get('stmplib', 'smtp_receiver_email')
smtp_password = config.get('stmplib', 'smtp_password')
message = """\
Subject: Hi there 3
Im sending an email through python code."""
context = ssl.create_default_context()
with smtplib.SMTP(smtp_server, smtp_port) as server:
    server.ehlo() 
    server.starttls(context=context)
    server.ehlo() 
    server.login(sender_email, smtp_password)
    server.sendmail(sender_email, receiver_email, message)

In [None]:
##to do

# import smtplib
# from email.mime.text import MIMEText
# from email.mime.multipart import MIMEMultipart
# from email.mime.base import MIMEBase
# from email import encoders
# import os.path

# email = 'your@gmail.com'
# password = '***********'
# send_to_email = 'others@gmail.com'
# subject = 'Sending Email with an attachment'
# message = 'Please find the attachment to email, thanks'
# file_location = 'C:\\Users\\DELL E7440\\Desktop\\pyspark - Copy.txt'

# msg = MIMEMultipart()
# msg['From'] = email
# msg['To'] = send_to_email
# msg['Subject'] = subject

# msg.attach(MIMEText(message, 'plain'))

# # Setup the attachment
# filename = os.path.basename(file_location)
# attachment = open(file_location, "rb")
# part = MIMEBase('application', 'octet-stream')
# part.set_payload(attachment.read())
# encoders.encode_base64(part)
# part.add_header('Content-Disposition', "attachment; filename= %s" % filename)

# # Attach the attachment to the MIMEMultipart object
# msg.attach(part)

# server = smtplib.SMTP('smtp.gmail.com', 587)
# server.starttls()
# server.login(email, password)
# text = msg.as_string()
# server.sendmail(email, send_to_email, text)
# server.quit()