In [1]:
import pandas as pd
from IPython.display import display
import mysql.connector
import os
from dotenv import load_dotenv
import pymysql
from sqlalchemy import create_engine
import firebase_admin
from firebase_admin import credentials, storage
from datetime import datetime
from google.cloud import bigquery

In [2]:
load_dotenv()

True

# Extract Data

In [3]:
raw_data = []
raw_data.clear()

def ingestion(table):
    load_dotenv()
    connection = mysql.connector.connect(
    host=os.getenv('host'),
    user='root',
    password=os.getenv('password'),
    database='capstone5'
    )
    query = f"SELECT * FROM {table}"  
    df = pd.read_sql(query, connection)
    connection.close()

    output_path = os.path.join('Data_Loaded', 'Raw_Data', f'{table}.csv')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)

    raw_data.append(output_path)
    return df

In [4]:
df_applications = ingestion('applications')
df_articles = ingestion('articles')
df_comments = ingestion('comments')
df_donations = ingestion('donations_manual')
df_donation_comments = ingestion('donation_comments')
df_fundraising_categories = ingestion('fundraising_categories')
df_fundraisings = ingestion('fundraisings')
df_users = ingestion('users')
df_likes_comments = ingestion('likes_comments')
df_organizations = ingestion('organizations')
df_testimoni_volunteers = ingestion('testimoni_volunteers')
df_bookmark_fundraising = ingestion('user_bookmark_fundraising')
df_bookmark_volunteer = ingestion('user_bookmark_volunteer_vacancies')
df_volunteer_vacancies = ingestion('volunteers')

  df = pd.read_sql(query, connection)


# Transform

### Handle Missing Value & Duplicates

In [5]:
def cleaning_data(df, name_df):
    # cek duplikat
    duplicates = df[df.duplicated(subset=df.columns, keep=False)]
    if not duplicates.empty:
        print(f"Terdapat duplikat pada {name_df}")
        print(duplicates)
        df = df.drop_duplicates()
    else:
        print(f"Tidak ada data duplikat pada {name_df}")
    
    # cek missing value
    missing_columns = [col for col in df.columns if col not in ['created_at','updated_at', 'deleted_at']]
    df[missing_columns] = df[missing_columns].replace('', None)
    if df[missing_columns].isnull().any().any():
        print(f"Terdapat missing value dalam {name_df}")
        missing_sum = df[missing_columns].isnull().sum()
        print(missing_sum)
        
        for col in missing_columns:
            if df[col].dtype != 'object' and df[col].dtype != 'datetime64[ns]':
                df[col] = df[col].fillna(0)
            elif df[col].dtype == 'datetime64[ns]':
                df[col] = df[col].fillna(pd.NaT)
            else:
                df[col] = df[col].fillna('-')
    else :
        print(f"Tidak ada missing value dalam {name_df}")
        missing_sum = df[missing_columns].isnull().sum()
        print(missing_sum)
    return df

##### Check Missing Value & Duplicate

In [6]:
df_applications = cleaning_data(df_applications, "Application")
df_articles = cleaning_data(df_articles, "Article")
df_comments = cleaning_data(df_comments, "Comment")
df_donations = cleaning_data(df_donations, "Donation")
df_donation_comments = cleaning_data(df_donation_comments, "Donation_Comments")
df_fundraising_categories = cleaning_data(df_fundraising_categories, "Fundraising_Categories")
df_fundraisings = cleaning_data(df_fundraisings, "Fundraising")
df_users = cleaning_data(df_users, "Customer")
df_likes_comments = cleaning_data(df_likes_comments, "Like_Comments")
df_organizations = cleaning_data(df_organizations, "Organization")
df_testimoni_volunteers = cleaning_data(df_testimoni_volunteers, "Testimoni_Volunteer")
df_bookmark_fundraising = cleaning_data(df_bookmark_fundraising, "Bookmark_Fundraising")
df_bookmark_volunteer = cleaning_data(df_bookmark_volunteer, "Bookmark_Volunteer")
df_volunteer_vacancies = cleaning_data(df_volunteer_vacancies, "Volunteer_Vacancies")

Tidak ada data duplikat pada Application
Terdapat missing value dalam Application
id                0
ig_image_url      0
yt_image_url      0
user_id           0
vacancy_id        0
job               0
reason            0
age             123
dtype: int64
Tidak ada data duplikat pada Article
Terdapat missing value dalam Article
id           0
admin_id     1
title        0
content      0
image_url    0
dtype: int64
Tidak ada data duplikat pada Comment
Terdapat missing value dalam Comment
id             0
user_id        0
article_id     0
comment        0
total_likes    1
dtype: int64
Tidak ada data duplikat pada Donation
Tidak ada missing value dalam Donation
id                0
user_id           0
amount            0
status            0
bukti_tf_url      0
fundraising_id    0
dtype: int64
Tidak ada data duplikat pada Donation_Comments
Tidak ada missing value dalam Donation_Comments
id             0
donation_id    0
comment        0
total_likes    0
dtype: int64
Tidak ada data duplikat p

### Tabel Fakta

##### 1. FactDonationTransaction

In [7]:
# Buat Struktur kolom df_fact_donation_transaction
columns = ['id', 'donation_id', 'fundraising_id', 'user_id', 'amount', 'goal_amount', 'fundraising_category_id', 'organization_id', 'donation_created_at']
df_fact_donation = pd.DataFrame(columns=columns)

In [8]:
# mengambil data yang sukses di df_donation 
df_donation_success = df_donations.loc[df_donations['status'] == 'sukses']
df_donation_success = df_donation_success.reset_index(drop=True)

In [9]:
# mengisi data pada dari kolom df_donation
df_fact_donation['id'] = range(1, len(df_donation_success) + 1)
df_fact_donation['donation_id'] = df_donation_success['id']
df_fact_donation['fundraising_id'] = df_donation_success['fundraising_id']
df_fact_donation['user_id'] = df_donation_success['user_id']
df_fact_donation['amount'] = df_donation_success['amount']
df_fact_donation['donation_created_at'] = df_donation_success['created_at']

In [10]:
# merge df_fundraising
df_merge_fact_fundraising = pd.merge(df_fact_donation, df_fundraisings, left_on='fundraising_id', right_on='id', how='left')
df_fact_donation['goal_amount'] = df_merge_fact_fundraising['goal_amount_y']
df_fact_donation['fundraising_category_id'] = df_merge_fact_fundraising['fundraising_category_id_y']
df_fact_donation['organization_id'] = df_merge_fact_fundraising['organization_id_y']
df_fact_donation

Unnamed: 0,id,donation_id,fundraising_id,user_id,amount,goal_amount,fundraising_category_id,organization_id,donation_created_at
0,1,16,10,51,900000,6000000,1,7,2024-06-08 19:33:48.384
1,2,17,6,52,2000000,5000000,1,7,2024-06-08 19:33:48.384
2,3,18,11,53,400000,25000000,1,9,2024-06-08 19:33:48.384
3,4,19,7,54,30000,10000000,1,8,2024-06-08 19:33:48.384
4,5,20,28,55,2400000,5000000,4,11,2024-06-08 19:33:48.384
...,...,...,...,...,...,...,...,...,...
82,83,98,14,65,150000,8000000,2,13,2024-06-08 19:33:48.384
83,84,99,14,66,15000,8000000,2,13,2024-06-08 19:33:48.384
84,85,100,15,64,550000,6000000,2,13,2024-06-08 19:33:48.384
85,86,101,15,85,15000,6000000,2,13,2024-06-08 19:33:48.384


##### 2. Fact Volunteer Applications

In [21]:
# Buat Struktur kolom df_fact_volunteer_applications
columns = ['id', 'application_id', 'vacancy_id', 'user_id', 'organization_id', 'application_created_at']
df_fact_applications = pd.DataFrame(columns=columns)

In [22]:
# mengisi data pada dari kolom df_application
df_fact_applications['id'] = range(1, len(df_applications) + 1)
df_fact_applications['application_id'] = df_applications['id']
df_fact_applications['vacancy_id'] = df_applications['vacancy_id']
df_fact_applications['user_id'] = df_applications['user_id']
df_fact_applications['application_created_at'] = df_applications['created_at']

In [23]:
# merge df_fundraising
df_merge_fact_volunteer = pd.merge(df_fact_applications, df_volunteer_vacancies, left_on='vacancy_id', right_on='id', how='left')
df_fact_applications['organization_id'] = df_merge_fact_volunteer['organization_id_y']
df_fact_applications

Unnamed: 0,id,application_id,vacancy_id,user_id,organization_id,application_created_at
0,1,1,3,54,11,2024-06-07 13:08:19.244
1,2,4,1,54,11,2024-06-08 13:08:19.244
2,3,5,1,55,11,NaT
3,4,6,1,56,11,NaT
4,5,7,1,58,11,NaT
...,...,...,...,...,...,...
121,122,159,31,55,11,NaT
122,123,160,31,56,11,NaT
123,124,161,31,58,11,NaT
124,125,162,31,59,11,NaT


##### 3. Fact Volunteer Testimoni

In [11]:
# Buat Struktur kolom df_fact_volunteer_testimoni
columns = ['id', 'user_id', 'vacancy_id', 'testimoni_volunteer_id', 'rating']
df_fact_volunteer_testimoni = pd.DataFrame(columns=columns)

In [12]:
# mengisi data pada dari kolom df_application
df_fact_volunteer_testimoni['id'] = range(1, len(df_testimoni_volunteers) + 1)
df_fact_volunteer_testimoni['user_id'] = df_testimoni_volunteers['user_id']
df_fact_volunteer_testimoni['vacancy_id'] = df_testimoni_volunteers['vacancy_id']
df_fact_volunteer_testimoni['testimoni_volunteer_id'] = df_testimoni_volunteers['id']
df_fact_volunteer_testimoni['rating'] = df_testimoni_volunteers['rating']
df_fact_volunteer_testimoni

Unnamed: 0,id,user_id,vacancy_id,testimoni_volunteer_id,rating
0,1,54,1,1,4
1,2,55,1,2,3
2,3,56,1,3,4
3,4,58,1,4,3
4,5,54,2,5,4
5,6,55,2,6,5
6,7,56,2,7,4
7,8,58,2,8,5
8,9,67,5,9,4
9,10,68,5,10,4


##### 4. Fact Article Popular

In [25]:
# Buat Struktur kolom df_fact_articel_popular
columns = ['id', 'article_id', 'comment_id', 'user_id', 'comment_created_at']
df_fact_article_popular = pd.DataFrame(columns=columns)

In [26]:
# mengisi data pada dari kolom df_comment
df_fact_article_popular['id'] = range(1, len(df_comments) + 1)
df_fact_article_popular['article_id'] = df_comments['article_id']
df_fact_article_popular['comment_id'] = df_comments['id']
df_fact_article_popular['user_id'] = df_comments['user_id']
df_fact_article_popular['comment_created_at'] = df_comments['created_at']
df_fact_article_popular

Unnamed: 0,id,article_id,comment_id,user_id,comment_created_at
0,1,32,1,73,2024-06-09 05:57:01.302
1,2,32,2,73,2024-06-09 13:20:06.191


##### 5. Fact Bookmark Fundraising

In [27]:
df_fact_bookmark_fundraising = df_bookmark_fundraising.drop(['deleted_at', 'updated_at'], axis=1)
df_fact_bookmark_fundraising = df_fact_bookmark_fundraising.rename(columns={'id':'bookmark_id'})
df_fact_bookmark_fundraising['id'] = range(1, len(df_fact_bookmark_fundraising) + 1)
df_fact_bookmark_fundraising.insert(0, 'id', df_fact_bookmark_fundraising.pop('id'))
df_fact_bookmark_fundraising

Unnamed: 0,id,bookmark_id,created_at,user_id,fundraising_id
0,1,16,2024-06-06 20:43:56.908,61,6
1,2,17,2024-06-06 20:43:56.908,62,7
2,3,18,2024-06-06 20:43:56.908,63,10
3,4,19,2024-06-06 20:43:56.908,64,12
4,5,20,2024-06-06 20:43:56.908,65,11
5,6,21,2024-06-07 18:16:42.151,81,1
6,7,22,2024-06-07 18:16:42.151,72,2
7,8,23,2024-06-07 18:16:42.151,53,3
8,9,24,2024-06-07 18:16:42.151,54,9
9,10,25,2024-06-07 18:16:42.151,55,8


##### 6. Fact Bookmark Volunteer Vacancies

In [28]:
df_fact_bookmark_volunteer_vacancies = df_bookmark_volunteer.drop(['deleted_at', 'updated_at'], axis=1)
df_fact_bookmark_volunteer_vacancies = df_fact_bookmark_volunteer_vacancies.rename(columns={'id':'bookmark_id'})
df_fact_bookmark_volunteer_vacancies['id'] = range(1, len(df_fact_bookmark_volunteer_vacancies) + 1)
df_fact_bookmark_volunteer_vacancies.insert(0, 'id', df_fact_bookmark_volunteer_vacancies.pop('id'))
df_fact_bookmark_volunteer_vacancies

Unnamed: 0,id,bookmark_id,created_at,user_id,volunteer_vacancies_id
0,1,16,2024-06-07 07:19:52.737,1,2
1,2,17,2024-06-07 07:19:52.737,52,4
2,3,18,2024-06-07 07:19:52.737,53,6
3,4,19,2024-06-07 07:19:52.737,54,3
4,5,20,2024-06-07 07:19:52.737,75,5
5,6,21,2024-06-07 18:27:48.688,61,1
6,7,22,2024-06-07 18:27:48.688,52,5
7,8,23,2024-06-07 18:27:48.688,73,7
8,9,24,2024-06-07 18:27:48.688,64,8
9,10,25,2024-06-07 18:27:48.688,65,9


### Tabel Dimensi

In [29]:
dim_fundraisings = df_fundraisings.drop(['fundraising_category_id','organization_id','updated_at'], axis=1)
dim_fundraising_categories = df_fundraising_categories[['id','name']]
dim_donation = df_donations.drop(['fundraising_id', 'user_id','updated_at'], axis=1)
dim_organization = df_organizations.drop(['updated_at'], axis= 1)
dim_user = df_users.drop(['updated_at'], axis=1)
dim_volunteer_applictaion = df_applications.drop(['user_id','vacancy_id','updated_at'], axis=1)
dim_volunteer_vacancies = df_volunteer_vacancies.drop(['organization_id','updated_at'], axis=1)
dim_testimoni_volunteer = df_testimoni_volunteers.drop(['user_id','vacancy_id','updated_at'], axis=1)
dim_article = df_articles.drop(['updated_at'], axis = 1)
dim_comment = df_comments.drop(['user_id','article_id','updated_at'], axis=1)
dim_bookmark_fundraising = df_bookmark_fundraising.drop(['fundraising_id','user_id','updated_at'], axis=1)
dim_bookmark_volunter_vacancies = df_bookmark_volunteer.drop(['volunteer_vacancies_id','user_id','updated_at'], axis=1)

# Load

##### 1. Load To CSV

In [30]:
df_fact_donation.to_csv("Data_Loaded/Fact/fact_donation_transaction.csv", index=False)
df_fact_applications.to_csv("Data_Loaded/Fact/fact_volunteer_applications.csv", index=False)
df_fact_volunteer_testimoni.to_csv("Data_Loaded/Fact/fact_volunteer_testimoni.csv", index=False)
df_fact_article_popular.to_csv("Data_Loaded/Fact/fact_article_popular.csv", index=False)
df_fact_bookmark_fundraising.to_csv("Data_Loaded/Fact/fact_bookmark_fundraising.csv", index=False)
df_fact_bookmark_volunteer_vacancies.to_csv("Data_Loaded/Fact/fact_bookmark_volunteer_vacancies.csv", index=False)
dim_article.to_csv("Data_Loaded/Dimensi/dim_article.csv", index=False)
dim_bookmark_fundraising.to_csv("Data_Loaded/Dimensi/dim_bookmark_fundraising.csv", index=False)
dim_bookmark_volunter_vacancies.to_csv("Data_Loaded/Dimensi/dim_bookmark_volunteer_vacancies.csv", index=False)
dim_comment.to_csv("Data_Loaded/Dimensi/dim_comment.csv", index=False)
dim_donation.to_csv("Data_Loaded/Dimensi/dim_donation.csv", index=False)
dim_fundraising_categories.to_csv("Data_Loaded/Dimensi/dim_fundraising_categories.csv", index=False)
dim_fundraisings.to_csv("Data_Loaded/Dimensi/dim_fundraising.csv", index=False)
dim_user.to_csv("Data_Loaded/Dimensi/dim_user.csv", index=False)
dim_testimoni_volunteer.to_csv("Data_Loaded/Dimensi/dim_testimoni_volunteer.csv", index=False)
dim_volunteer_vacancies.to_csv("Data_Loaded/Dimensi/dim_volunteer_vacancies.csv", index=False)
dim_volunteer_applictaion.to_csv("Data_Loaded/Dimensi/dim_volunteer_application.csv", index=False)
dim_organization.to_csv("Data_Loaded/Dimensi/dim_organization.csv", index=False)

##### 2. Load To DB Local

In [31]:
def load_db_local(df, table_name):
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='',
        port=int(os.getenv('port'))
    )

    cursor = connection.cursor()

    cursor.execute('CREATE DATABASE IF NOT EXISTS peduli_pintar')
    cursor.execute('USE peduli_pintar')

    engine = create_engine('mysql+pymysql://root:@localhost:3307/peduli_pintar')
    df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)
    connection.close()

In [32]:
load_db_local(df_fact_applications, 'fact_applications')
load_db_local(df_fact_article_popular, 'fact_article_popular')
load_db_local(df_fact_bookmark_fundraising, 'fact_bookmark_fundraising')
load_db_local(df_fact_bookmark_volunteer_vacancies, 'fact_bookmark_volunteer_vacancies')
load_db_local(df_fact_donation, 'fact_donation')
load_db_local(df_fact_volunteer_testimoni, 'fact_volunteer_testimoni')
load_db_local(dim_fundraisings, 'dim_fundraisings')
load_db_local(dim_fundraising_categories, 'dim_fundraising_categories')
load_db_local(dim_donation, 'dim_donation')
load_db_local(dim_organization, 'df_organization')
load_db_local(dim_user, 'dim_user')
load_db_local(dim_volunteer_applictaion, 'dim_volunteer_applictaion')
load_db_local(dim_volunteer_vacancies, 'dim_volunteer_vacancies')
load_db_local(dim_testimoni_volunteer, 'dim_testimoni_volunteer')
load_db_local(dim_article, 'dim_article')
load_db_local(dim_comment, 'dim_comment')
load_db_local(dim_bookmark_fundraising, 'dim_bookmark_fundraising')
load_db_local(dim_bookmark_volunter_vacancies, 'dim_bookmark_volunter_vacancies')

##### 3. Load To Cloud Storage (Firebase)

In [33]:
def initialize_firebase():
    if not firebase_admin._apps:
        # load_dotenv()
        credentials_path = os.getenv('FIREBASE_CREDENTIALS_PATH')
        cred = credentials.Certificate(credentials_path)
        firebase_admin.initialize_app(cred)
        print("Firebase has been initialized")
    else:
        print("Firebase is already initialized")
    bucket_name = os.getenv('BUCKET_NAME')
    return storage.bucket(bucket_name)

In [34]:
def upload_files_to_firebase(files):
    bucket = initialize_firebase()
    for file in files:
        file_name = os.path.basename(file)
        file_ref = bucket.blob(file_name)
        file_ref.upload_from_filename(file)
        print(f"File {file_name} uploaded successfully!")

In [35]:
def upload_files_to_firebase(files):
    bucket = initialize_firebase()
    for file in files:
        current_date = datetime.now().strftime("%Y%m%d")
        file_name = os.path.basename(file)
        file_name_with_date = f"{file_name}_{current_date}"
        
        file_ref = bucket.blob(file_name_with_date)
        file_ref.upload_from_filename(file)
        print(f"File {file_name_with_date} uploaded successfully!")

In [36]:
upload_files_to_firebase(raw_data)

Firebase has been initialized
File applications.csv_20240612 uploaded successfully!
File articles.csv_20240612 uploaded successfully!
File comments.csv_20240612 uploaded successfully!
File donations_manual.csv_20240612 uploaded successfully!
File donation_comments.csv_20240612 uploaded successfully!
File fundraising_categories.csv_20240612 uploaded successfully!
File fundraisings.csv_20240612 uploaded successfully!
File users.csv_20240612 uploaded successfully!
File likes_comments.csv_20240612 uploaded successfully!
File organizations.csv_20240612 uploaded successfully!
File testimoni_volunteers.csv_20240612 uploaded successfully!
File user_bookmark_fundraising.csv_20240612 uploaded successfully!
File user_bookmark_volunteer_vacancies.csv_20240612 uploaded successfully!
File volunteers.csv_20240612 uploaded successfully!


##### 4. Load To Bigquery

In [37]:
csv_files_fact_and_dim = [
    'fact_article_popular.csv',
    'fact_bookmark_fundraising.csv',
    'fact_bookmark_volunteer_vacancies.csv',
    'fact_donation_transaction.csv',
    'fact_volunteer_applications.csv',
    'fact_volunteer_testimoni.csv',
    'dim_article.csv',
    'dim_bookmark_fundraising.csv',
    'dim_bookmark_volunteer_vacancies.csv',
    'dim_comment.csv',
    'dim_donation.csv',
    'dim_fundraising_categories.csv',
    'dim_fundraising.csv',
    'dim_organization.csv',
    'dim_testimoni_volunteer.csv',
    'dim_user.csv',
    'dim_volunteer_application.csv',
    'dim_volunteer_vacancies.csv'
]

In [38]:
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if credentials_path:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
else:
    raise Exception("GOOGLE_APPLICATION_CREDENTIALS is not set in the .env file")

client = bigquery.Client()

dataset_id = os.getenv("dataset_id")
data_folder = "Data_Loaded/Fact/"

In [39]:
def load_csv_to_bigquery(file_name):
    table_id = f"{dataset_id}.{file_name.split('.')[0]}"
    file_path = os.path.join(data_folder, file_name)

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)
    
    job.result()

    table = client.get_table(table_id)
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )

In [40]:
for csv_file in csv_files_fact_and_dim:
    try:
        load_csv_to_bigquery(csv_file)
    except Exception as e:
        print(f"Failed to load {csv_file}: {e}")

Loaded 2 rows and 5 columns to peduli_pintar.fact_article_popular
Loaded 25 rows and 5 columns to peduli_pintar.fact_bookmark_fundraising
Loaded 21 rows and 5 columns to peduli_pintar.fact_bookmark_volunteer_vacancies
Loaded 87 rows and 9 columns to peduli_pintar.fact_donation_transaction
Loaded 126 rows and 6 columns to peduli_pintar.fact_volunteer_applications
Loaded 24 rows and 5 columns to peduli_pintar.fact_volunteer_testimoni
Failed to load dim_article.csv: [Errno 2] No such file or directory: 'Data_Loaded/Fact/dim_article.csv'
Failed to load dim_bookmark_fundraising.csv: [Errno 2] No such file or directory: 'Data_Loaded/Fact/dim_bookmark_fundraising.csv'
Failed to load dim_bookmark_volunteer_vacancies.csv: [Errno 2] No such file or directory: 'Data_Loaded/Fact/dim_bookmark_volunteer_vacancies.csv'
Failed to load dim_comment.csv: [Errno 2] No such file or directory: 'Data_Loaded/Fact/dim_comment.csv'
Failed to load dim_donation.csv: [Errno 2] No such file or directory: 'Data_Load