In [46]:
import pandas as pd
from IPython.display import display
import mysql.connector
import os
from dotenv import load_dotenv
import pymysql
from sqlalchemy import create_engine

# Extract Data

In [47]:
def ingestion(table):
    """Load every row of one MySQL table into a DataFrame.

    The host and password are read from the environment variables ``host``
    and ``password`` (after ``load_dotenv()`` has loaded a ``.env`` file, if
    any); the user (``root``) and schema (``capstone5``) are fixed.

    Parameters
    ----------
    table : str
        Name of the table to read. It is interpolated directly into the SQL
        text (identifiers cannot be bound as query parameters), so it must
        come from trusted code, never from user input.

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT * FROM <table>``.
    """
    load_dotenv()
    connection = mysql.connector.connect(
        host=os.getenv('host'),
        user='root',
        password=os.getenv('password'),
        database='capstone5',
    )
    try:
        query = f"SELECT * FROM {table}"
        return pd.read_sql(query, connection)
    finally:
        # Always release the connection, even when the query fails
        # (e.g. unknown table), so repeated calls do not leak sessions.
        connection.close()

In [48]:
# Pull each source table into its own raw DataFrame (one query per table).
df_applications = ingestion('applications')
df_articles = ingestion('articles')
df_comments = ingestion('comments')
# NOTE: the donations frame is read from the `donations_manual` table.
df_donations = ingestion('donations_manual')
df_donation_comments = ingestion('donation_comments')
df_fundraising_categories = ingestion('fundraising_categories')
df_fundraisings = ingestion('fundraisings')
df_users = ingestion('users')
df_likes_comments = ingestion('likes_comments')
df_organizations = ingestion('organizations')
df_testimoni_volunteers = ingestion('testimoni_volunteers')
# NOTE: the next three frame names are not the same as their table names.
df_bookmark_fundraising = ingestion('user_bookmark_fundraising')
df_bookmark_volunteer = ingestion('user_bookmark_volunteer_vacancies')
df_volunteer_vacancies = ingestion('volunteers')

  df = pd.read_sql(query, connection)


In [None]:
# Preview only the first rows of each raw table. Rendering whole frames bloats
# the saved notebook and stores full records (including the users table) in
# its output.
display(*(
    raw_frame.head()
    for raw_frame in (
        df_applications,
        df_articles,
        df_comments,
        df_donations,
        df_donation_comments,
        df_fundraising_categories,
        df_fundraisings,
        df_users,
        df_likes_comments,
        df_testimoni_volunteers,
        df_volunteer_vacancies,
    )
))

# Transform

### Handle Missing Values & Duplicates

In [49]:
def cleaning_data(df, name_df):
    """Report and repair duplicate rows and missing values in one table.

    Prints a short report (duplicates found or not, missing-value count per
    column) and returns a cleaned frame. The input frame is never modified.

    Cleaning rules:
      * Fully identical rows are collapsed to their first occurrence.
      * In every column except ``created_at`` / ``updated_at`` /
        ``deleted_at``, empty strings are treated as missing.
      * If anything is missing, missing values are filled per column:
        ``'-'`` for text/object columns, ``0`` for other non-datetime
        columns; datetime columns keep ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean.
    name_df : str
        Human-readable table name used in the printed report.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy. Its index is not renumbered after duplicates are
        dropped.
    """
    # Check for duplicates: list every row that has an identical twin.
    duplicates = df[df.duplicated(keep=False)]
    if not duplicates.empty:
        print(f"Terdapat duplikat pada {name_df}")
        print(duplicates)
        df = df.drop_duplicates()
    else:
        print(f"Tidak ada data duplikat pada {name_df}")

    # Work on a private copy: the column assignments below would otherwise
    # write into the caller's frame (when nothing was dropped) or into the
    # result of drop_duplicates().
    df = df.copy()

    # Check for missing values; the audit timestamps are left out.
    missing_columns = [col for col in df.columns if col not in ['created_at', 'updated_at', 'deleted_at']]
    if missing_columns:
        # Dict form on purpose: replace('', None) is version-dependent, since
        # older pandas reads value=None as "no value given" and forward-fills.
        df[missing_columns] = df[missing_columns].replace({'': None})

    missing_sum = df[missing_columns].isnull().sum()
    if missing_sum.any():
        print(f"Terdapat missing value dalam {name_df}")
        print(missing_sum)

        for col in missing_columns:
            dtype = df[col].dtype
            if pd.api.types.is_datetime64_any_dtype(dtype):
                # Covers tz-aware and non-nanosecond datetimes too; missing
                # datetimes are already NaT, so nothing needs filling.
                continue
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                df[col] = df[col].fillna('-')
            else:
                df[col] = df[col].fillna(0)
    else:
        print(f"Tidak ada missing value dalam {name_df}")
        print(missing_sum)
    return df

##### Check Missing Values & Duplicates

In [50]:
# Clean every raw table. cleaning_data() prints a duplicate / missing-value
# report for the table and returns the cleaned frame, which replaces the raw one.
# The second argument is only the label used in that printed report.
df_applications = cleaning_data(df_applications, "Application Data")
df_articles = cleaning_data(df_articles, "Article Data")
df_comments = cleaning_data(df_comments, "Comment Data")
df_donations = cleaning_data(df_donations, "Donation Data")
df_donation_comments = cleaning_data(df_donation_comments, "Donation Comments Data")
df_fundraising_categories = cleaning_data(df_fundraising_categories, "Fundraising Categories Data")
df_fundraisings = cleaning_data(df_fundraisings, "Fundraising Data")

# NOTE: the users table is reported under the label "Customer Data".
df_users = cleaning_data(df_users, "Customer Data")
df_likes_comments = cleaning_data(df_likes_comments, "Like Comments Data")
df_organizations = cleaning_data(df_organizations, "Organization Data")
df_testimoni_volunteers = cleaning_data(df_testimoni_volunteers, "Testimoni Volunteer Data")
df_bookmark_fundraising = cleaning_data(df_bookmark_fundraising, "Bookmark Fundraising Data")
df_bookmark_volunteer = cleaning_data(df_bookmark_volunteer, "Bookmark Volunteer Data")
df_volunteer_vacancies = cleaning_data(df_volunteer_vacancies, "Volunteer Vacancies Data")

Tidak ada data duplikat pada Application Data
Terdapat missing value dalam Application Data
id                0
ig_image_url      0
yt_image_url      0
user_id           0
vacancy_id        0
job               0
reason            0
age             123
dtype: int64
Tidak ada data duplikat pada Article Data
Terdapat missing value dalam Article Data
id           0
admin_id     1
title        0
content      0
image_url    0
dtype: int64
Tidak ada data duplikat pada Comment Data
Terdapat missing value dalam Comment Data
id             0
user_id        0
article_id     0
comment        0
total_likes    1
dtype: int64
Tidak ada data duplikat pada Donation Data
Tidak ada missing value dalam Donation Data
id                0
user_id           0
amount            0
status            0
bukti_tf_url      0
fundraising_id    0
dtype: int64
Tidak ada data duplikat pada Donation Comments Data
Tidak ada missing value dalam Donation Comments Data
id             0
donation_id    0
comment        0
total

### Tabel Fakta

##### 1. Fact Donation Transaction

In [51]:
# Column layout of the donation-transaction fact table; the frame starts empty
# and is filled in by the following cells.
columns = [
    'id',
    'donation_id',
    'fundraising_id',
    'user_id',
    'amount',
    'goal_amount',
    'fundraising_category_id',
    'organization_id',
    'donation_created_at',
]
df_fact_donation = pd.DataFrame(columns=columns)

In [52]:
# Keep only the donations whose status is 'sukses', renumbered from 0 so the
# rows can be copied into the fact table by position.
df_donation_success = (
    df_donations[df_donations['status'] == 'sukses']
    .reset_index(drop=True)
)

In [53]:
# Fill the fact table from the successful donations. `id` is a fresh 1-based
# surrogate key; the remaining columns are copied from df_donation_success.
df_fact_donation = df_fact_donation.assign(
    id=range(1, len(df_donation_success) + 1),
    donation_id=df_donation_success['id'],
    fundraising_id=df_donation_success['fundraising_id'],
    user_id=df_donation_success['user_id'],
    amount=df_donation_success['amount'],
    donation_created_at=df_donation_success['created_at'],
)

In [58]:
# Enrich each fact row with attributes of the fundraising the donation belongs to.
# validate='many_to_one' makes the merge fail loudly if a fundraising id is not
# unique: a duplicated id would add rows to the merge result, and the
# index-aligned column copies below would then attach values to the wrong
# donations without any error.
df_merge_fact_fundraising = pd.merge(
    df_fact_donation,
    df_fundraisings,
    left_on='fundraising_id',
    right_on='id',
    how='left',
    validate='many_to_one',
)
# The `_y` suffix marks the df_fundraisings side; df_fact_donation already has
# (still empty) columns with the same names, which receive `_x`.
df_fact_donation['goal_amount'] = df_merge_fact_fundraising['goal_amount_y']
df_fact_donation['fundraising_category_id'] = df_merge_fact_fundraising['fundraising_category_id_y']
df_fact_donation['organization_id'] = df_merge_fact_fundraising['organization_id_y']
df_fact_donation

Unnamed: 0,id,donation_id,fundraising_id,user_id,amount,goal_amount,fundraising_category_id,organization_id,donation_created_at
0,1,16,10,51,900000,6000000,1,7,2024-06-08 19:33:48.384
1,2,17,6,52,2000000,5000000,1,7,2024-06-08 19:33:48.384
2,3,18,11,53,400000,25000000,1,9,2024-06-08 19:33:48.384
3,4,19,7,54,30000,10000000,1,8,2024-06-08 19:33:48.384
4,5,20,28,55,2400000,5000000,4,11,2024-06-08 19:33:48.384
...,...,...,...,...,...,...,...,...,...
82,83,98,14,65,150000,8000000,2,13,2024-06-08 19:33:48.384
83,84,99,14,66,15000,8000000,2,13,2024-06-08 19:33:48.384
84,85,100,15,64,550000,6000000,2,13,2024-06-08 19:33:48.384
85,86,101,15,85,15000,6000000,2,13,2024-06-08 19:33:48.384


##### 2. Fact Volunteer Applications

In [55]:
# Column layout of the volunteer-applications fact table; the frame starts
# empty and is filled in by the following cells.
columns = [
    'id',
    'application_id',
    'vacancy_id',
    'user_id',
    'organization_id',
    'application_created_at',
]
df_fact_applications = pd.DataFrame(columns=columns)

In [56]:
# Fill the fact table from the cleaned applications.
# Column assignment aligns on the index, and the fact table is indexed 0..n-1.
# cleaning_data() does not renumber rows after dropping duplicates, so align on
# a re-indexed view of the source; otherwise rows after a dropped duplicate
# would be shifted or left empty.
applications_src = df_applications.reset_index(drop=True)
df_fact_applications['id'] = range(1, len(applications_src) + 1)
df_fact_applications['application_id'] = applications_src['id']
df_fact_applications['vacancy_id'] = applications_src['vacancy_id']
df_fact_applications['user_id'] = applications_src['user_id']
df_fact_applications['application_created_at'] = applications_src['created_at']

In [57]:
# Look up the organization of each application through its volunteer vacancy.
# validate='many_to_one' makes the merge fail loudly if a vacancy id is not
# unique: a duplicated id would add rows to the merge result, and the
# index-aligned copy below would then attach organizations to the wrong rows.
df_merge_fact_volunteer = pd.merge(
    df_fact_applications,
    df_volunteer_vacancies,
    left_on='vacancy_id',
    right_on='id',
    how='left',
    validate='many_to_one',
)
# `_y` marks the df_volunteer_vacancies side; the fact table's own (still empty)
# organization_id column receives `_x`.
df_fact_applications['organization_id'] = df_merge_fact_volunteer['organization_id_y']
df_fact_applications

Unnamed: 0,id,application_id,vacancy_id,user_id,organization_id,application_created_at
0,1,1,3,54,11,2024-06-07 13:08:19.244
1,2,4,1,54,11,2024-06-08 13:08:19.244
2,3,5,1,55,11,NaT
3,4,6,1,56,11,NaT
4,5,7,1,58,11,NaT
...,...,...,...,...,...,...
121,122,159,31,55,11,NaT
122,123,160,31,56,11,NaT
123,124,161,31,58,11,NaT
124,125,162,31,59,11,NaT


##### 3. Fact Volunteer Testimoni

In [59]:
# Column layout of the volunteer-testimonial fact table; the frame starts
# empty and is filled in by the following cell.
columns = [
    'id',
    'user_id',
    'vacancy_id',
    'testimoni_volunteer_id',
    'rating',
]
df_fact_volunteer_testimoni = pd.DataFrame(columns=columns)

In [61]:
# Fill the fact table from the cleaned volunteer testimonials.
# The surrogate key must be sized from the testimonials themselves. Sizing it
# from df_applications gave the table one row per application, with empty
# testimonial columns wherever the two tables differ in length.
# The source is re-indexed to 0..n-1 because column assignment aligns on the index.
testimoni_src = df_testimoni_volunteers.reset_index(drop=True)
df_fact_volunteer_testimoni['id'] = range(1, len(testimoni_src) + 1)
df_fact_volunteer_testimoni['user_id'] = testimoni_src['user_id']
df_fact_volunteer_testimoni['vacancy_id'] = testimoni_src['vacancy_id']
df_fact_volunteer_testimoni['testimoni_volunteer_id'] = testimoni_src['id']
df_fact_volunteer_testimoni['rating'] = testimoni_src['rating']
# NOTE(review): looks like a scratch dump for manual inspection; confirm it is
# still needed, since the real export lives in the "Load To CSV" cell.
df_fact_volunteer_testimoni.to_csv('cek.csv')

##### 4. Fact Article Popular

In [None]:
# Column layout of the popular-article fact table (one row per comment); the
# frame starts empty and is filled in by the following cell.
columns = [
    'id',
    'article_id',
    'comment_id',
    'user_id',
    'comment_created_at',
]
df_fact_article_popular = pd.DataFrame(columns=columns)

In [None]:
# Fill the fact table from the cleaned comments.
# Column assignment aligns on the index, and the fact table is indexed 0..n-1.
# cleaning_data() does not renumber rows after dropping duplicates, so align on
# a re-indexed view of the source.
comments_src = df_comments.reset_index(drop=True)
df_fact_article_popular['id'] = range(1, len(comments_src) + 1)
df_fact_article_popular['article_id'] = comments_src['article_id']
df_fact_article_popular['comment_id'] = comments_src['id']
df_fact_article_popular['user_id'] = comments_src['user_id']
df_fact_article_popular['comment_created_at'] = comments_src['created_at']
df_fact_article_popular

##### 5. Fact Bookmark Fundraising

In [None]:
# Fundraising-bookmark fact table: drop the audit columns, keep the source key
# as `bookmark_id`, and put a fresh 1-based surrogate `id` in the first column.
df_fact_bookmark_fundraising = (
    df_bookmark_fundraising
    .drop(columns=['deleted_at', 'updated_at'])
    .rename(columns={'id': 'bookmark_id'})
)
df_fact_bookmark_fundraising.insert(
    0, 'id', range(1, len(df_fact_bookmark_fundraising) + 1)
)
df_fact_bookmark_fundraising

##### 6. Fact Bookmark Volunteer Vacancies

In [None]:
# Volunteer-vacancy-bookmark fact table: drop the audit columns, keep the source
# key as `bookmark_id`, and put a fresh 1-based surrogate `id` in the first column.
df_fact_bookmark_volunteer_vacancies = (
    df_bookmark_volunteer
    .drop(columns=['deleted_at', 'updated_at'])
    .rename(columns={'id': 'bookmark_id'})
)
df_fact_bookmark_volunteer_vacancies.insert(
    0, 'id', range(1, len(df_fact_bookmark_volunteer_vacancies) + 1)
)
df_fact_bookmark_volunteer_vacancies

### Tabel Dimensi

In [None]:
# Dimension tables: each is its cleaned source table minus `updated_at` and
# minus the foreign-key columns listed in its drop() call.
# The category dimension instead keeps just `id` and `name`.
dim_fundraisings = df_fundraisings.drop(
    columns=['fundraising_category_id', 'organization_id', 'updated_at']
)
dim_fundraising_categories = df_fundraising_categories[['id', 'name']]
dim_donation = df_donations.drop(columns=['fundraising_id', 'user_id', 'updated_at'])
dim_organization = df_organizations.drop(columns=['updated_at'])
dim_user = df_users.drop(columns=['updated_at'])
dim_volunteer_applictaion = df_applications.drop(
    columns=['user_id', 'vacancy_id', 'updated_at']
)
dim_volunteer_vacancies = df_volunteer_vacancies.drop(
    columns=['organization_id', 'updated_at']
)
dim_testimoni_volunteer = df_testimoni_volunteers.drop(
    columns=['user_id', 'vacancy_id', 'updated_at']
)
dim_article = df_articles.drop(columns=['updated_at'])
dim_comment = df_comments.drop(columns=['user_id', 'article_id', 'updated_at'])
dim_bookmark_fundraising = df_bookmark_fundraising.drop(
    columns=['fundraising_id', 'user_id', 'updated_at']
)
dim_bookmark_volunter_vacancies = df_bookmark_volunteer.drop(
    columns=['volunteer_vacancies_id', 'user_id', 'updated_at']
)

# Load

##### 1. Load To CSV

In [None]:
# to_csv() does not create missing folders, so make sure both targets exist;
# without this the cell fails on a fresh checkout.
os.makedirs("Data_Loaded/Fact", exist_ok=True)
os.makedirs("Data_Loaded/Dimensi", exist_ok=True)

# Fact tables
df_fact_donation.to_csv("Data_Loaded/Fact/fact_donation_transaction.csv", index=False)
df_fact_applications.to_csv("Data_Loaded/Fact/fact_volunteer_applications.csv", index=False)
df_fact_volunteer_testimoni.to_csv("Data_Loaded/Fact/fact_volunteer_testimoni.csv", index=False)
df_fact_article_popular.to_csv("Data_Loaded/Fact/fact_article_popular.csv", index=False)
df_fact_bookmark_fundraising.to_csv("Data_Loaded/Fact/fact_bookmark_fundraising.csv", index=False)
df_fact_bookmark_volunteer_vacancies.to_csv("Data_Loaded/Fact/fact_bookmark_volunteer_vacancies.csv", index=False)

# Dimension tables
dim_article.to_csv("Data_Loaded/Dimensi/dim_article.csv", index=False)
dim_bookmark_fundraising.to_csv("Data_Loaded/Dimensi/dim_bookmark_fundraising.csv", index=False)
dim_bookmark_volunter_vacancies.to_csv("Data_Loaded/Dimensi/dim_bookmark_volunteer_vacancies.csv", index=False)
dim_comment.to_csv("Data_Loaded/Dimensi/dim_comment.csv", index=False)
dim_donation.to_csv("Data_Loaded/Dimensi/dim_donation.csv", index=False)
dim_fundraising_categories.to_csv("Data_Loaded/Dimensi/dim_fundraising_categories.csv", index=False)
dim_fundraisings.to_csv("Data_Loaded/Dimensi/dim_fundraising.csv", index=False)
dim_user.to_csv("Data_Loaded/Dimensi/dim_user.csv", index=False)
dim_testimoni_volunteer.to_csv("Data_Loaded/Dimensi/dim_testimoni_volunteer.csv", index=False)
dim_volunteer_vacancies.to_csv("Data_Loaded/Dimensi/dim_volunteer_vacancies.csv", index=False)
dim_volunteer_applictaion.to_csv("Data_Loaded/Dimensi/dim_volunteer_application.csv", index=False)
# dim_organization is intentionally not written as CSV here; it is exported as
# Parquet in the next cell.

In [None]:
# The organization dimension is exported as Parquet instead of CSV.
# to_parquet() does not create missing folders, so make sure the target exists.
os.makedirs("Data_Loaded/Dimensi", exist_ok=True)
dim_organization.to_parquet("Data_Loaded/Dimensi/dim_organization.parquet", index=False)

In [None]:
# Show the organization dimension as this cell's output.
dim_organization

##### 2. Load To DB Local

In [None]:
def load_db_local(df, table_name, host='localhost', user='root', password='',
                  port=3307, database='peduli_pintar'):
    """Write a DataFrame to a table in a local MySQL database.

    The database is created first if it does not exist. An existing table
    with the same name is dropped and replaced (``if_exists='replace'``);
    the DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to load.
    table_name : str
        Name of the target table.
    host, user, password, port : optional
        MySQL connection settings. The defaults are the values this notebook
        has always used (root with an empty password on localhost:3307).
    database : str, optional
        Target schema; it must come from trusted code because it is embedded
        in the ``CREATE DATABASE`` statement.
    """
    from urllib.parse import quote_plus

    # Step 1: make sure the schema exists, using a server-level connection.
    connection = pymysql.connect(host=host, user=user, password=password, port=port)
    try:
        with connection.cursor() as cursor:
            # Identifiers cannot be bound as parameters; quote with backticks
            # and double any embedded backtick.
            safe_database = database.replace('`', '``')
            cursor.execute(f'CREATE DATABASE IF NOT EXISTS `{safe_database}`')
    finally:
        # Close even if the statement fails, so failed runs do not leak sessions.
        connection.close()

    # Step 2: load the frame through SQLAlchemy. Credentials are URL-quoted so
    # special characters in them cannot break the connection URL.
    engine = create_engine(
        f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{database}'
    )
    try:
        df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)
    finally:
        # Each call builds its own engine; dispose of it to release its pooled
        # connections.
        engine.dispose()

In [None]:
# Fact tables
load_db_local(df_fact_applications, 'fact_applications')
load_db_local(df_fact_article_popular, 'fact_article_popular')
load_db_local(df_fact_bookmark_fundraising, 'fact_bookmark_fundraising')
load_db_local(df_fact_bookmark_volunteer_vacancies, 'fact_bookmark_volunteer_vacancies')
load_db_local(df_fact_donation, 'fact_donation')
load_db_local(df_fact_volunteer_testimoni, 'fact_volunteer_testimoni')

# Dimension tables. Three table names were corrected to follow the `dim_`
# convention and to match the CSV export names:
#   'df_organization'                 -> 'dim_organization'
#   'dim_volunteer_applictaion'       -> 'dim_volunteer_application'
#   'dim_bookmark_volunter_vacancies' -> 'dim_bookmark_volunteer_vacancies'
# NOTE(review): tables created earlier under the old names are not removed by
# this cell; drop them manually if they exist.
load_db_local(dim_fundraisings, 'dim_fundraisings')
load_db_local(dim_fundraising_categories, 'dim_fundraising_categories')
load_db_local(dim_donation, 'dim_donation')
load_db_local(dim_organization, 'dim_organization')
load_db_local(dim_user, 'dim_user')
load_db_local(dim_volunteer_applictaion, 'dim_volunteer_application')
load_db_local(dim_volunteer_vacancies, 'dim_volunteer_vacancies')
load_db_local(dim_testimoni_volunteer, 'dim_testimoni_volunteer')
load_db_local(dim_article, 'dim_article')
load_db_local(dim_comment, 'dim_comment')
load_db_local(dim_bookmark_fundraising, 'dim_bookmark_fundraising')
load_db_local(dim_bookmark_volunter_vacancies, 'dim_bookmark_volunteer_vacancies')

##### 3. Load To BigQuery