In [None]:
from utils.extraction.extraction import *
from utils.transformation.transformation import *
from utils.load.load import *
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def extraction():
    """
    Collects the raw contact data by delegating to ``contact_collector``.

    Returns:
        The value returned by ``contact_collector()``, passed through unchanged.
    """
    return contact_collector()

In [None]:
def duplicates_management(df):
    """
    Manages duplicates in a DataFrame based on email values.

    Rows that share the same non-missing email are collapsed into a single row:

    * the row with the greatest 'lastmodifieddate' is kept;
    * any missing field of that row is filled from the next older rows of the
      same email group;
    * its 'industry' becomes ';' followed by the ';'-joined distinct, non-missing
      industries of the group (left untouched when the group has none);
    * every other row of the group is removed.

    Rows whose email is missing are never treated as duplicates. The input
    DataFrame is not modified.

    Args:
        df (pandas.DataFrame): Input DataFrame. Must contain the columns
            'email', 'lastmodifieddate' and 'industry'. Index labels are
            assumed to be unique, since rows are kept/dropped by label.

    Returns:
        temp_df (pandas.DataFrame): DataFrame with duplicates managed based on email values.
    """
    temp_df = df.copy()

    # Rows whose (non-missing) email appears more than once.
    has_email = temp_df['email'].notna()
    is_duplicate = temp_df['email'].duplicated(keep=False) & has_email
    duplicated = temp_df[is_duplicate]

    labels_to_drop = []
    for _, group in duplicated.groupby('email'):
        # Newest record first; a stable sort keeps ties in their input order.
        ordered_group = group.sort_values(
            'lastmodifieddate', ascending=False, kind='stable'
        )

        # Back-fill so the newest row inherits values it lacks from older rows.
        # (`fillna(method='bfill')` is deprecated; `.bfill()` is the supported form.)
        ordered_group = ordered_group.bfill()

        keeper = ordered_group.index[0]

        # Missing industries are skipped so they do not end up as 'nan'/'None'
        # inside the merged string.
        industries = ordered_group['industry'].dropna().unique()
        if len(industries) > 0:
            ordered_group.loc[keeper, 'industry'] = ';' + ';'.join(
                str(industry) for industry in industries
            )

        # Write the merged record back over the kept row.
        temp_df.loc[keeper] = ordered_group.iloc[0]

        # Remember the older rows; they are removed in one pass after the loop.
        labels_to_drop.extend(ordered_group.index[1:])

    return temp_df.drop(index=labels_to_drop)

In [None]:
def transformation(contacts_df):
    """
    Cleans the raw contacts DataFrame and collapses duplicated emails.

    Steps, in order:
        1. Drop the 'createdate' column.
        2. Replace 'country' and add 'city' from ``country_recognition`` applied
           to each 'country' value (presumably it returns a (country, city)
           pair -- confirm against the helper).
        3. Run ``found_emails`` over 'raw_email' and rename the column to 'email'.
        4. Rewrite 'phone' with ``fix_phone_numbers(phone, country)``, using the
           already-normalised 'country' value.
        5. Merge duplicated emails with ``duplicates_management``.

    Args:
        contacts_df (pandas.DataFrame): Raw contacts with at least the columns
            'createdate', 'country', 'raw_email' and 'phone'.

    Returns:
        pandas.DataFrame: The result of ``duplicates_management`` on the cleaned data.
    """
    cleaned = contacts_df.drop(columns='createdate')

    # Expand each helper result into separate columns, then assign them together.
    location_parts = cleaned['country'].apply(country_recognition).apply(pd.Series)
    cleaned[['country', 'city']] = location_parts

    cleaned['raw_email'] = cleaned['raw_email'].apply(found_emails)
    cleaned = cleaned.rename(columns={'raw_email': 'email'})

    cleaned['phone'] = cleaned.apply(
        lambda row: fix_phone_numbers(row['phone'], row['country']),
        axis=1,
    )

    # Duplicates management
    return duplicates_management(cleaned)

In [None]:
def load(df):
    """
    Saves every row of the DataFrame through the load helpers.

    Each row is converted to a dict, passed through ``load_record_management``,
    and the result is handed to ``saving_contact``.

    Args:
        df (pandas.DataFrame): Rows to save.
    """
    for contact in df.to_dict(orient='records'):
        prepared_contact = load_record_management(contact)
        saving_contact(prepared_contact)

In [None]:
#GRAPHICS

def plot_addresses_by_city(df):
    """
    Shows a bar chart with the number of rows per value of the 'city' column.

    Args:
        df (pandas.DataFrame): Data containing a 'city' column.
    """
    rows_per_city = df['city'].value_counts()

    # Draw on an explicit Axes rather than the implicit pyplot "current" one.
    _, ax = plt.subplots(figsize=(10, 6))
    rows_per_city.plot(kind='bar', ax=ax)
    ax.set_xlabel('City')
    ax.set_ylabel('Count')
    ax.set_title('Addresses by City')
    plt.show()

def plot_creation_date_trends(df):
    """
    Shows a line chart with the number of rows per calendar month.

    Rows are bucketed by the month of their 'lastmodifieddate' value. The input
    DataFrame is left untouched: dates are parsed into a local Series instead of
    overwriting the caller's column.

    Args:
        df (pandas.DataFrame): Data containing a 'lastmodifieddate' column that
            ``pandas.to_datetime`` can parse.
    """
    # NOTE(review): the chart title says "Creation Date" but the data plotted is
    # 'lastmodifieddate' -- confirm which one is intended.
    modified_dates = pd.to_datetime(df['lastmodifieddate'])
    monthly_counts = df.groupby(modified_dates.dt.to_period('M')).size()

    plt.figure(figsize=(10, 6))
    monthly_counts.plot(kind='line', marker='o')
    plt.xlabel('Month')
    plt.ylabel('Count')
    plt.title('Creation Date Trends')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
def run_pipeline():
    """
    Runs the three pipeline stages in order: extraction, transformation, load.
    """
    # Extraction: wrap the collected raw data in a DataFrame.
    raw_contacts_df = pd.DataFrame(extraction())

    # Transformation, then load of the transformed rows.
    load(transformation(raw_contacts_df))

In [None]:
def show_graphics(csv_file='csv_results/transformed.csv'):
    """
    Reads a CSV file and renders both charts from its contents.

    Args:
        csv_file (str): Path of the CSV file to read. Defaults to
            'csv_results/transformed.csv', the path previously hard-coded here.
    """
    # NOTE(review): nothing visible in this notebook writes this file;
    # presumably the load helpers produce it -- confirm.
    df = pd.read_csv(csv_file)
    plot_addresses_by_city(df)
    plot_creation_date_trends(df)

# Graph 1

Addresses by City: explores how addresses are distributed across cities. Each city is represented by a bar, and the height of the bar is the number of addresses in that city.

# Graph 2

Creation Date Trends: analyzes the created date attribute to identify any trends or patterns over time. The x-axis represents the time period (one point per month) and the y-axis represents the count of records in that period. Note: the plotting code groups records by the `lastmodifieddate` column.

In [None]:
# Entry point: run the full pipeline first, then draw the charts.
# NOTE(review): show_graphics() reads a CSV from disk; presumably the pipeline's
# load step produces it -- confirm.
if __name__ == "__main__":
    
    run_pipeline()
    show_graphics()