In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

def load_data():
    # Create synthetic dataset
    np.random.seed(42)
    n = 200
    df = pd.DataFrame({
        'CustomerID': range(1, n+1),
        'Gender': np.random.choice(['Male', 'Female'], size=n),
        'Age': np.random.randint(18, 70, size=n),
        'JoinDate': pd.to_datetime(np.random.choice(pd.date_range(start='2022-01-01', end='2024-12-31'), size=n)),
        'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], size=n)
    })
    return df

def clean_data(df):
    # Drop duplicates and handle missing values (if any)
    df = df.drop_duplicates()
    df = df.dropna()
    return df

def visualize_data(df):
    sns.set(style="whitegrid")
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Bar Plot
    sns.countplot(data=df, x='Contract', palette='pastel', ax=axes[0])
    axes[0].set_title('Customers by Contract Type')
    axes[0].tick_params(axis='x', rotation=45)

    # Line Chart
    monthly_trend = df.groupby(df['JoinDate'].dt.to_period('M')).size()
    monthly_trend.index = monthly_trend.index.to_timestamp()
    monthly_trend.plot(kind='line', marker='o', ax=axes[1])
    axes[1].set_title('Customer Signups Over Time')
    axes[1].tick_params(axis='x', rotation=45)

    # Histogram
    sns.histplot(df['Age'], bins=15, kde=True, color='skyblue', ax=axes[2])
    axes[2].set_title('Age Distribution of Customers')

    plt.tight_layout()
    plt.savefig("visualizations.png")
    plt.close()

def summarize_data(df):
    summary = df.describe(include='all')
    print("Data Summary:\n", summary)
    return summary

def export_data(df, summary):
    os.makedirs("output", exist_ok=True)
    df.to_csv("Datasets/cleaned_data.csv", index=False)
    summary.to_csv("Datasets/data_summary.csv")

def main():
    df = load_data()
    df_clean = clean_data(df)
    visualize_data(df_clean)
    summary = summarize_data(df_clean)
    export_data(df_clean, summary)

if __name__ == "__main__":
    main()

  summary = df.describe(include='all')


Data Summary:
         CustomerID Gender         Age             JoinDate        Contract
count   200.000000    200  200.000000                  200             200
unique         NaN      2         NaN                  188               3
top            NaN   Male         NaN  2023-02-01 00:00:00  Month-to-month
freq           NaN    100         NaN                    2              84
first          NaN    NaN         NaN  2022-01-05 00:00:00             NaN
last           NaN    NaN         NaN  2024-12-28 00:00:00             NaN
mean    100.500000    NaN   44.405000                  NaN             NaN
std      57.879185    NaN   15.353816                  NaN             NaN
min       1.000000    NaN   18.000000                  NaN             NaN
25%      50.750000    NaN   32.750000                  NaN             NaN
50%     100.500000    NaN   46.000000                  NaN             NaN
75%     150.250000    NaN   56.000000                  NaN             NaN
max     20

OSError: Cannot save file into a non-existent directory: 'Datasets'