In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from scipy import stats

def load_and_clean_data(file_path):
    """
    Load and perform initial data cleaning
    """
    df = pd.read_csv(file_path)
    
    # Convert categorical variables to categorical type
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 
                       'loan', 'contact', 'month', 'poutcome', 'y']
    for col in categorical_cols:
        df[col] = df[col].astype('category')
    
    return df

def perform_descriptive_analysis(df):
    """
    Perform descriptive statistics and create summary visualizations
    """
    # Numeric variables summary
    numeric_summary = df.describe()
    numeric_summary.to_excel('numeric_summary.xlsx')
    
    # Categorical variables summary
    categorical_summary = {col: df[col].value_counts(normalize=True) 
                         for col in df.select_dtypes(include=['category']).columns}
    pd.DataFrame(categorical_summary).to_excel('categorical_summary.xlsx')

    
    # Age distribution analysis
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='age', bins=30)
    plt.title('Age Distribution of Customers')
    plt.savefig('age_distribution.png')
    plt.close()
    
    # Campaign success rate by education level
    plt.figure(figsize=(10, 6))
    success_by_education = df.groupby('education')['y'].apply(
        lambda x: (x == 'yes').mean()
    ).sort_values(ascending=False)
    success_by_education.to_excel('success_by_education.xlsx')
    success_by_education.plot(kind='bar')
    plt.title('Campaign Success Rate by Education Level')
    plt.tight_layout()
    plt.savefig('success_by_education.png')
    plt.close()
    
    return numeric_summary, categorical_summary

def perform_segmentation_analysis(df):
    """
    Perform customer segmentation analysis
    """
    # Prepare data for clustering
    # We'll use age, balance, and campaign as features
    features_for_clustering = ['age', 'balance', 'campaign']
    X = df[features_for_clustering].copy()
    
    # Standardize the features
    X_scaled = (X - X.mean()) / X.std()
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    df['cluster'] = kmeans.fit_predict(X_scaled)
    
    # Analyze clusters
    cluster_summary = df.groupby('cluster').agg({
        'age': 'mean',
        'balance': 'mean',
        'campaign': 'mean',
        'y': lambda x: (x == 'yes').mean()
    }).round(2)
    
    # Visualize clusters
    plt.figure(figsize=(10, 6))
    for cluster in range(4):
        cluster_data = df[df['cluster'] == cluster]
        plt.scatter(cluster_data['age'], cluster_data['balance'], 
                   label=f'Cluster {cluster}', alpha=0.5)
        cluster_data.to_excel('cluster_data.xlsx')
    plt.xlabel('Age')
    plt.ylabel('Balance')
    plt.title('Customer Segments: Age vs Balance')
    plt.legend()
    plt.savefig('customer_segments.png')
    plt.close()
    
    return cluster_summary

def analyze_temporal_patterns(df):
    """
    Analyze temporal patterns in campaign success
    """
    # Success rate by month
    monthly_success = df.groupby('month')['y'].apply(
        lambda x: (x == 'yes').mean()
    ).reindex(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])
    monthly_success.to_excel('monthly_success.xlsx')
    
    # Success rate by day of month
    daily_success = df.groupby('day')['y'].apply(
        lambda x: (x == 'yes').mean(),
    daily_success.to_excel('daily_success.xlsx')
    )
    
    # Visualize monthly patterns
    plt.figure(figsize=(12, 6))
    monthly_success.plot(kind='bar')
    plt.title('Campaign Success Rate by Month')
    plt.tight_layout()
    plt.savefig('monthly_success.png')
    plt.close()
    
    # Visualize daily patterns
    plt.figure(figsize=(12, 6))
    daily_success.plot(kind='line')
    plt.title('Campaign Success Rate by Day of Month')
    plt.tight_layout()
    plt.savefig('daily_success.png')
    plt.close()
    
    return monthly_success, daily_success

def main():
    # Load the data
    df = load_and_clean_data(r"C:\Users\Admin\Desktop\bank analysis\bank analysis\bank1.csv")
    
    # 1. Descriptive Analysis
    numeric_summary, categorical_summary = perform_descriptive_analysis(df)
    print("\n=== Numeric Summary ===")
    print(numeric_summary)
    print("\n=== Categorical Summary ===")
    for category, summary in categorical_summary.items():
        print(f"\n{category}:")
        print(summary)
    
    # 2. Segmentation Analysis
    cluster_summary = perform_segmentation_analysis(df)
    print("\n=== Cluster Summary ===")
    print(cluster_summary)
    cluster_summary.to_excel('cluster_summary.xlsx')
    
    # 3. Temporal Analysis
    monthly_success, daily_success = analyze_temporal_patterns(df)
    print("\n=== Monthly Success Rates ===")
    print(monthly_success)
    print("\n=== Daily Success Rates ===")
    print(daily_success)

if __name__ == "__main__":
    main()

  success_by_education = df.groupby('education')['y'].apply(



=== Numeric Summary ===
               age       balance          day     duration     campaign  \
count  4521.000000   4521.000000  4521.000000  4521.000000  4521.000000   
mean     41.170095   1422.657819    15.915284   263.961292     2.793630   
std      10.576211   3009.638142     8.247667   259.856633     3.109807   
min      19.000000  -3313.000000     1.000000     4.000000     1.000000   
25%      33.000000     69.000000     9.000000   104.000000     1.000000   
50%      39.000000    444.000000    16.000000   185.000000     2.000000   
75%      49.000000   1480.000000    21.000000   329.000000     3.000000   
max      87.000000  71188.000000    31.000000  3025.000000    50.000000   

             pdays     previous  
count  4521.000000  4521.000000  
mean     39.766645     0.542579  
std     100.121124     1.693562  
min      -1.000000     0.000000  
25%      -1.000000     0.000000  
50%      -1.000000     0.000000  
75%      -1.000000     0.000000  
max     871.000000    25.00

  monthly_success = df.groupby('month')['y'].apply(


UnboundLocalError: cannot access local variable 'daily_success' where it is not associated with a value