# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_ingestion.csv_data_loader import CustomerSupportDataLoader
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category
# is shown.
# NOTE(review): this also hides warnings that may point at real problems
# (e.g. deprecations in pandas/seaborn) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Plot styling: a matplotlib style sheet plus seaborn's "husl" colour palette.
# NOTE(review): the 'seaborn-v0_8' style name presumably only exists in newer
# matplotlib releases — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the ticket CSV and pass it through the loader's stages in order.
# Each stage's result keeps its own name because the report below reads both
# ends of the pipeline: the raw frame (`df`) for the column count and the
# missing-value table, the feature frame (`df_features`) for plots and
# correlations.
loader = CustomerSupportDataLoader()
df = loader.load_and_validate_csv('customer_support_tickets.csv')
df_processed = loader.clean_and_preprocess(df)
df_features = loader.generate_features(df_processed)

# Summary statistics computed by the loader from the feature frame; consumed
# below as a nested mapping, e.g. stats['basic_info']['total_tickets'].
stats = loader.get_data_statistics(df_features)

print("=== CUSTOMER SUPPORT TICKETS DATA ANALYSIS ===\n")

# Headline figures: ticket volume, covered date span, raw column count.
print("📊 BASIC INFORMATION:")
basic_info = stats['basic_info']
print(f"Total Tickets: {basic_info['total_tickets']:,}")
date_span = basic_info['date_range']
print(f"Date Range: {date_span['start']} to {date_span['end']}")
print(f"Columns: {len(df.columns)}")
print()

# Per-category breakdowns. Both sections share one layout: a heading, one
# "label: count (share%)" line per category, then a blank separator line.
print("🎫 TICKET DISTRIBUTION:")
breakdown_sections = (
    ("By Type:", 'by_type'),
    ("By Priority:", 'by_priority'),
)
for section_heading, section_key in breakdown_sections:
    print(section_heading)
    category_counts = stats['ticket_distribution'][section_key]
    for label, n_tickets in category_counts.items():
        # Share is relative to the overall ticket total, not the section sum.
        share = (n_tickets / stats['basic_info']['total_tickets']) * 100
        print(f"  {label}: {n_tickets:,} ({share:.1f}%)")
    print()

# Dashboard figure: a 2x3 grid with one panel per ticket/customer attribute.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(type_ax, priority_ax, channel_ax), (age_ax, response_ax, rating_ax) = axes

# --- Top row: categorical breakdowns ---------------------------------------

# Panel 1: ticket counts per type as bars, x tick labels rotated 45 degrees.
type_counts = df_features['ticket_type'].value_counts()
type_counts.plot(kind='bar', ax=type_ax)
type_ax.set_title('Tickets by Type')
type_ax.set_xlabel('Ticket Type')
type_ax.set_ylabel('Count')
type_ax.tick_params(axis='x', rotation=45)

# Panel 2: priority shares as a pie, each wedge annotated with its percentage.
priority_counts = df_features['ticket_priority'].value_counts()
priority_counts.plot(kind='pie', ax=priority_ax, autopct='%1.1f%%')
priority_ax.set_title('Tickets by Priority')

# Panel 3: ticket counts per channel as bars.
channel_counts = df_features['ticket_channel'].value_counts()
channel_counts.plot(kind='bar', ax=channel_ax)
channel_ax.set_title('Tickets by Channel')
channel_ax.set_xlabel('Channel')
channel_ax.set_ylabel('Count')

# --- Bottom row: histograms -------------------------------------------------

# (axes, column, bin count, title, x-axis label) for panels 4-6; all three
# share the 'Frequency' y-axis label.
histogram_panels = (
    (age_ax, 'customer_age', 20, 'Customer Age Distribution', 'Age'),
    (response_ax, 'first_response_time', 30,
     'First Response Time Distribution', 'Hours'),
    (rating_ax, 'customer_satisfaction_rating', 20,
     'Customer Satisfaction Distribution', 'Rating'),
)
for panel_ax, column, n_bins, panel_title, x_label in histogram_panels:
    df_features[column].hist(bins=n_bins, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Text Analysis: average sizes of the ticket text fields, taken from the
# loader's precomputed statistics.
print("📝 TEXT ANALYSIS:")
text_stats = stats['text_statistics']
print(f"Average Subject Length: {text_stats['avg_subject_length']:.1f} characters")
print(f"Average Description Length: {text_stats['avg_description_length']:.1f} characters")
print(f"Average Word Count: {text_stats['avg_word_count']:.1f} words")
print()

# Performance Metrics: average response/resolution times and satisfaction,
# also taken from the loader's precomputed statistics.
print("⏱️ PERFORMANCE METRICS:")
performance = stats['performance_metrics']
print(f"Average First Response Time: {performance['avg_response_time']:.2f} hours")
print(f"Average Resolution Time: {performance['avg_resolution_time']:.2f} hours")
print(f"Average Satisfaction Rating: {performance['avg_satisfaction']:.2f}/5.0")
print()

# Missing Values Analysis: per-column null counts of the raw frame (`df`),
# both as absolute numbers and as a percentage of all rows.
print("❌ MISSING VALUES:")
missing_data = df.isna().sum()
missing_percentage = missing_data.div(len(df)).mul(100)

# One row per column, worst offenders first.
missing_summary = (
    missing_data.to_frame('Missing Count')
    .assign(Percentage=missing_percentage)
    .sort_values('Missing Count', ascending=False)
)

# Only columns that actually have gaps are shown.
has_missing = missing_summary['Missing Count'].gt(0)
print(missing_summary.loc[has_missing])

# Correlation Analysis for Numeric Columns
# Columns of the feature frame that enter the pairwise correlation study.
numeric_columns = [
    'customer_age',
    'first_response_time',
    'time_to_resolution',
    'customer_satisfaction_rating',
    'priority_numeric',
]

# Pairwise correlations using pandas' default method.
numeric_features = df_features[numeric_columns]
correlation_matrix = numeric_features.corr()

# Annotated heatmap on its own figure; the diverging colormap is centred on 0
# so positive and negative correlations get opposite colours.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Numeric Features')
heatmap_fig.tight_layout()
plt.show()

print("🔗 KEY CORRELATIONS:")

# Minimum absolute correlation for a pair to be reported (moderate to strong).
CORRELATION_THRESHOLD = 0.3

# Visit every unordered feature pair exactly once by walking the strictly
# upper triangle of the matrix (k=1 skips the diagonal of self-correlations).
# np.triu_indices yields the pairs in row-major order.
feature_names = correlation_matrix.columns
corr_values = correlation_matrix.to_numpy()
corr_pairs = [
    (feature_names[i], feature_names[j], corr_values[i, j])
    for i, j in zip(*np.triu_indices(len(feature_names), k=1))
    # NaN correlations compare False here and are therefore left out.
    if abs(corr_values[i, j]) > CORRELATION_THRESHOLD
]

# Say so explicitly when nothing passes the threshold; a bare heading with no
# lines under it reads like the analysis failed.
if not corr_pairs:
    print(f"  (no feature pairs with |correlation| > {CORRELATION_THRESHOLD})")

# Strongest relationships first, regardless of sign.
for var1, var2, corr in sorted(corr_pairs, key=lambda x: abs(x[2]), reverse=True):
    print(f"  {var1} ↔ {var2}: {corr:.3f}")
