# Telecom User Overview Analysis

This notebook contains a comprehensive analysis of telecom user data, focusing on:
1. Handset Analysis
2. User Application Behavior
3. Exploratory Data Analysis
4. Statistical Analysis
5. Dimensionality Reduction (PCA)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Set plotting style
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and removed
# in 3.8 (it was renamed to 'seaborn-v0_8'). Prefer the new name when this
# matplotlib provides it and fall back to the old one on older versions.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette('husl')

# Display settings: no limit on displayed columns, at most 100 displayed rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Data Loading and Initial Exploration

In [None]:
# Load the datasets
# Note: Update these paths with your actual data file paths
# handset_data = pd.read_csv('path_to_handset_data.csv')
# xdr_data = pd.read_csv('path_to_xdr_data.csv')

## 2. Handset Analysis

### 2.1 Top 10 Handsets

In [None]:
def analyze_top_handsets(df, n=10):
    """Plot and return the n most frequent handset types.

    Counts the values of ``df['handset_type']``, keeps the first ``n``
    entries of the resulting frequency table, draws them as a bar chart
    (counts on the x-axis, handset types on the y-axis) and returns the
    counts as a Series indexed by handset type.
    """
    handset_counts = df['handset_type'].value_counts()
    most_common = handset_counts.head(n)

    # Draw the chart on a fresh figure so earlier plots are not reused
    plt.figure(figsize=(12, 6))
    sns.barplot(x=most_common.values, y=most_common.index)
    plt.title(f'Top {n} Handsets')
    plt.xlabel('Count')
    plt.ylabel('Handset Type')
    plt.show()

    return most_common

### 2.2 Top 3 Manufacturers

In [None]:
def analyze_manufacturers(df, n=3):
    """Plot and return the n most frequent manufacturers.

    Counts the values of ``df['manufacturer']``, keeps the first ``n``
    entries of the resulting frequency table, draws them as a bar chart
    (counts on the x-axis, manufacturers on the y-axis) and returns the
    counts as a Series indexed by manufacturer.
    """
    manufacturer_counts = df['manufacturer'].value_counts()
    leaders = manufacturer_counts.head(n)

    # Draw the chart on a fresh figure so earlier plots are not reused
    plt.figure(figsize=(10, 6))
    sns.barplot(x=leaders.values, y=leaders.index)
    plt.title(f'Top {n} Manufacturers')
    plt.xlabel('Count')
    plt.ylabel('Manufacturer')
    plt.show()

    return leaders

## 3. User Application Behavior Analysis

In [None]:
def aggregate_user_behavior(df):
    """Summarise session records into one row per user.

    Groups ``df`` by ``user_id`` and computes, per user, the count of
    ``session_id`` values and the sums of ``duration``, ``download_data``
    and ``upload_data``. A ``total_data`` column (download + upload) is
    appended, and ``user_id`` is returned as a regular column.
    """
    # How each input column is reduced within a user group
    reducers = {
        'session_id': 'count',
        'duration': 'sum',
        'download_data': 'sum',
        'upload_data': 'sum',
    }
    per_user = df.groupby('user_id').agg(reducers).reset_index()

    # Combined traffic volume in both directions
    return per_user.assign(
        total_data=per_user['download_data'] + per_user['upload_data']
    )

## 4. Exploratory Data Analysis

In [None]:
def perform_eda(df):
    """Print a quick exploratory summary of df.

    Three sections are written to standard output, each preceded by a
    heading: ``df.describe()``, the per-column null counts, and the
    column dtypes. Nothing is returned.
    """
    # Each section is computed lazily, right after its heading is printed
    sections = (
        ("\nBasic Statistics:", lambda: df.describe()),
        ("\nMissing Values:", lambda: df.isnull().sum()),
        ("\nData Types:", lambda: df.dtypes),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

## 5. Statistical Analysis

In [None]:
def correlation_analysis(df, app_columns=None):
    """Compute and plot the correlation matrix of application data columns.

    Args:
        df: DataFrame holding one column per application.
        app_columns: Optional iterable of column names to correlate. When
            omitted, the default set of seven application columns is used
            (social media, Google, email, YouTube, Netflix, gaming, other).

    Returns:
        The correlation matrix produced by ``DataFrame.corr()`` for the
        selected columns.

    Raises:
        KeyError: If any selected column is missing from ``df``.
    """
    if app_columns is None:
        app_columns = ['social_media_data', 'google_data', 'email_data',
                       'youtube_data', 'netflix_data', 'gaming_data', 'other_data']
    else:
        # Materialise as a list so tuples/other iterables select several
        # columns rather than being treated as a single key
        app_columns = list(app_columns)

    # Compute correlation matrix
    corr_matrix = df[app_columns].corr()

    # Plot correlation heatmap; center=0 puts the colormap midpoint at zero
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Application Data')
    plt.show()

    return corr_matrix

## 6. Dimensionality Reduction

In [None]:
def perform_pca(df):
    """Fit a PCA on the standardized numeric columns of df.

    Selects the numeric columns, standardizes them with ``StandardScaler``,
    fits a ``PCA`` with default settings, and plots the cumulative
    explained variance ratio against the number of components.

    Args:
        df: DataFrame with at least one numeric column.

    Returns:
        The fitted ``PCA`` object.
    """
    # Select numerical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    # Scale the data so every column enters the PCA standardized
    # NOTE(review): missing values are not handled here; confirm the input
    # is complete (or imputed) before calling.
    scaled_data = StandardScaler().fit_transform(df[numerical_cols])

    # Only the fitted model is used below, so fit without computing (and
    # discarding) the projected data
    pca = PCA()
    pca.fit(scaled_data)

    # Plot explained variance ratio
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('PCA Analysis')
    plt.show()

    return pca