# Task 1 - User Overview Analysis

Task 1.1

Using pandas, we can group the data by `user_id` and aggregate the required metrics for each user.

In [None]:
import pandas as pd

# Read the raw session records into a DataFrame
df = pd.read_csv('your_dataset.csv')

# Per-user summary: one output row per distinct 'user_id'.
# Each entry maps an output column to the (source column, aggregation) it is built from.
per_user_aggregations = {
    'number_of_xdr_sessions': ('session_id', 'count'),
    'total_session_duration': ('session_duration', 'sum'),
    'total_download_data': ('download_data', 'sum'),
    'total_upload_data': ('upload_data', 'sum'),
    # Assumes 'data_volume' is already expressed in bytes
    'total_data_volume': ('data_volume', 'sum'),
}
user_behavior = df.groupby('user_id').agg(**per_user_aggregations).reset_index()

# Show the per-user summary table
print(user_behavior)

Task 1.2

In [None]:
import pandas as pd

# Read the raw session records into a DataFrame
df = pd.read_csv('your_dataset.csv')

# Column dtypes, plus summary statistics for every column
# (include='all' covers non-numeric columns as well as numeric ones)
data_info = df.dtypes
data_description = df.describe(include='all')

# Print the dtype listing followed by the summary table
print(data_info, data_description, sep='\n')

Variable transformations

In [None]:
# Segment users into decile classes based on total session duration.
# transform('sum') broadcasts each user's summed duration back onto every one
# of that user's rows, so the binning below operates on the row-level frame.
df['total_session_duration'] = df.groupby('user_id')['session_duration'].transform('sum')

# A decile split needs 10 quantile bins (5 bins would be quintiles).
# labels=False returns integer bin codes, with 0 being the lowest-duration bin.
# duplicates='drop' merges bins whose edges coincide instead of raising
# ValueError, so fewer than 10 classes may appear when many totals are tied.
df['decile_class'] = pd.qcut(
    df['total_session_duration'], 10, labels=False, duplicates='drop'
)

# Compute total data (DL + UL) per decile class.
# Named aggregation accepts exactly one (column, func) pair per output, so the
# two columns are summed per class first and then added across columns.
data_per_decile = (
    df.groupby('decile_class')[['download_data', 'upload_data']]
    .sum()
    .sum(axis=1)
    .rename('total_data_per_decile')
    .reset_index()
)
print(data_per_decile)

Analyze the basic metrics

In [None]:
# Central tendency, spread and range for the three quantitative columns
basic_metrics = df[['session_duration', 'download_data', 'upload_data']].agg(
    ['mean', 'median', 'std', 'min', 'max']
)
print(basic_metrics)

 Non-Graphical Univariate Analysis

In [None]:
# Same summary statistics as the basic metrics, extended with the variance
dispersion_metrics = df[['session_duration', 'download_data', 'upload_data']].agg(
    ['mean', 'median', 'std', 'min', 'max', 'var']
)
print(dispersion_metrics)

 Graphical Univariate Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (30 bins, with a KDE overlay) per quantitative variable,
# laid out side by side in a single 1x3 figure
_, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, column in zip(axes, ['session_duration', 'download_data', 'upload_data']):
    sns.histplot(df[column], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

Bivariate Analysis

In [None]:
# Bivariate analysis between applications and total DL+UL data
applications = ['Social Media', 'Google', 'Email', 'YouTube', 'Netflix', 'Gaming', 'Other']

# 'download_data' + 'upload_data' on strings only concatenates the two names
# into a column that does not exist. Build the numeric DL+UL total explicitly
# on a copy, so the shared df is not given an extra column.
plot_df = df.assign(total_data=df['download_data'] + df['upload_data'])

# One scatter plot per application column against the combined traffic
for app in applications:
    sns.scatterplot(data=plot_df, x=app, y='total_data')
    plt.ylabel('Total Data (DL+UL)')
    plt.title(f'Relationship between {app} and Total Data (DL+UL)')
    plt.show()

Correlation Analysis 

In [None]:
# Pairwise correlation between the application columns
app_usage = df[applications]
correlation_matrix = app_usage.corr()
print(correlation_matrix)

# Annotated heatmap of the same matrix
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix of Applications')
plt.show()

Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# The application columns are the PCA input
features = df[applications]

# Project the features onto two principal components
component_names = ['PC1', 'PC2']
pca = PCA(n_components=len(component_names))
principal_components = pca.fit_transform(features)

# Wrap the projected values in a labelled DataFrame and preview the first rows
pca_df = pd.DataFrame(data=principal_components, columns=component_names)
print(pca_df.head())