In [None]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Put the project checkout on sys.path so the `utils.*` imports in this cell resolve.
# The location can be overridden with the USER_ANALYTICS_ROOT environment variable;
# when it is not set, the original hard-coded path is used unchanged.
PROJECT_ROOT = os.environ.get(
    'USER_ANALYTICS_ROOT',
    '/media/moraa/New Volume/Ontita/10Academy/Cohort B/Projects/Week1/User_Analytics_in_the_telecommunication_Industry',
)

# Guard against duplicates so re-running this cell does not keep growing sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from dotenv import load_dotenv
from utils.db_connections import DBConnection
from utils.clean import DataCleaner
from utils.plots import plot_distribution, plot_boxplot, plot_heatmap, plot_countplot, plot_histplot

In [None]:
# Table to load from the database
table_name = 'xdr_data'

# Use the project's DBConnection helper to pull the whole table into a DataFrame
db_conn = DBConnection()
df = db_conn.read_table_to_dataframe(table_name)

# Preview the first rows of the loaded frame
df.head()

## Data Preprocessing

### Handling Missing Values

In [None]:
# Cleaning helper provided by utils.clean
cleaner = DataCleaner()

# Cleaning steps, applied in this order; each one receives the frame and
# returns the frame that is fed to the next step.
cleaning_steps = (
    cleaner.remove_columns_with_missing_values,  # drop columns whose missing values exceed the threshold
    cleaner.fill_missing_values_numerical,       # fill gaps in numerical columns
    cleaner.fill_missing_values_categorical,     # fill gaps in categorical columns
)

for apply_step in cleaning_steps:
    df = apply_step(df)


In [None]:

# Count the missing values left in each column after cleaning
df.isna().sum()

In [None]:
# Sub-frame holding only the float64 / int64 columns of df
numerical_cols = df.select_dtypes(include=['float64', 'int64'])

# Show the sub-frame under a heading
print("Numerical Columns:", numerical_cols, sep="\n")

In [None]:
# Sub-frame holding only the object-dtype columns of df
categorical_cols = df.select_dtypes(include=['object'])

# Show the sub-frame under a heading (the heading starts with a blank line)
print("\nCategorical Columns:", categorical_cols, sep="\n")

In [None]:
# Grid geometry: three panels per row, with enough rows for every numerical column
numeric_names = list(numerical_cols.columns)
num_cols = len(numeric_names)
num_rows = -(-num_cols // 3)  # ceiling division by 3

# One horizontal boxplot per numerical column; the figure grows 6 units per row
plt.figure(figsize=(18, num_rows * 6))
for panel, column_name in enumerate(numeric_names, start=1):
    plt.subplot(num_rows, 3, panel)
    plt.boxplot(df[column_name], vert=False)
    plt.title(f'Boxplot of {column_name}')
    plt.xlabel('Values')
    plt.ylabel(column_name)
    plt.grid(True)

plt.tight_layout()
plt.show()

## User Overview Analysis

In [None]:
def _dispersion_summary(series):
    """Return the range, variance, standard deviation and IQR of one column."""
    return {
        'Range': series.max() - series.min(),
        'Variance': series.var(),
        'Standard Deviation': series.std(),
        'Interquartile Range (IQR)': series.quantile(0.75) - series.quantile(0.25),
    }


# Step 2: dispersion parameters for every numerical column, keyed by column name
dispersion_params = {
    column_name: _dispersion_summary(numerical_cols[column_name])
    for column_name in numerical_cols.columns
}

# Step 3: print the parameters column by column for interpretation
for column_name, summary in dispersion_params.items():
    print(f"Dispersion Parameters for {column_name}:")
    print(summary)

### Graphical Univariate Analysis

In [None]:
# Grid geometry: three histograms per row, plus one extra row for any remainder
num_cols = len(numerical_cols.columns)
num_rows, leftover = divmod(num_cols, 3)
if leftover:
    num_rows += 1

# All histograms share one figure
plt.figure(figsize=(18, 12))

# Draw each numerical column in its own panel using the project's plot_histplot helper
for zero_based, column_name in enumerate(numerical_cols.columns):
    plt.subplot(num_rows, 3, zero_based + 1)  # subplot positions are 1-based
    plot_histplot(df, x_column=column_name, y_column=None)  # no y column for a histogram
    plt.title(f'Histogram of {column_name}')

plt.tight_layout()
plt.show()

In [None]:
# Seaborn boxplots for every numerical column, three panels per row.
num_cols = len(numerical_cols.columns)
num_rows = (num_cols // 3) + (num_cols % 3 > 0)  # rows needed for three panels per row

# Scale the figure height with the number of rows (4 units per row, which matches
# the previous fixed 18x12 figure at three rows) so panels stay legible when there
# are many numerical columns.
plt.figure(figsize=(18, max(num_rows, 1) * 4))

for i, col in enumerate(numerical_cols.columns):
    # The grid is (rows, columns): num_rows rows of 3 panels. The earlier call passed
    # (3, num_rows), which used the computed row count as the column count.
    plt.subplot(num_rows, 3, i + 1)
    sns.boxplot(y=numerical_cols[col])
    plt.title('Boxplot of ' + col)

plt.tight_layout()
plt.show()

In [None]:
# (x, y) column pairs to compare in scatter plots
quantitative_pairs = [
    ('Total UL (Bytes)', 'Total DL (Bytes)'),
    ('Social Media DL (Bytes)', 'Social Media UL (Bytes)'),
]

# One panel per pair, side by side in a single row
plt.figure(figsize=(12, 8))
for panel, (x_name, y_name) in enumerate(quantitative_pairs, start=1):
    plt.subplot(1, 2, panel)
    sns.scatterplot(data=df, x=x_name, y=y_name)
    plt.title(f'Scatter Plot of {x_name} vs {y_name}')
plt.tight_layout()
plt.show()

In [None]:

# Bar plots of the most frequent levels of each categorical (object) column.
# Drawing every level is slow and unreadable for high-cardinality columns
# (the object columns include the 'Start' timestamp strings), so each panel is
# limited to the TOP_N_CATEGORIES most frequent values.
TOP_N_CATEGORIES = 10

# Grid geometry: three panels per row, rows computed from the number of columns
num_cols = 3
num_rows = (len(categorical_cols.columns) - 1) // num_cols + 1

# Scale the height with the number of rows (5 units per row, which matches the
# previous fixed 16x10 figure at two rows).
plt.figure(figsize=(16, max(num_rows, 1) * 5))

for i, col in enumerate(categorical_cols.columns):
    level_counts = df[col].value_counts()  # sorted by frequency, most common first
    shown_levels = level_counts.index[:TOP_N_CATEGORIES]

    plt.subplot(num_rows, num_cols, i + 1)
    sns.countplot(x=col, data=df, order=shown_levels)

    # Say so in the title when the panel shows only a subset of the levels
    if len(level_counts) > TOP_N_CATEGORIES:
        plt.title('Bar Plot of ' + col + f' (top {TOP_N_CATEGORIES})')
    else:
        plt.title('Bar Plot of ' + col)
    plt.xticks(rotation=45, ha='right')  # keep long category labels from overlapping

plt.tight_layout()
plt.show()

In [None]:
# Plot time series for the 'Start' variable.
# 'Start' lives in the object-dtype sub-frame, i.e. it holds timestamp strings.
# Parse it to datetimes so the x axis is a real, ordered time axis rather than
# one categorical tick per distinct string; values that cannot be parsed become NaT.
start_times = pd.to_datetime(categorical_cols['Start'], errors='coerce')

plt.figure(figsize=(12, 6))
# x and y are passed as vectors, so no `data=` frame is needed here.
sns.lineplot(x=start_times, y=categorical_cols.index)
plt.title('Time Series Plot of Start Timestamp')
plt.xlabel('Start Timestamp')
plt.ylabel('Index')
plt.tight_layout()
plt.show()


### Bivariate Analysis

In [None]:
# Put the categorical and numerical sub-frames side by side (column-wise) in one frame
frames_to_join = [categorical_cols, numerical_cols]
combined_df = pd.concat(frames_to_join, axis='columns')

# Preview the combined frame
combined_df.head()

In [None]:
# Per-application volume columns: every application's DL column first, then every UL column
application_names = ['Social Media', 'Google', 'Email', 'Youtube', 'Netflix', 'Gaming', 'Other']
applications_columns = [
    f'{application} {direction} (Bytes)'
    for direction in ('DL', 'UL')
    for application in application_names
]

# Total traffic per row = download bytes + upload bytes
total_traffic = combined_df['Total DL (Bytes)'] + combined_df['Total UL (Bytes)']

# Correlation of each application column with the total traffic
correlation = combined_df[applications_columns].corrwith(total_traffic)

print("Correlation coefficients between each application and Total DL+UL data:", correlation, sep="\n")

In [None]:
# Turn the correlation Series into a one-column frame so it can be drawn as a heatmap
correlation_df = correlation.to_frame(name='Correlation')

# Single-row heatmap: applications along x, the correlation value annotated in each cell
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_df.T, annot=True, cmap='coolwarm', cbar=False)
plt.title('Correlation between Applications and Total DL+UL Data Volume')
plt.xlabel('Applications')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()


In [None]:
# Per-row duration in seconds (sum of the two duration columns, divided by 1000).
# NOTE(review): this adds 'Dur. (ms)' and 'Dur. (ms).1' together; if they hold the
# same session duration in different units the sum double counts it. Confirm
# against the data dictionary.
combined_df['Total Duration (s)'] = (combined_df['Dur. (ms)'] + combined_df['Dur. (ms).1']) / 1000

# Total duration per user (MSISDN/Number) across all of their sessions
user_total_duration = combined_df.groupby('MSISDN/Number')['Total Duration (s)'].sum()

# Segment users into deciles by total duration (integer bin codes, lowest bin = 0).
# duplicates='drop' merges bins whose edges coincide instead of raising ValueError
# when many users share the same total duration; with unique edges the result is
# unchanged.
user_total_duration_deciles = pd.qcut(user_total_duration, q=10, labels=False, duplicates='drop')

# Attach each user's decile class to every one of their rows
combined_df['Decile Class'] = combined_df['MSISDN/Number'].map(user_total_duration_deciles)

# Total download and upload volume per decile class
total_data_per_decile = combined_df.groupby('Decile Class')[['Total DL (Bytes)', 'Total UL (Bytes)']].sum()

# Combined (DL+UL) volume in bytes
total_data_per_decile['Total Data (DL+UL)'] = total_data_per_decile['Total DL (Bytes)'] + total_data_per_decile['Total UL (Bytes)']

# Largest total first; reassign rather than sorting in place
total_data_per_decile = total_data_per_decile.sort_values(by='Total Data (DL+UL)', ascending=False)

# Display the result
print("Total data (DL+UL) per decile class:")
print(total_data_per_decile)

In [None]:
# Download-volume column of each application, used for the pairwise correlations
selected_columns = [
    f'{application} DL (Bytes)'
    for application in ('Social Media', 'Google', 'Email', 'Youtube', 'Netflix', 'Gaming', 'Other')
]

# Pairwise correlation matrix between the selected columns
app_download_volumes = combined_df[selected_columns]
correlation_matrix = app_download_volumes.corr()

# Print the matrix for interpretation
print("Correlation Matrix:", correlation_matrix, sep="\n")

In [None]:
# Perform PCA on the numeric part of combined_df.
# combined_df also carries the object (string) columns, which PCA cannot fit,
# so restrict the input to numeric columns first.
numeric_features = combined_df.select_dtypes(include='number')

# Drop columns with no spread (standard deviation of 0 or NaN): they cannot be
# standardised and carry no variance for PCA to explain.
numeric_features = numeric_features.loc[:, numeric_features.std() > 0]

# Standardise each column to zero mean and unit variance so that columns measured
# on very different scales do not dominate the components, then drop any row that
# still contains a missing value (PCA does not accept NaN).
standardized_features = (
    (numeric_features - numeric_features.mean()) / numeric_features.std()
).dropna()

# Two components; random_state fixes the result if a randomized solver is selected.
pca = PCA(n_components=2, random_state=42)
principal_components = pca.fit_transform(standardized_features)

# Create a DataFrame to store the principal components, keeping the row labels of
# the rows that were actually used.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['PC1', 'PC2'],
    index=standardized_features.index,
)

In [None]:
# Scatter of the first principal component against the second
fig, ax = plt.subplots()
ax.scatter(principal_df['PC1'], principal_df['PC2'])
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA')
plt.show()

In [None]:
# Download-volume column of each application: the PCA input features
selected_columns = [
    f'{application} DL (Bytes)'
    for application in ('Social Media', 'Google', 'Email', 'Youtube', 'Netflix', 'Gaming', 'Other')
]
app_download_volumes = combined_df[selected_columns]

# Project the selected columns onto two principal components
pca = PCA(n_components=2)  # the number of components can be adjusted as needed
principal_components = pca.fit_transform(app_download_volumes)

# Store the components in a frame with one column per component
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])