In [6]:
import pandas as pd
import sys, os

# Put the parent directory at the front of the import search path so that the
# `from utils import ...` statement that follows can resolve a module located
# there (presumably the project's own `utils` package — confirm the layout).
rpath = os.path.abspath('..')
if rpath not in sys.path:  # guard: re-running the cell must not add duplicates
    sys.path.insert(0, rpath)

from utils import DatabaseEngine

In [7]:
# Create a database engine through the DatabaseEngine helper and read the whole
# "clean_xdr_data" table into memory as a DataFrame.
db_engine = DatabaseEngine()
# NOTE(review): create() presumably returns a SQLAlchemy engine/connectable,
# since it is passed as `con` to pandas below — confirm in utils.
engine = db_engine.create()
clean_df = pd.read_sql_table("clean_xdr_data", con=engine)

Successfully connected to the PostgreSQL "telecom" database


#### Aggregate overview of the users’ behaviour

In [13]:
def process_user_info(clean_df):
    """Aggregate xDR records into one summary row per user.

    Rows are grouped by ``'MSISDN/Number'``. For every user the result holds
    the number of sessions (count of non-null ``'Bearer Id'`` values), the
    summed session duration, the summed total download/upload volume and the
    summed download/upload volume of each application.

    Parameters
    ----------
    clean_df : pandas.DataFrame
        xDR records. Must contain ``'MSISDN/Number'`` and every source column
        listed in the mapping inside this function.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'MSISDN/Number'`` with 18 renamed columns, in the order
        in which they appear in the mapping below. Rows whose
        ``'MSISDN/Number'`` is missing are dropped by the group-by.

    Raises
    ------
    KeyError
        If any required column is absent from ``clean_df``.
    """
    group_key = 'MSISDN/Number'

    # Single source of truth: source column -> output column. The iteration
    # order of this dict also fixes the column order of the result.
    column_names = {
        'Bearer Id': 'Number of Sessions',
        'Dur. (ms)': 'Total Session Duration',
        'Total DL (Bytes)': 'Total Download Data',
        'Total UL (Bytes)': 'Total Upload Data',
        'Social Media DL (Bytes)': 'Social Media Download Data',
        'Social Media UL (Bytes)': 'Social Media Upload Data',
        'Google DL (Bytes)': 'Google Download Data',
        'Google UL (Bytes)': 'Google Upload Data',
        'Email DL (Bytes)': 'Email Download Data',
        'Email UL (Bytes)': 'Email Upload Data',
        'Youtube DL (Bytes)': 'Youtube Download Data',
        'Youtube UL (Bytes)': 'Youtube Upload Data',
        'Netflix DL (Bytes)': 'Netflix Download Data',
        'Netflix UL (Bytes)': 'Netflix Upload Data',
        'Gaming DL (Bytes)': 'Gaming Download Data',
        'Gaming UL (Bytes)': 'Gaming Upload Data',
        'Other DL (Bytes)': 'Other Download Data',
        'Other UL (Bytes)': 'Other Upload Data',
    }

    # Fail early with a readable message instead of an error raised from
    # inside pandas part-way through the aggregation.
    missing = [col for col in (group_key, *column_names) if col not in clean_df.columns]
    if missing:
        raise KeyError(f"clean_df is missing required column(s): {missing}")

    # Sessions are counted; every other metric is summed.
    aggregations = {
        col: ('count' if col == 'Bearer Id' else 'sum')
        for col in column_names
    }

    # One group-by pass instead of five separate ones over the same key.
    user_info = clean_df.groupby(group_key).agg(aggregations)
    return user_info.rename(columns=column_names)

# Exclude records whose subscriber number is the literal 'Unknown' (presumably a
# placeholder written during cleaning — confirm) so they are not aggregated
# together as if they were one user.
known_user_mask = clean_df['MSISDN/Number'].ne('Unknown')
filtered_df = clean_df.loc[known_user_mask]
user_info = process_user_info(filtered_df)

# `user_info` is already a DataFrame, so it is displayed directly rather than
# being re-wrapped in another pd.DataFrame(...) call.
user_info

# Save DataFrame to CSV
# user_info.to_csv('user_info.csv')