In [2]:
import pandas as pd
import sys, os

# Put the parent of the current working directory on the import path so the
# `utils` import below can be resolved (presumably `utils` lives one level
# above the notebook folder — confirm against the repository layout).
rpath = os.path.abspath('..')
# Guard so that re-running this cell does not insert the same path again.
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from utils import DatabaseEngine

In [3]:
# Build the connection through the project's DatabaseEngine helper (imported from utils).
db_engine = DatabaseEngine()
# NOTE(review): create() presumably returns a SQLAlchemy-compatible engine — confirm in utils.
engine = db_engine.create()
# Read the "clean_xdr_data" table into a DataFrame; no column or chunk limits are passed.
clean_df = pd.read_sql_table("clean_xdr_data", con=engine)

Successfully connected to the PostgreSQL "telecom" database


#### Aggregate overview of the users’ behaviour

In [4]:
def process_user_info(clean_df, user_col='MSISDN/Number'):
    """Aggregate xDR records into one row of usage totals per user.

    The frame is grouped once by ``user_col`` and every metric is computed in
    a single aggregation pass.

    Parameters
    ----------
    clean_df : pandas.DataFrame
        xDR records. Must contain 'Bearer Id', 'Dur. (ms)',
        'Total DL (Bytes)', 'Total UL (Bytes)' and, for each of the
        applications Social Media, Google, Email, Youtube, Netflix, Gaming
        and Other, the columns '<App> DL (Bytes)' and '<App> UL (Bytes)'.
    user_col : str, default 'MSISDN/Number'
        Key that identifies a user; passed straight to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_col`` with 18 columns, in this order:
        'Number of Sessions' (count of non-null 'Bearer Id' values),
        'Total Session Duration' (sum of 'Dur. (ms)'),
        'Total Download Data', 'Total Upload Data', then
        '<App> Download Data' / '<App> Upload Data' for each application.
        Download and upload volumes are kept as separate columns; they are
        not added together.

    Raises
    ------
    KeyError
        If any of the required value columns is missing from ``clean_df``.
    """
    applications = ('Social Media', 'Google', 'Email', 'Youtube',
                    'Netflix', 'Gaming', 'Other')

    # source column -> (aggregation, output column); insertion order is the
    # column order of the result.
    spec = {
        # count() skips null Bearer Ids, so rows without one are not counted.
        'Bearer Id': ('count', 'Number of Sessions'),
        'Dur. (ms)': ('sum', 'Total Session Duration'),
        'Total DL (Bytes)': ('sum', 'Total Download Data'),
        'Total UL (Bytes)': ('sum', 'Total Upload Data'),
    }
    # Derive the per-application columns from one list so the aggregated
    # columns and their renamed labels cannot drift apart.
    for app in applications:
        spec[f'{app} DL (Bytes)'] = ('sum', f'{app} Download Data')
        spec[f'{app} UL (Bytes)'] = ('sum', f'{app} Upload Data')

    # Fail early with one message listing every missing value column.
    missing = [col for col in spec if col not in clean_df.columns]
    if missing:
        raise KeyError(f"clean_df is missing required column(s): {missing}")

    # One groupby / one pass instead of five separate groupbys plus a concat.
    aggregations = {source: how for source, (how, _) in spec.items()}
    new_names = {source: label for source, (_, label) in spec.items()}
    user_info = clean_df.groupby(user_col).agg(aggregations)

    return user_info.rename(columns=new_names)

# Keep only rows whose MSISDN/Number is not the literal 'Unknown'
# (presumably the placeholder written for missing numbers during cleaning —
# confirm), so those rows are not aggregated into a single pseudo-user.
filtered_df = clean_df[clean_df['MSISDN/Number'] != 'Unknown']
user_info = process_user_info(filtered_df)

# process_user_info already returns a DataFrame, so display it directly
# rather than re-wrapping it in pd.DataFrame(...).
user_info

# Save DataFrame to CSV
# user_info.to_csv('user_info.csv')

Unnamed: 0_level_0,Number of Sessions,Total Session Duration,Total Download Data,Total Upload Data,Social Media Download Data,Social Media Upload Data,Google Download Data,Google Upload Data,Email Download Data,Email Upload Data,Youtube Download Data,Youtube Upload Data,Netflix Download Data,Netflix Upload Data,Gaming Download Data,Gaming Upload Data,Other Download Data,Other Upload Data
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3197020876596.0,1,877385.0,1.948281e+08,37295915.0,668596.0,46628.0,8572779.0,1865881.0,842279.0,678492.0,9839889.0,2120016.0,10340908.0,16251392.0,164563605.0,13485133.0,4.676781e+08,2848373.0
33601001722.0,1,116720.0,8.426375e+08,36053108.0,2206504.0,25631.0,3337123.0,1051882.0,837400.0,493962.0,14900201.0,6724347.0,10265105.0,16915876.0,811091133.0,1367528.0,3.770970e+08,9473882.0
33601001754.0,1,181230.0,1.207552e+08,36104459.0,2598548.0,62017.0,4197697.0,1137166.0,2828821.0,478960.0,5324251.0,7107972.0,770569.0,10451194.0,105035298.0,14714780.0,2.795577e+08,2152370.0
33601002511.0,1,134969.0,5.566597e+08,39306820.0,3148004.0,47619.0,3343483.0,99643.0,2436500.0,768880.0,2137272.0,19196298.0,16525919.0,2827981.0,529068485.0,9759228.0,4.950865e+08,6607171.0
33601007832.0,1,49878.0,4.019932e+08,20327526.0,251469.0,28825.0,5937765.0,3740728.0,2178618.0,106052.0,4393123.0,2584198.0,1157362.0,784730.0,388074835.0,3051292.0,2.524800e+07,10031701.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33789967113.0,1,160461.0,1.740564e+08,34066711.0,1390786.0,22584.0,7178720.0,2714162.0,3301996.0,94611.0,19741224.0,8485255.0,19911928.0,10458374.0,122531702.0,12284217.0,4.080997e+08,7508.0
33789980299.0,2,210389.0,1.023862e+09,70831420.0,4200896.0,49416.0,5035730.0,5210743.0,4750550.0,564777.0,20353018.0,17659794.0,30828612.0,21121699.0,958693160.0,13651836.0,1.062567e+09,12573155.0
33789996170.0,1,8810.0,6.879252e+08,26716429.0,234320.0,65863.0,6834178.0,697091.0,480946.0,525969.0,8294310.0,18353533.0,14754741.0,147797.0,657326717.0,3034642.0,2.913913e+08,3891534.0
33789997247.0,1,140988.0,4.445751e+08,35732243.0,442214.0,56355.0,1472406.0,3957299.0,2513433.0,664.0,5596862.0,14254710.0,6929961.0,1601099.0,427620216.0,9383076.0,2.046361e+08,6479040.0
