In [1]:
import os, sys

# Make the parent of the working directory importable so the `scripts`
# package imported below can be resolved; the guard avoids adding a duplicate
# entry when this cell is run again.
rpath = os.path.abspath(os.pardir)
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import scripts.read_data_from_db as rd
import scripts.write_to_db as wd
import scripts.data_cleaning as sd 


In [2]:
# Fetch the source dataset through the project's scripts.read_data_from_db helper.
df = rd.read_data()


INFO:scripts.read_data_from_db:Data fetched succesfully


### Drop columns with 70% or more missing values

In [5]:
# 0.7 is the missing-value threshold handed to the helper (70%, per the section heading).
# NOTE: this rebinds `df`, so the frame as originally loaded is no longer reachable by name.
df = sd.drop_high_missing_columns(df, 0.7)
# Bare expression: display the column labels that remain.
df.columns


Index(['Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'Dur. (ms)', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'HTTP DL (Bytes)', 'HTTP UL (Bytes)', 'Activity Duration DL (ms)',
       'Activity Duration UL (ms)', 'Dur. (ms).1', 'Handset Manufacturer',
       'Handset Type', 'Nb of sec with 125000B < Vol DL',
       'Nb of sec with 1250B < Vol UL < 6250B',
       'Nb of sec with 31250B < Vol DL < 125000B',
       'Nb of sec with 6250B < Vol DL < 31250B',
       'Nb of sec with Vol DL < 6250B', 'Nb of sec with Vol UL < 1250B',
       'Social Me

### Remove rows with missing values in key columns

In [6]:
def remove_missing_values(data=None):
    """Run ``sd.remove_rows_with_missing_values`` over the key session columns.

    The helper receives the timing, duration, bearer-throughput,
    activity-duration and total-volume columns listed below as the columns to
    check.

    Args:
        data: Frame to process. Defaults to ``None``, in which case the
            notebook-level ``df`` is looked up at call time, so the original
            zero-argument call behaves exactly as before.

    Returns:
        Whatever ``sd.remove_rows_with_missing_values`` returns for the chosen
        frame and column list.
    """
    columns_to_check = [
        'Start',
        'Start ms',
        'End',
        'End ms',
        'Dur. (ms)',
        'Avg Bearer TP DL (kbps)',
        'Avg Bearer TP UL (kbps)',
        'Activity Duration DL (ms)',
        'Activity Duration UL (ms)',
        'Dur. (ms).1',
        'Total UL (Bytes)',
        'Total DL (Bytes)',
    ]

    # Resolve the fallback lazily: a ``data=df`` default would freeze whichever
    # ``df`` object existed when this cell was executed.
    frame = df if data is None else data
    return sd.remove_rows_with_missing_values(frame, columns_to_check)

# The result is stored under a new name; the name `df` is not rebound by this cell.
cleaned_df = remove_missing_values()

In [7]:
cleaned_df.isnull().sum()

Bearer Id                                     990
Start                                           0
Start ms                                        0
End                                             0
End ms                                          0
Dur. (ms)                                       0
IMSI                                          569
MSISDN/Number                                1065
IMEI                                          571
Last Location Name                           1152
Avg RTT DL (ms)                             27828
Avg RTT UL (ms)                             27811
Avg Bearer TP DL (kbps)                         0
Avg Bearer TP UL (kbps)                         0
TCP DL Retrans. Vol (Bytes)                 88145
TCP UL Retrans. Vol (Bytes)                 96648
DL TP < 50 Kbps (%)                           753
50 Kbps < DL TP < 250 Kbps (%)                753
250 Kbps < DL TP < 1 Mbps (%)                 753
DL TP > 1 Mbps (%)                            753


### Impute missing numeric values with the column mean


In [8]:
def impute_columns(data=None):
    """Run ``sd.impute_numeric_missing`` over the RTT, retransmission,
    throughput-bucket, HTTP and per-second volume columns.

    The fill strategy is implemented by the helper; the notebook's section
    heading says the mean is used, which should be confirmed against
    ``scripts.data_cleaning``.

    Args:
        data: Frame to process. Defaults to ``None``, in which case the
            notebook-level ``cleaned_df`` is looked up at call time, so the
            original zero-argument call behaves exactly as before.

    Returns:
        Whatever ``sd.impute_numeric_missing`` returns for the chosen frame and
        column list.
    """
    columns_to_impute = [
        "Avg RTT DL (ms)",
        "Avg RTT UL (ms)",
        "TCP DL Retrans. Vol (Bytes)",
        "TCP UL Retrans. Vol (Bytes)",
        "DL TP < 50 Kbps (%)",
        "50 Kbps < DL TP < 250 Kbps (%)",
        "250 Kbps < DL TP < 1 Mbps (%)",
        "DL TP > 1 Mbps (%)",
        "UL TP < 10 Kbps (%)",
        "10 Kbps < UL TP < 50 Kbps (%)",
        "50 Kbps < UL TP < 300 Kbps (%)",
        "UL TP > 300 Kbps (%)",
        "HTTP DL (Bytes)",
        "HTTP UL (Bytes)",
        "Nb of sec with 125000B < Vol DL",
        "Nb of sec with 1250B < Vol UL < 6250B",
        "Nb of sec with 31250B < Vol DL < 125000B",
        "Nb of sec with 6250B < Vol DL < 31250B",
        "Nb of sec with Vol DL < 6250B",
        "Nb of sec with Vol UL < 1250B",
    ]

    # Resolve the fallback lazily so the function always sees the current
    # ``cleaned_df`` rather than one captured when the cell was defined.
    frame = cleaned_df if data is None else data
    return sd.impute_numeric_missing(frame, columns_to_impute)


# The result gets its own name; this cell does not rebind `cleaned_df`.
imputed_df = impute_columns()

In [9]:
imputed_df.isnull().sum()

Bearer Id                                    990
Start                                          0
Start ms                                       0
End                                            0
End ms                                         0
Dur. (ms)                                      0
IMSI                                         569
MSISDN/Number                               1065
IMEI                                         571
Last Location Name                          1152
Avg RTT DL (ms)                                0
Avg RTT UL (ms)                                0
Avg Bearer TP DL (kbps)                        0
Avg Bearer TP UL (kbps)                        0
TCP DL Retrans. Vol (Bytes)                    0
TCP UL Retrans. Vol (Bytes)                    0
DL TP < 50 Kbps (%)                            0
50 Kbps < DL TP < 250 Kbps (%)                 0
250 Kbps < DL TP < 1 Mbps (%)                  0
DL TP > 1 Mbps (%)                             0
UL TP < 10 Kbps (%) 

### Replace missing values with the column mode

In [10]:
def replace_with_mode(data=None):
    """Run ``sd.replace_column_with_mode`` over the identifier and location
    columns.

    Args:
        data: Frame to process. Defaults to ``None``, in which case the
            notebook-level ``imputed_df`` is looked up at call time, so the
            original zero-argument call behaves exactly as before.

    Returns:
        Whatever ``sd.replace_column_with_mode`` returns for the chosen frame
        and column list.
    """
    columns_to_replace = [
        'Bearer Id',
        'IMSI',
        'MSISDN/Number',
        'IMEI',
        'Last Location Name',
    ]

    # Resolve the fallback lazily so the function always sees the current
    # ``imputed_df`` rather than one captured when the cell was defined.
    frame = imputed_df if data is None else data
    return sd.replace_column_with_mode(frame, columns_to_replace)

# NOTE(review): this rebinds `cleaned_df`, which an earlier cell assigned from
# remove_missing_values(); when cells run top to bottom, later reads of
# `cleaned_df` see the result of replace_with_mode() instead.
cleaned_df = replace_with_mode()

In [11]:
cleaned_df.isnull().sum()

Bearer Id                                     0
Start                                         0
Start ms                                      0
End                                           0
End ms                                        0
Dur. (ms)                                     0
IMSI                                          0
MSISDN/Number                                 0
IMEI                                          0
Last Location Name                            0
Avg RTT DL (ms)                               0
Avg RTT UL (ms)                               0
Avg Bearer TP DL (kbps)                       0
Avg Bearer TP UL (kbps)                       0
TCP DL Retrans. Vol (Bytes)                   0
TCP UL Retrans. Vol (Bytes)                   0
DL TP < 50 Kbps (%)                           0
50 Kbps < DL TP < 250 Kbps (%)                0
250 Kbps < DL TP < 1 Mbps (%)                 0
DL TP > 1 Mbps (%)                            0
UL TP < 10 Kbps (%)                     

### Remove outliers

In [12]:
def remove_outliers(data=None):
    """Run ``sd.handle_outliers`` over the RTT, retransmission,
    throughput-bucket, HTTP and per-second volume columns.

    How outliers are treated is decided by the helper, not here.

    Args:
        data: Frame to process. Defaults to ``None``, in which case the
            notebook-level ``cleaned_df`` is looked up at call time, so the
            original zero-argument call behaves exactly as before. Because
            ``cleaned_df`` is assigned by more than one cell in this notebook,
            pass the frame explicitly whenever cells may have been run out of
            order.

    Returns:
        Whatever ``sd.handle_outliers`` returns for the chosen frame and
        column list.
    """
    columns = [
        "Avg RTT DL (ms)",
        "Avg RTT UL (ms)",
        "TCP DL Retrans. Vol (Bytes)",
        "TCP UL Retrans. Vol (Bytes)",
        "DL TP < 50 Kbps (%)",
        "50 Kbps < DL TP < 250 Kbps (%)",
        "250 Kbps < DL TP < 1 Mbps (%)",
        "DL TP > 1 Mbps (%)",
        "UL TP < 10 Kbps (%)",
        "10 Kbps < UL TP < 50 Kbps (%)",
        "50 Kbps < UL TP < 300 Kbps (%)",
        "UL TP > 300 Kbps (%)",
        "HTTP DL (Bytes)",
        "HTTP UL (Bytes)",
        "Nb of sec with 125000B < Vol DL",
        "Nb of sec with 1250B < Vol UL < 6250B",
        "Nb of sec with 31250B < Vol DL < 125000B",
        "Nb of sec with 6250B < Vol DL < 31250B",
        "Nb of sec with Vol DL < 6250B",
        "Nb of sec with Vol UL < 1250B",
    ]

    # Resolve the fallback lazily so the function always sees the current
    # ``cleaned_df`` rather than one captured when the cell was defined.
    frame = cleaned_df if data is None else data
    return sd.handle_outliers(frame, columns)

# NOTE(review): remove_outliers() reads `cleaned_df`; in the recorded execution
# order that is the frame returned by replace_with_mode().
processed_df = remove_outliers()


In [13]:
processed_df.head()

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
5,1.311448e+19,4/12/2019 21:37,439.0,4/25/2019 8:08,553.0,1074638.0,208201400000000.0,33668190000.0,35298410000000.0,T89132C,...,21332918.0,2611113.0,7345361.0,6536448.0,710293538.0,13167974.0,654978277.0,4436057.0,30307754.0,754452212.0
10,7.277826e+18,4/15/2019 10:30,90.0,4/25/2019 6:14,775.0,848637.0,208209800000000.0,33665650000.0,35346810000000.0,T29911A,...,2595678.0,19385457.0,16973815.0,9805785.0,565283082.0,9640759.0,95716749.0,14831312.0,56559490.0,594967693.0
12,1.311448e+19,4/15/2019 12:20,729.0,4/25/2019 8:40,862.0,850766.0,208200300000000.0,33603290000.0,35665010000000.0,D76026B,...,15567567.0,10610680.0,12189103.0,10621276.0,766292761.0,3655164.0,34550147.0,11326781.0,39654040.0,809144948.0


### Write processed data to the database

In [14]:
# NOTE(review): the recorded run of this cell ends in
# `NameError: name 'temp' is not defined`. `temp` does not appear in the cells
# shown here, so the error presumably originates inside
# scripts.write_to_db.write_data — confirm and fix it there.
wd.write_data(processed_df, 'processed_data')

NameError: name 'temp' is not defined