In [1]:
#import libs
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

In [2]:
#Merge CSVs
all_files = glob.glob(os.path.join("../data/raw/SiteKpi/*.csv"))
df_from_each_file = (pd.read_csv(f, sep=',') for f in all_files)
df_merged   = pd.concat(df_from_each_file, ignore_index=True)

In [3]:
df = df_merged.copy()
df.head()

Unnamed: 0,Time,eNodeB Name,Integrity,EPM_RRC_SR,EPM_UL_Traffic_GB,EPM_ERAB_DR,EPM_ERAB_SR,EPM_DL_Traffic_GB,EPM_CSFB_Preparation_SR,EPM_CSFB_SR,...,TE Availabity Daily,EPM_Dropped_Sessions,L.Cell.Unavail.Dur.Manual(s),L.Cell.Unavail.Dur.Sys(s),L.Cell.Avail.Dur(s),EPM_RRC_Failures,EPM_ERABSR_Failures,EPM_Total_Sessions,EPM_ERABSR_Attempts,EPM_RRC_Attempts
0,4/1/2022 0:00,LCAIW30340_Site_83,100%,99.9821,0.2975,0.017,99.9654,4.0582,100.0,100.0,...,100.0,1,0,0,32400,1,2,5888,5783,5601
1,4/1/2022 0:00,LCAIE10014_Site_210,100%,100.0,0.3741,0.1565,99.9617,4.6417,100.0,100.0,...,100.0,8,0,0,21600,0,2,5112,5222,4836
2,4/1/2022 0:00,LCAIE10010_Nasr City 2 LE,100%,99.9847,0.5429,0.1722,99.8999,5.3775,100.0,100.0,...,100.0,12,0,0,21600,1,7,6970,6994,6540
3,4/1/2022 0:00,LCAIN20172_Sheheby,100%,100.0,0.0593,0.5115,99.9349,3.2884,100.0,100.0,...,100.0,8,0,0,32400,0,1,1564,1535,1494
4,4/1/2022 0:00,LCAIE30093_01-3-14-73,100%,100.0,0.5062,0.0844,99.3691,6.5867,100.0,100.0,...,100.0,4,0,0,21600,0,30,4739,4755,4428


In [4]:
print("Total Number of Rows : {}".format(df.shape[0]))
print("Total Number of Features : {}".format(df.shape[1]))

Total Number of Rows : 1048569
Total Number of Features : 21


In [5]:
# Show uniqe values of each column
for i in range(df.shape[1]):
    print("###########################     {}    ###########################".format(df.columns[i]))
    print(df.iloc[:,i].unique())
    print(" ")
    print(" ")

###########################     Time    ###########################
['4/1/2022 0:00' '4/1/2022 1:00' '4/1/2022 2:00' '4/1/2022 3:00'
 '4/1/2022 4:00' '4/1/2022 5:00' '4/1/2022 6:00' '4/1/2022 7:00'
 '4/1/2022 8:00' '4/1/2022 9:00' '4/1/2022 10:00' '4/1/2022 11:00'
 '4/1/2022 12:00' '4/1/2022 13:00' '4/1/2022 14:00' '4/1/2022 15:00'
 '4/1/2022 16:00' '4/1/2022 17:00' '4/1/2022 18:00' '4/1/2022 19:00'
 '4/1/2022 20:00' '4/1/2022 21:00' '4/1/2022 22:00' '4/1/2022 23:00'
 '4/2/2022 0:00' '4/2/2022 1:00' '4/2/2022 2:00' '4/2/2022 3:00'
 '4/2/2022 4:00' '4/2/2022 5:00' '4/2/2022 6:00' '4/2/2022 7:00'
 '4/2/2022 8:00' '4/2/2022 9:00' '4/2/2022 10:00' '4/2/2022 11:00'
 '4/2/2022 12:00' '4/2/2022 13:00' '4/2/2022 14:00' '4/2/2022 15:00'
 '4/2/2022 16:00' '4/2/2022 17:00' '4/2/2022 18:00' '4/2/2022 19:00'
 '4/2/2022 20:00' '4/2/2022 21:00' '4/2/2022 22:00' '4/2/2022 23:00'
 '4/3/2022 0:00' '4/3/2022 1:00' '4/3/2022 2:00' '4/3/2022 3:00'
 '4/3/2022 4:00' '4/3/2022 5:00' '4/3/2022 6:00' '4/3/2022 

In [6]:
#Check Missing Values
def check_missing(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data.head(20)

check_missing(df)

Unnamed: 0,Total,Percent
EPM_CSFB_SR,50333,0.048002
EPM_CSFB_Preparation_SR,50289,0.04796
EPM_RRC_SR,14045,0.013394
EPM_ERAB_SR,12472,0.011894
EPM_ERAB_DR,10565,0.010076
Time,0,0.0
L.Cell.Unavail.Dur.Manual(s),0,0.0
EPM_ERABSR_Attempts,0,0.0
EPM_Total_Sessions,0,0.0
EPM_ERABSR_Failures,0,0.0


In [8]:
#check duplicate rows
def check_duplicate(data):
    duplicate = data.duplicated()
    unique_data  = pd.Series(data.shape[0] - duplicate.value_counts())
    duplicate_data = pd.Series(data.shape[0] - duplicate.value_counts()[0])
    unique_data_percent = pd.Series((unique_data/data.shape[0])*100)
    duplicate_data_percent = pd.Series((duplicate_data/data.shape[0])*100)
    dub = pd.concat([unique_data, duplicate_data,unique_data_percent,duplicate_data_percent], axis=1, keys=['Unique Count', 'Duplicate Count','Unique percent','Duplicate Percent'])
    return dub

check_duplicate(df)

Unnamed: 0,Unique Count,Duplicate Count,Unique percent,Duplicate Percent
False,0,0,0.0,0.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048569 entries, 0 to 1048568
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   Time                          1048569 non-null  object 
 1   eNodeB Name                   1048569 non-null  object 
 2   Integrity                     1048569 non-null  object 
 3   EPM_RRC_SR                    1034524 non-null  float64
 4   EPM_UL_Traffic_GB             1048569 non-null  float64
 5   EPM_ERAB_DR                   1038004 non-null  object 
 6   EPM_ERAB_SR                   1036097 non-null  object 
 7   EPM_DL_Traffic_GB             1048569 non-null  float64
 8   EPM_CSFB_Preparation_SR       998280 non-null   float64
 9   EPM_CSFB_SR                   998236 non-null   float64
 10  TE Hourly Avala               1048569 non-null  float64
 11  TE Availabity Daily           1048569 non-null  float64
 12  EPM_Dropped_Sessions        

In [10]:
correlated_features = set()
correlation_matrix = df.corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i,j])>0.9:
            col_name = correlation_matrix.columns[i]
            correlated_features.add(col_name)
print(correlated_features)

{'TE Availabity Daily'}


# Conclusion and Recommended actions

- there are missing values should deal with them
- Time should be in datetime format not object format
- Some columns should be float or int not object