In [4]:
import pandas as pd
import sys, os

# Put the parent directory at the front of the import path (added once only);
# presumably this is what lets the `utils` import below resolve - confirm
# against the project layout.
rpath = os.path.abspath(os.pardir)
if not any(entry == rpath for entry in sys.path):
    sys.path.insert(0, rpath)

from utils import DataCleaner, DatabaseEngine

In [5]:
# Connect to the database through the project's DatabaseEngine helper, then
# load the whole "xdr_data" table into a dataframe and preview its first rows.
db_engine = DatabaseEngine()
engine = db_engine.create()
df = pd.read_sql_table(table_name="xdr_data", con=engine)
df.head(n=5)

Successfully connected to the PostgreSQL "telecom" database


Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [6]:
# Total number of missing cells in the raw dataframe:
# per-column missing counts first, then summed across all columns.

df.isna().sum().sum()


1031392

#### Cleaning the dataframe and writing it to the database

In [8]:
# Handle the missing values of the raw dataframe with the project's DataCleaner
# (its cleaning rules live in `utils` and are not visible from this notebook).
cleaner = DataCleaner(df)
cleaned_df = cleaner.clean()

# Persist the cleaned dataframe as the "clean_xdr_data" table, replacing any
# existing table of that name and leaving the dataframe index out.
cleaned_df.to_sql(
    name="clean_xdr_data",
    con=engine,
    index=False,
    if_exists="replace",
)

312

In [9]:
# Read the cleaned table back from the database into its own dataframe
clean_df = pd.read_sql_table(table_name="clean_xdr_data", con=engine)

# Total number of missing cells remaining in the cleaned data
clean_df.isna().sum().sum()

0

#### User Overview Analysis

In [9]:
# Top 10 handsets used by the customers.
# Rows whose handset type is the literal "undefined" are left out of the ranking.
filtered_handsets = clean_df.loc[clean_df["Handset Type"].ne("undefined")]
top_10_handsets = filtered_handsets["Handset Type"].value_counts().iloc[:10]
top_10_handsets.to_frame()

Unnamed: 0_level_0,count
Handset Type,Unnamed: 1_level_1
Huawei B528S-23A,20323
Apple iPhone 6S (A1688),9419
Apple iPhone 6 (A1586),9023
Apple iPhone 7 (A1778),6326
Apple iPhone Se (A1723),5187
Apple iPhone 8 (A1905),4993
Apple iPhone Xr (A2105),4568
Samsung Galaxy S8 (Sm-G950F),4520
Apple iPhone X (A1901),3813
Samsung Galaxy A5 Sm-A520F,3724


In [10]:
# Top 3 handset manufacturers, ranked by number of rows in the cleaned data
top_3_manufacturers = clean_df["Handset Manufacturer"].value_counts().iloc[:3]
top_3_manufacturers.to_frame()

Unnamed: 0_level_0,count
Handset Manufacturer,Unnamed: 1_level_1
Apple,60136
Samsung,40839
Huawei,34423


In [11]:
# Identify the top 5 handsets for each of the top 3 handset manufacturers.

# Derive the manufacturers from the data instead of hard-coding their names,
# so the ranking stays correct if the underlying table changes. The result is
# kept as a plain list of names, the type this cell leaves in
# `top_3_manufacturers`.
top_3_manufacturers = (
    clean_df["Handset Manufacturer"].value_counts().head(3).index.tolist()
)

results = []

for manufacturer in top_3_manufacturers:
    # Rank the handset types seen for this manufacturer by row count.
    manufacturer_rows = clean_df[clean_df["Handset Manufacturer"] == manufacturer]
    top_5_handsets = manufacturer_rows["Handset Type"].value_counts().head(5)
    temp_df = pd.DataFrame(
        {
            'Manufacturer': manufacturer,
            'Handset Type': top_5_handsets.index,
            'Count': top_5_handsets.values,
        }
    )
    results.append(temp_df)

# Concatenate the per-manufacturer tables into a single DataFrame.
# pd.concat raises on an empty list, so fall back to an empty table with the
# same columns when the data contains no manufacturers at all.
if results:
    results_df = pd.concat(results, ignore_index=True)
else:
    results_df = pd.DataFrame(columns=['Manufacturer', 'Handset Type', 'Count'])

# results_df is already a DataFrame, so display it directly
results_df

Unnamed: 0,Manufacturer,Handset Type,Count
0,Apple,Apple iPhone 6S (A1688),9419
1,Apple,Apple iPhone 6 (A1586),9023
2,Apple,Apple iPhone 7 (A1778),6326
3,Apple,Apple iPhone Se (A1723),5187
4,Apple,Apple iPhone 8 (A1905),4993
5,Samsung,Samsung Galaxy S8 (Sm-G950F),4520
6,Samsung,Samsung Galaxy A5 Sm-A520F,3724
7,Samsung,Samsung Galaxy J5 (Sm-J530),3696
8,Samsung,Samsung Galaxy J3 (Sm-J330),3484
9,Samsung,Samsung Galaxy S7 (Sm-G930X),3199
