In [1]:
import pandas as pd
import warnings
from dateutil.parser import parse
warnings.filterwarnings('ignore')
import os.path

### Load the buildings and cpq data

In [2]:
PATH = '../data/'
PROCESSED_PATH = '../processedData/'

cpq = pd.read_csv(PATH + 'ZayoHackathonData_CPQs.csv')
buildings = pd.read_csv(PATH + 'ZayoHackathonData_Buildings.csv')
opportunities = pd.read_csv(PATH + 'ZayoHackathonData_Opportunities.csv')
accounts = pd.read_csv(PATH + 'ZayoHackathonData_Accounts.csv')
sites = pd.read_csv(PATH + 'ZayoHackathonData_Sites.csv')

### drop rows that have duplicates on 'Account ID', 'Product Group', 'Building ID'

In [3]:
cpq['CreatedDate'] = cpq['CreatedDate'].apply(lambda x : parse(x))
cpq.sort_values(by='CreatedDate', inplace=True)
cpq.drop_duplicates(['Account ID', 'Product Group', 'Building ID'], inplace=True)
opportunities.drop_duplicates(['Account ID', 'Product Group', 'Building ID'], inplace=True)

### Get the count of records by the attribute 'On Zayo Network Status'

In [4]:
cpq['On Zayo Network Status'].value_counts()

On Zayo Network        5624
Not on Zayo Network    1214
Build in Progress        82
Name: On Zayo Network Status, dtype: int64

In [5]:
cpq.columns

Index(['CPQ ID', 'Account ID', 'CreatedDate', 'Product Group',
       ' X36 MRC List ', ' X36 NRR List ', ' X36 NPV List ', 'Building ID',
       'Market', 'Street Address', 'City', 'State', 'Postal Code',
       'Network Proximity', 'On Zayo Network Status'],
      dtype='object')

### Merge the cpq and buildings by doing an inner join

In [6]:
cpq_business = pd.merge(cpq, buildings, how='inner')

In [7]:
cpq_business.head()

Unnamed: 0,CPQ ID,Account ID,CreatedDate,Product Group,X36 MRC List,X36 NRR List,X36 NPV List,Building ID,Market,Street Address,City,State,Postal Code,Network Proximity,On Zayo Network Status,Latitude,Longitude,Net Classification,Type,Estimated Build Cost
0,CPQ-009438,Acct-000469,2016-07-01,Wavelengths - Metro,"$2,320.52","$3,227.00","$37,502.34",Bldg-065056,Dallas,2323 Bryan St,Dallas,TX,75201,64.67,On Zayo Network,32.787255,-96.794118,Fiber Only,Carrier Hotel,"$15,880.20"
1,CPQ-009494,Acct-000700,2016-07-05,Wavelengths - Long Haul,"$2,791.26","$2,766.00",$-,Bldg-065056,Dallas,2323 Bryan St,Dallas,TX,75201,64.67,On Zayo Network,32.787255,-96.794118,Fiber Only,Carrier Hotel,"$15,880.20"
2,CPQ-009517,Acct-000043,2016-07-06,Ethernet,"$1,078.38",$922.00,"$21,228.45",Bldg-065056,Dallas,2323 Bryan St,Dallas,TX,75201,64.67,On Zayo Network,32.787255,-96.794118,Fiber Only,Carrier Hotel,"$15,880.20"
3,CPQ-009482,Acct-000069,2016-07-06,Ethernet,$910.98,$922.00,"$16,035.62",Bldg-065056,Dallas,2323 Bryan St,Dallas,TX,75201,64.67,On Zayo Network,32.787255,-96.794118,Fiber Only,Carrier Hotel,"$15,880.20"
4,CPQ-009474,Acct-000019,2016-07-06,Wavelengths - Metro,"$1,423.57","$2,766.00","$24,683.18",Bldg-065056,Dallas,2323 Bryan St,Dallas,TX,75201,64.67,On Zayo Network,32.787255,-96.794118,Fiber Only,Carrier Hotel,"$15,880.20"


In [8]:
# len(cpq.loc[cpq['Building ID'] == 'Bldg-108671']), len(cpq_business.loc[cpq_business['Building ID'] == 'Bldg-108671'])

### converting the building ID to string values

In [9]:
cpq_business['Building ID'] = cpq_business['Building ID'].apply(str)
cpq_business['Building ID'].unique()

array(['Bldg-065056', 'Bldg-012582', 'Bldg-038069', ..., 'Bldg-122081',
       'Bldg-115638', 'Bldg-525138'], dtype=object)

### Select only those records that are not on Zayo Network

In [10]:
cpq_status = cpq_business.loc[cpq_business['On Zayo Network Status'] == 'Not on Zayo Network']

In [11]:
cpq_status.rename(columns={' X36 MRC List ': 'X36 MRC', ' X36 NRR List ': 'X36 NRR', ' X36 NPV List ': 'X36 NPV'}, inplace=True)

### Get price in numbers

In [12]:
cpq_status['X36 MRC'] = cpq_status['X36 MRC'].replace('[\$,)]','',regex=True).astype(float)
cpq_status['X36 NRR'] = cpq_status['X36 NRR'].str.replace(r'[$,]', '')
cpq_status['X36 NRR'] = cpq_status['X36 NRR'].str.replace('-', '0')
cpq_status['X36 NPV'] = cpq_status['X36 NPV'].str.replace(r'[$,]', '').replace('-', '0')
cpq_status['X36 NPV'] = cpq_status['X36 NPV'].str.replace('-', '0')
cpq_status[['X36 NRR','X36 NPV']] = cpq_status[['X36 NRR','X36 NPV']].apply(pd.to_numeric)
cpq_status[' Estimated Build Cost '] = cpq_status[' Estimated Build Cost '].replace('[\$,)]','',regex=True).astype(float)

cpq_status.head()

Unnamed: 0,CPQ ID,Account ID,CreatedDate,Product Group,X36 MRC,X36 NRR,X36 NPV,Building ID,Market,Street Address,City,State,Postal Code,Network Proximity,On Zayo Network Status,Latitude,Longitude,Net Classification,Type,Estimated Build Cost
497,CPQ-009426,Acct-000024,2016-07-01,IP Services,3469.99,922.0,28433.59,Bldg-449138,Dallas,15050 Trinity Blvd,Fort Worth,TX,76155,1795.08,Not on Zayo Network,32.820624,-97.040988,Fiber Only,Office - Multi Tenant,56877.0
1501,CPQ-009421,Acct-000039,2016-07-01,Ethernet,1543.49,461.0,10914.5,Bldg-133508,Dallas,5101 Statesman Dr,Irving,TX,75063,607.02,Not on Zayo Network,32.935867,-97.017868,On-Net,Office - Single Tenant,27175.5
1502,CPQ-009604,Acct-000074,2016-07-06,Ethernet,1543.49,461.0,10914.5,Bldg-133508,Dallas,5101 Statesman Dr,Irving,TX,75063,607.02,Not on Zayo Network,32.935867,-97.017868,On-Net,Office - Single Tenant,27175.5
1503,CPQ-010600,Acct-000114,2016-08-02,Ethernet,1543.49,461.0,10914.5,Bldg-133508,Dallas,5101 Statesman Dr,Irving,TX,75063,607.02,Not on Zayo Network,32.935867,-97.017868,On-Net,Office - Single Tenant,27175.5
1504,CPQ-001441,Acct-000034,2016-11-03,Ethernet,1543.49,461.0,10914.5,Bldg-133508,Dallas,5101 Statesman Dr,Irving,TX,75063,607.02,Not on Zayo Network,32.935867,-97.017868,On-Net,Office - Single Tenant,27175.5


### find number of distinct buildings on Zayo network

In [13]:
len(cpq_status['Building ID'].unique())

794

### Filter out the records based on the state

In [14]:
cpq_CO = cpq_status.loc[cpq_status['State'] == 'CO']
cpq_TX = cpq_status.loc[cpq_status['State'] == 'TX']
cpq_GA = cpq_status.loc[cpq_status['State'] == 'GA']

### For each individual state, calculate the profit incurred by each building

In [15]:
CO_profit = cpq_CO.groupby(cpq_CO['Building ID'])['X36 NPV'].sum().reset_index()
CO_profit.sort_values(by='Building ID', inplace=True)
TX_profit = cpq_TX.groupby(cpq_TX['Building ID'])['X36 NPV'].sum().reset_index()
TX_profit.sort_values(by='Building ID', inplace=True)
GA_profit = cpq_GA.groupby(cpq_GA['Building ID'])['X36 NPV'].sum().reset_index()
GA_profit.sort_values(by='Building ID', inplace=True)

### For each building, get the total number of accounts associated with the building

In [16]:
CO_accounts = cpq_CO.groupby(cpq_CO['Building ID'])['Account ID'].count().reset_index()
CO_accounts.sort_values(by='Building ID', inplace=True)
TX_accounts = cpq_TX.groupby(cpq_TX['Building ID'])['Account ID'].count().reset_index()
TX_accounts.sort_values(by='Building ID', inplace=True)
GA_accounts = cpq_GA.groupby(cpq_GA['Building ID'])['Account ID'].count().reset_index()
GA_accounts.sort_values(by='Building ID', inplace=True)

In [17]:
CO_accounts.head()

Unnamed: 0,Building ID,Account ID
0,Bldg-059940,2
1,Bldg-068378,1
2,Bldg-074834,1
3,Bldg-081662,4
4,Bldg-082112,3


### Get the estimated build cost for each state

In [18]:
build_cost_CO = cpq_CO.groupby(['Building ID',' Estimated Build Cost ']).size().reset_index().rename(columns={0:'count'})
build_cost_TX = cpq_TX.groupby(['Building ID',' Estimated Build Cost ']).size().reset_index().rename(columns={0:'count'})
build_cost_GA = cpq_GA.groupby(['Building ID',' Estimated Build Cost ']).size().reset_index().rename(columns={0:'count'})

### Total profit = Profit incurred by each building - Estimated build cost

In [19]:
CO_profit['Estimated Build Cost'] = build_cost_CO[' Estimated Build Cost ']
CO_profit['Profit Including Build Cost'] = CO_profit['X36 NPV'] - CO_profit['Estimated Build Cost']
CO_profit['Number of Accounts'] = CO_accounts['Account ID']

In [20]:
TX_profit['Estimated Build Cost'] = build_cost_TX[' Estimated Build Cost ']
TX_profit['Profit Including Build Cost'] = TX_profit['X36 NPV'] - TX_profit['Estimated Build Cost']
TX_profit['Number of Accounts'] = TX_accounts['Account ID']

In [21]:
GA_profit['Estimated Build Cost'] = build_cost_GA[' Estimated Build Cost ']
GA_profit['Profit Including Build Cost'] = GA_profit['X36 NPV'] - GA_profit['Estimated Build Cost']
GA_profit['Number of Accounts'] = GA_accounts['Account ID']

In [22]:
CO_profit.sort_values(by='Profit Including Build Cost', ascending=False, inplace=True)
TX_profit.sort_values(by='Profit Including Build Cost', ascending=False, inplace=True)
GA_profit.sort_values(by='Profit Including Build Cost', ascending=False, inplace=True)

In [23]:
CO_profit.head()

Unnamed: 0,Building ID,X36 NPV,Estimated Build Cost,Profit Including Build Cost,Number of Accounts
160,Bldg-430143,480159.37,35981.5,444177.87,4
253,Bldg-519433,357294.76,14366.5,342928.26,1
86,Bldg-158872,319506.62,22971.4,296535.22,2
81,Bldg-158866,334458.16,49844.5,284613.66,3
128,Bldg-266860,242771.91,18408.4,224363.51,5


In [24]:
TX_profit.head()

Unnamed: 0,Building ID,X36 NPV,Estimated Build Cost,Profit Including Build Cost,Number of Accounts
114,Bldg-386004,392308.73,62026.25,330282.48,7
21,Bldg-118398,277511.02,25200.6,252310.42,1
94,Bldg-293930,254597.79,19360.2,235237.59,3
58,Bldg-139825,262124.91,29809.8,232315.11,16
105,Bldg-366026,260224.38,37675.0,222549.38,1


In [25]:
GA_profit.head()

Unnamed: 0,Building ID,X36 NPV,Estimated Build Cost,Profit Including Build Cost,Number of Accounts
57,Bldg-115684,632439.0,24570.7,607868.3,18
100,Bldg-233683,508875.91,69739.12,439136.79,9
33,Bldg-070188,426020.76,67006.72,359014.04,3
32,Bldg-068834,386640.77,65354.08,321286.69,9
54,Bldg-115607,330542.8,18107.8,312435.0,1


### Merge the profits dataframe with the original datafrae

In [26]:
co_buildings_latlong = pd.merge(CO_profit, cpq_status, on='Building ID', how='inner')
tx_buildings_latlong = pd.merge(TX_profit, cpq_status, on='Building ID', how='inner')
ga_buildings_latlong = pd.merge(GA_profit, cpq_status, on='Building ID', how='inner')

In [27]:
co_buildings_latlong.head()

Unnamed: 0,Building ID,X36 NPV_x,Estimated Build Cost,Profit Including Build Cost,Number of Accounts,CPQ ID,Account ID,CreatedDate,Product Group,X36 MRC,...,City,State,Postal Code,Network Proximity,On Zayo Network Status,Latitude,Longitude,Net Classification,Type,Estimated Build Cost.1
0,Bldg-430143,480159.37,35981.5,444177.87,4,CPQ-012244,Acct-000397,2016-09-08,Dark Fiber - Metro,7507.43,...,Denver,CO,80202,786.05,Not on Zayo Network,39.753224,-105.000157,On-Net,Office - Multi Tenant,35981.5
1,Bldg-430143,480159.37,35981.5,444177.87,4,CPQ-012406,Acct-000350,2016-09-13,Dark Fiber - Metro,7507.43,...,Denver,CO,80202,786.05,Not on Zayo Network,39.753224,-105.000157,On-Net,Office - Multi Tenant,35981.5
2,Bldg-430143,480159.37,35981.5,444177.87,4,CPQ-005682,Acct-000169,2017-01-05,Dark Fiber - Metro,7507.43,...,Denver,CO,80202,786.05,Not on Zayo Network,39.753224,-105.000157,On-Net,Office - Multi Tenant,35981.5
3,Bldg-430143,480159.37,35981.5,444177.87,4,CPQ-005767,Acct-000274,2017-01-06,Dark Fiber - Metro,7507.43,...,Denver,CO,80202,786.05,Not on Zayo Network,39.753224,-105.000157,On-Net,Office - Multi Tenant,35981.5
4,Bldg-519433,357294.76,14366.5,342928.26,1,CPQ-006575,Acct-000397,2017-02-28,Dark Fiber - Metro,15901.9,...,Denver,CO,80223,43.7,Not on Zayo Network,39.857676,-104.583343,Fiber Only,Splice Point,14366.5


### Generate the profits csv by combining the state dataframes. This csv would be used to generate the data table for the first visualization

In [28]:
co_buildings_latlong = co_buildings_latlong[['Building ID', 'X36 NPV_x', 'Estimated Build Cost', 'Profit Including Build Cost',
                                            'Latitude', 'Longitude', 'State', 'Number of Accounts', 'Street Address',
                                            'Postal Code', 'Net Classification', 'Type']]
co_buildings_latlong.drop_duplicates(['Building ID'], inplace=True)
tx_buildings_latlong = tx_buildings_latlong[['Building ID', 'X36 NPV_x', 'Estimated Build Cost', 'Profit Including Build Cost',
                                            'Latitude', 'Longitude', 'State', 'Number of Accounts', 'Street Address',
                                            'Postal Code', 'Net Classification', 'Type']]
tx_buildings_latlong.drop_duplicates(['Building ID'], inplace=True)
ga_buildings_latlong = ga_buildings_latlong[['Building ID', 'X36 NPV_x', 'Estimated Build Cost', 'Profit Including Build Cost',
                                            'Latitude', 'Longitude', 'State', 'Number of Accounts', 'Street Address',
                                            'Postal Code', 'Net Classification', 'Type']]
ga_buildings_latlong.drop_duplicates(['Building ID'], inplace=True)

In [29]:
profits = pd.concat([co_buildings_latlong, tx_buildings_latlong, ga_buildings_latlong])

In [30]:
profits.to_csv('profits.csv')

### Identify the total number of buildings for each state and write that to a csv

In [31]:
co_sum = CO_profit['Profit Including Build Cost'].sum()
buildings_co = len(CO_profit)

In [32]:
tx_sum = TX_profit['Profit Including Build Cost'].sum()
buildings_tx = len(TX_profit)

In [33]:
ga_sum = GA_profit['Profit Including Build Cost'].sum()
buildings_ga = len(GA_profit)

In [34]:
import csv
vals = [['CO', co_sum, buildings_co], ['TX', tx_sum, buildings_tx], ['GA', ga_sum, buildings_ga]]

with open('profit_by_state.csv','w') as f:
    w = csv.writer(f)
    w.writerow(['State','Total Profit', 'Number of Buildings'])
    for v in vals:
        w.writerow(v)

### getting a glimpse of the data in the opportunities file

In [35]:
opportunities.head(3)

Unnamed: 0,Opportunity ID,Account ID,StageName,IsClosed,IsWon,CreatedDate,Term in Months,Service,Opportunity Type,Product Group,Building ID,Market,Street Address,City,State,Postal Code,Network Proximity,On Zayo Network Status
0,Opp-000001,Acct-000007,Closed - Lost,True,False,7/1/16,12.0,,New Service,Ethernet,Bldg-012582,Atlanta,56 Marietta St NW,Atlanta,GA,30303,66.45,On Zayo Network
1,Opp-000002,Acct-000986,5 - Accepted,True,True,7/1/16,60.0,,New Service,Dark Fiber - Metro,Bldg-016855,Atlanta,300 Satellite Blvd NW,Suwanee,GA,30024,374.79,On Zayo Network
2,Opp-000002,Acct-000986,5 - Accepted,True,True,7/1/16,60.0,,New Service,Dark Fiber - Metro,Bldg-109003,Atlanta,305 Satellite Blvd NW,Suwanee,GA,30024,250.74,On Zayo Network


### isolating opportunities that haven't been 'Closed - Lost'

In [36]:
opportunities_not_lost = opportunities[opportunities['StageName'] != 'Closed - Lost'].groupby('Account ID')['Building ID'].count().reset_index()

### count the number of buildings on every account grouped by network status

In [37]:
count_bldg_opps = opportunities.groupby(['Account ID','On Zayo Network Status'])['Building ID'].count()
count_bldg_opps = count_bldg_opps.reset_index()

In [38]:
count_bldg_opps.head()

Unnamed: 0,Account ID,On Zayo Network Status,Building ID
0,Acct-000001,Not on Zayo Network,2
1,Acct-000001,On Zayo Network,2
2,Acct-000002,Build in Progress,1
3,Acct-000002,Not on Zayo Network,7
4,Acct-000002,On Zayo Network,68


### converting Total BRR from strings to floats

In [39]:
accounts[' Total BRR '] = accounts[' Total BRR '].map(lambda tbrr: float(tbrr.split('$')[1]
                                                                         .replace(',','')
                                                                         .replace('-','0.0')))

accounts[' Total BRR '] = accounts[' Total BRR '].astype(float)

In [40]:
accounts.head()

Unnamed: 0,Account ID,Industry,Vertical,Total BRR,AnnualRevenue,NumberOfEmployees,DandB Revenue,DandB Total Employees
0,Acct-000273,Telecommunications,Wireless,6084906.0,"$248,747,365.62",397,"$356,943,618.45",397
1,Acct-000013,Telecommunications,National Carrier/ISP,5117529.0,"$10,779,768,901.96",40000,"$10,779,768,901.96",40000
2,Acct-000036,Telecommunications,Wireless,5105015.0,"$9,277,969,886.09",34518,"$19,094,310,655.17",34518
3,Acct-000025,Telecommunications,National Carrier/ISP,4234871.0,"$6,248,394,000.00",10040,"$7,587,138,000.00",12500
4,Acct-000004,Telecommunications,Wireless,3510947.0,"$29,669,960,000.00",30000,"$32,588,090,000.00",30000


### isolating accounts having Total BRR >= 500,00 (high-revenue accounts) and On Zayo Network

In [41]:
high_rev_acc = accounts[accounts[' Total BRR '] >= 500000]
high_rev_acc_opps = pd.merge(high_rev_acc, count_bldg_opps, on=['Account ID'], how='inner')
high_rev_acc_opps_net = pd.DataFrame(high_rev_acc_opps[high_rev_acc_opps['On Zayo Network Status'] == 'On Zayo Network'])

In [42]:
high_rev_acc_opps_net.drop([' DandB Revenue ', 'DandB Total Employees'], axis=1, inplace=True)

In [43]:
high_rev_acc_opps_net.columns

Index(['Account ID', 'Industry', 'Vertical', ' Total BRR ', ' AnnualRevenue ',
       'NumberOfEmployees', 'On Zayo Network Status', 'Building ID'],
      dtype='object')

In [44]:
high_rev_acc_opps_net.rename(columns={'Building ID': 'Total Buildings'}, inplace=True)

### sorting by number of on-net buildings

In [45]:
high_rev_acc_opps_net.sort_values(by='Total Buildings', ascending=False, inplace=True)

In [46]:
high_rev_acc_opps_net.head()

Unnamed: 0,Account ID,Industry,Vertical,Total BRR,AnnualRevenue,NumberOfEmployees,On Zayo Network Status,Total Buildings
20,Acct-000009,Telecommunications,National Carrier/ISP,1915889.0,"$5,374,799,000.00",12626,On Zayo Network,81
5,Acct-000013,Telecommunications,National Carrier/ISP,5117529.0,"$10,779,768,901.96",40000,On Zayo Network,74
15,Acct-000002,Telecommunications,National Carrier/ISP,3344939.0,"$25,378,360,491.80",43350,On Zayo Network,68
26,Acct-000072,Telecommunications,National Carrier/ISP,1307326.0,"$109,048,721.12",431,On Zayo Network,48
17,Acct-000053,Telecommunications,National Carrier/ISP,2950069.0,"$16,503,800,000.00",43000,On Zayo Network,45


In [47]:
high_rev_acc_opps_net.shape

(30, 8)

### saving to csv for use in visualization

In [48]:
saveToFile = os.path.join(PROCESSED_PATH, 'Opportunities_with_Current_High_Revenue_Accounts_On_Net.csv')
high_rev_acc_opps_net.to_csv(saveToFile, index = False)

### isolating accounts with no BRR (potential accounts)

In [49]:
zero_rev_acc = accounts[accounts[' Total BRR '] == 0]
zero_rev_acc_opps = pd.merge(zero_rev_acc, opportunities_not_lost,\
                                    on=['Account ID'],\
                                    how='inner').sort_values(by='Building ID', ascending=False)

In [50]:
zero_rev_acc_opps.columns

Index(['Account ID', 'Industry', 'Vertical', ' Total BRR ', ' AnnualRevenue ',
       'NumberOfEmployees', ' DandB Revenue ', 'DandB Total Employees',
       'Building ID'],
      dtype='object')

In [51]:
zero_rev_acc_opps.drop(labels=[' Total BRR ', ' AnnualRevenue ',
       'NumberOfEmployees', ' DandB Revenue ', 'DandB Total Employees'], axis=1, inplace=True)

In [52]:
zero_rev_acc_opps.rename(columns={'Building ID': 'Total Buildings'}, inplace=True)

### sorting by number of on-net buildings

In [53]:
zero_rev_acc_opps.sort_values(by='Total Buildings', ascending=False, inplace=True)

In [54]:
zero_rev_acc_opps.head()

Unnamed: 0,Account ID,Industry,Vertical,Total Buildings
71,Acct-000245,Telecommunications,Reseller,180
156,Acct-001289,Research & Education,K-12 Education,100
83,Acct-001241,Research & Education,K-12 Education,78
155,Acct-001285,Research & Education,K-12 Education,52
154,Acct-001284,Research & Education,K-12 Education,41


### saving to csv for use in visualization

In [55]:
saveToFile = os.path.join(PROCESSED_PATH, 'Opportunities_with_Potential_Accounts.csv')
zero_rev_acc_opps.to_csv(saveToFile, index = False)

### isolating on-net sites

In [56]:
sites_on_net = sites[sites['On Zayo Network Status'] != 'Not on Zayo Network'].groupby(['Account ID'])['Building ID'].count().reset_index().sort_values(by='Building ID', ascending=False)

In [57]:
sites_on_net.head()

Unnamed: 0,Account ID,Building ID
174,Acct-000273,970
7,Acct-000008,704
3,Acct-000004,446
77,Acct-000122,367
420,Acct-000700,321


### isolating sites not on net

In [58]:
sites_no_net = sites[sites['On Zayo Network Status'] == 'Not on Zayo Network'].groupby(['Account ID'])['Building ID'].count().reset_index().sort_values(by='Building ID', ascending=False)

In [59]:
sites_no_net.head()

Unnamed: 0,Account ID,Building ID
139,Acct-000700,2188
249,Acct-001366,1463
145,Acct-000730,643
0,Acct-000002,588
13,Acct-000036,587


### renaming columns to something more meaningful

In [60]:
sites_on_net.rename(columns={'Building ID': '# Buildings on Net'}, inplace=True)
sites_no_net.rename(columns={'Building ID': '# Buildings not on Net'}, inplace=True)

### getting the total number of on-net and off-net buildings associated with every account

In [61]:
potential_accounts_buildings_info = (pd.merge(sites_on_net, sites_no_net,\
         on=['Account ID'],\
         how='outer').sort_values(by='# Buildings on Net', ascending=False)).fillna(0)

potential_accounts_buildings_info['# Buildings not on Net'] = potential_accounts_buildings_info['# Buildings not on Net'].astype(int)

### extending accounts.csv to have 2 more columns: number of buildings on-net and number off-net

In [62]:
potential_accounts_buildings_info_tbrr = pd.merge(accounts, potential_accounts_buildings_info,\
         on=['Account ID'],\
         how='inner')

### isolating only those accounts with Total BRR > 0 and dropping unwanted columns

In [63]:
potential_accounts_buildings_info_tbrr = pd.DataFrame(potential_accounts_buildings_info_tbrr\
                                                      [potential_accounts_buildings_info_tbrr[' Total BRR '] > 0])

potential_accounts_buildings_info_tbrr.drop([' AnnualRevenue ','NumberOfEmployees',\
                                             ' DandB Revenue ', 'DandB Total Employees'], axis=1, inplace=True)

In [64]:
potential_accounts_buildings_info_tbrr.head()

Unnamed: 0,Account ID,Industry,Vertical,Total BRR,# Buildings on Net,# Buildings not on Net
0,Acct-000273,Telecommunications,Wireless,6084906.0,970.0,45
1,Acct-000013,Telecommunications,National Carrier/ISP,5117529.0,188.0,17
2,Acct-000036,Telecommunications,Wireless,5105015.0,75.0,587
3,Acct-000025,Telecommunications,National Carrier/ISP,4234871.0,228.0,32
4,Acct-000004,Telecommunications,Wireless,3510947.0,446.0,213


### sorting by number of on-net buildings

In [65]:
potential_accounts_buildings_info_tbrr_temp = potential_accounts_buildings_info_tbrr[potential_accounts_buildings_info_tbrr['# Buildings not on Net'] > 0] 
potential_accounts_buildings_info_tbrr_temp.sort_values(by='# Buildings on Net', ascending=False, inplace=True)

### saving to csv for use in visualization

In [66]:
saveToFile = os.path.join(PROCESSED_PATH, 'Current_Accounts_Untapped_Buildings_On_Net.csv')
potential_accounts_buildings_info_tbrr_temp.to_csv(saveToFile, index = False)