# Importing necessary dataset

In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import matplotlib.patches as patches

## Importing all 15 csv files

In [2]:
file_path = r'G:\DataScience\Data Insight\NAICS\csv_files'
all_csv_files = glob.glob(file_path + '/*.csv')

csv_list = []

for csv_file in all_csv_files:
    combined_dataframe = pd.read_csv(csv_file, index_col = None, header = 0)
    csv_list.append(combined_dataframe)
    
data = pd.concat(csv_list, axis = 0, ignore_index = True)
data

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
0,2000,1,Accommodation and food services [72],148000
1,2000,1,"Administrative and support, waste management a...",59250
2,2000,1,"Agriculture, forestry, fishing and hunting [11]",61750
3,2000,1,"Arts, entertainment and recreation [71]",39500
4,2000,1,Construction [23],106250
...,...,...,...,...
119179,1999,12,9111,2250
119180,1999,12,9120,28500
119181,1999,12,9130,30250
119182,1999,12,9141,500


In [3]:
data.NAICS.value_counts()

Social assistance[624]                      276
3344                                        276
3346                                        276
3351                                        276
3352                                        276
                                           ... 
1131                                        180
3161                                        168
5612                                        108
5211                                         36
Monetary authorities - central bank[521]     36
Name: NAICS, Length: 437, dtype: int64

In [4]:
#data[['SYEAR', 'SMTH']] = data[['SYEAR', 'SMTH']].astype(object) 
#print(data.dtypes) 

## Getting *Postal service, couriers and messengers* from Postal service[491] & Couriers and messengers[492]

In [5]:
Couriers_and_messengers = data.loc[data['NAICS'].str.contains('492', na = False)]
Couriers_and_messengers

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5489,2000,1,Couriers and messengers[492],10000
5591,2000,2,Couriers and messengers[492],9250
5693,2000,3,Couriers and messengers[492],9750
5795,2000,4,Couriers and messengers[492],7250
5897,2000,5,Couriers and messengers[492],7250
...,...,...,...,...
33114,1999,8,Couriers and messengers[492],9500
33217,1999,9,Couriers and messengers[492],11000
33320,1999,10,Couriers and messengers[492],12750
33423,1999,11,Couriers and messengers[492],10000


In [6]:
Postal_service = data.loc[data['NAICS'].str.contains('491', na = False)]
Postal_service

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5543,2000,1,Postal service[491],9750
5645,2000,2,Postal service[491],8750
5747,2000,3,Postal service[491],8250
5849,2000,4,Postal service[491],7500
5951,2000,5,Postal service[491],9750
...,...,...,...,...
33169,1999,8,Postal service[491],6750
33272,1999,9,Postal service[491],7500
33375,1999,10,Postal service[491],8000
33478,1999,11,Postal service[491],6750


In [7]:
Postal_service_couriers_and_messengers = pd.concat([Postal_service, Couriers_and_messengers])
Postal_service_couriers_and_messengers

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5543,2000,1,Postal service[491],9750
5645,2000,2,Postal service[491],8750
5747,2000,3,Postal service[491],8250
5849,2000,4,Postal service[491],7500
5951,2000,5,Postal service[491],9750
...,...,...,...,...
33114,1999,8,Couriers and messengers[492],9500
33217,1999,9,Couriers and messengers[492],11000
33320,1999,10,Couriers and messengers[492],12750
33423,1999,11,Couriers and messengers[492],10000


### Changing all names of the NACIS column to *Postal service, couriers and messengers*

After changing all column names to *Postal service, couriers and messengers*, the dataframe was sorted by the **SYEAR and SMTH** columns. Then, the *Employment* column was segregated to add up consecutives even rows to get the sum of empoyments from both [491] and [492].

After that, the dataframe was merged back and the columns names were changed to reflect *Data Output Template*.

In [8]:
Postal_service_couriers_and_messengers = Postal_service_couriers_and_messengers.replace(
    'Postal service[491]', 'Postal service, couriers and messengers')
Postal_service_couriers_and_messengers = Postal_service_couriers_and_messengers.replace(
    'Couriers and messengers[492]', 'Postal service, couriers and messengers')
Postal_service_couriers_and_messengers

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5543,2000,1,"Postal service, couriers and messengers",9750
5645,2000,2,"Postal service, couriers and messengers",8750
5747,2000,3,"Postal service, couriers and messengers",8250
5849,2000,4,"Postal service, couriers and messengers",7500
5951,2000,5,"Postal service, couriers and messengers",9750
...,...,...,...,...
33114,1999,8,"Postal service, couriers and messengers",9500
33217,1999,9,"Postal service, couriers and messengers",11000
33320,1999,10,"Postal service, couriers and messengers",12750
33423,1999,11,"Postal service, couriers and messengers",10000


In [9]:
df1 = Postal_service_couriers_and_messengers.sort_values(['SYEAR', 'SMTH'])
df1.head(50)

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
29976,1997,1,"Postal service, couriers and messengers",8000
29921,1997,1,"Postal service, couriers and messengers",8250
30079,1997,2,"Postal service, couriers and messengers",8000
30024,1997,2,"Postal service, couriers and messengers",9000
30182,1997,3,"Postal service, couriers and messengers",7750
30127,1997,3,"Postal service, couriers and messengers",7500
30285,1997,4,"Postal service, couriers and messengers",8500
30230,1997,4,"Postal service, couriers and messengers",6000
30388,1997,5,"Postal service, couriers and messengers",7500
30333,1997,5,"Postal service, couriers and messengers",7500


In [10]:
df = df1.drop(['SYEAR', 'SMTH', 'NAICS'], axis = 1)
df

Unnamed: 0,_EMPLOYMENT_
29976,8000
29921,8250
30079,8000
30024,9000
30182,7750
...,...
29618,0
29772,0
29719,0
29873,0


In [11]:
df.index = np.arange(1,len(df)+1)
df = df.reset_index()
df

Unnamed: 0,index,_EMPLOYMENT_
0,1,8000
1,2,8250
2,3,8000
3,4,9000
4,5,7750
...,...,...
547,548,0
548,549,0
549,550,0
550,551,0


In [12]:
df = df.set_index('index')
df_odd = df.loc[df.index.values % 2 == 1]
df_even = df.loc[df.index.values % 2 == 0]
df_even = df_even.set_index(df_even.index.values - 1)
new = df_odd.add(df_even, fill_value = 0)
new = new.reset_index().reset_index()
new

Unnamed: 0,level_0,index,_EMPLOYMENT_
0,0,1,16250
1,1,3,17000
2,2,5,15250
3,3,7,14500
4,4,9,15000
...,...,...,...
271,271,543,21250
272,272,545,19000
273,273,547,0
274,274,549,0


In [13]:
df2 = df1[np.arange(len(df)) % 2 == 0]
df2 = df2.reset_index().reset_index()
df2

Unnamed: 0,level_0,index,SYEAR,SMTH,NAICS,_EMPLOYMENT_
0,0,29976,1997,1,"Postal service, couriers and messengers",8000
1,1,30079,1997,2,"Postal service, couriers and messengers",8000
2,2,30182,1997,3,"Postal service, couriers and messengers",7750
3,3,30285,1997,4,"Postal service, couriers and messengers",8500
4,4,30388,1997,5,"Postal service, couriers and messengers",7500
...,...,...,...,...,...,...
271,271,29469,2019,8,"Postal service, couriers and messengers",9750
272,272,29570,2019,9,"Postal service, couriers and messengers",9750
273,273,29671,2019,10,"Postal service, couriers and messengers",0
274,274,29772,2019,11,"Postal service, couriers and messengers",0


In [14]:
df3 = pd.merge(df2, new, how = 'inner', on = 'level_0')
df3 = df3.drop(['level_0', 'index_x', '_EMPLOYMENT__x', 'index_y'], axis = 1)
df3.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Postal_service_couriers_and_messengers   = df3

Postal_service_couriers_and_messengers = Postal_service_couriers_and_messengers[(Postal_service_couriers_and_messengers.SYEAR != 2018) & 
                                                 (Postal_service_couriers_and_messengers.SYEAR != 2019)]

Postal_service_couriers_and_messengers   

Unnamed: 0,SYEAR,SMTH,LMO_Detailed_Industry,Employment
0,1997,1,"Postal service, couriers and messengers",16250
1,1997,2,"Postal service, couriers and messengers",17000
2,1997,3,"Postal service, couriers and messengers",15250
3,1997,4,"Postal service, couriers and messengers",14500
4,1997,5,"Postal service, couriers and messengers",15000
...,...,...,...,...
247,2017,8,"Postal service, couriers and messengers",15250
248,2017,9,"Postal service, couriers and messengers",10000
249,2017,10,"Postal service, couriers and messengers",13500
250,2017,11,"Postal service, couriers and messengers",12250


## Getting *Farms* from Crop production[111] & Animal production and aquaculture[112]

In [15]:
Crop_Production = data.loc[data['NAICS'].str.contains('111', na = False)]
Crop_Production

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5491,2000,1,Crop production[111],11250
5593,2000,2,Crop production[111],10250
5695,2000,3,Crop production[111],13500
5797,2000,4,Crop production[111],17000
5899,2000,5,Crop production[111],21000
...,...,...,...,...
33116,1999,8,Crop production[111],20250
33219,1999,9,Crop production[111],19500
33322,1999,10,Crop production[111],17000
33425,1999,11,Crop production[111],13000


In [16]:
Animal_production_and_aquaculture = data.loc[data['NAICS'].str.contains('112', na = False)]
Animal_production_and_aquaculture

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5478,2000,1,Animal production and aquaculture[112],12250
5580,2000,2,Animal production and aquaculture[112],12000
5682,2000,3,Animal production and aquaculture[112],10500
5784,2000,4,Animal production and aquaculture[112],7500
5886,2000,5,Animal production and aquaculture[112],8500
...,...,...,...,...
33103,1999,8,Animal production and aquaculture[112],11500
33206,1999,9,Animal production and aquaculture[112],11500
33309,1999,10,Animal production and aquaculture[112],11500
33412,1999,11,Animal production and aquaculture[112],14000


In [17]:
Farms = pd.concat([Crop_Production, Animal_production_and_aquaculture])
Farms

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5491,2000,1,Crop production[111],11250
5593,2000,2,Crop production[111],10250
5695,2000,3,Crop production[111],13500
5797,2000,4,Crop production[111],17000
5899,2000,5,Crop production[111],21000
...,...,...,...,...
33103,1999,8,Animal production and aquaculture[112],11500
33206,1999,9,Animal production and aquaculture[112],11500
33309,1999,10,Animal production and aquaculture[112],11500
33412,1999,11,Animal production and aquaculture[112],14000


### Changing all names of the NACIS column to *Farms*

After changing all column names to *Farms*, the dataframe was sorted by the **SYEAR and SMTH** columns. Then, the *Employment* column was segregated to add up consecutives even rows to get the sum of empoyments from both [111] and [112].

After that, the dataframe was merged back and the columns names were changed to reflect *Data Output Template*.

In [18]:
Farms = Farms.replace('Crop production[111]', 'Farms')
Farms = Farms.replace('Animal production and aquaculture[112]', 'Farms')
Farms

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
5491,2000,1,Farms,11250
5593,2000,2,Farms,10250
5695,2000,3,Farms,13500
5797,2000,4,Farms,17000
5899,2000,5,Farms,21000
...,...,...,...,...
33103,1999,8,Farms,11500
33206,1999,9,Farms,11500
33309,1999,10,Farms,11500
33412,1999,11,Farms,14000


In [19]:
df1 = Farms.sort_values(['SYEAR', 'SMTH'])
df1.head(50)

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
29923,1997,1,Farms,8750
29910,1997,1,Farms,13500
30026,1997,2,Farms,9750
30013,1997,2,Farms,14000
30129,1997,3,Farms,10500
30116,1997,3,Farms,13250
30232,1997,4,Farms,10500
30219,1997,4,Farms,15000
30335,1997,5,Farms,12000
30322,1997,5,Farms,15500


### Executing all aforementioned steps in one cell 

In [21]:
df = df1.drop(['SYEAR', 'SMTH', 'NAICS'], axis = 1)
df

df.index = np.arange(1,len(df)+1)
df = df.reset_index()
df

df = df.set_index('index')
df_odd = df.loc[df.index.values % 2 == 1]
df_even = df.loc[df.index.values % 2 == 0]
df_even = df_even.set_index(df_even.index.values - 1)
new = df_odd.add(df_even, fill_value = 0)
new = new.reset_index().reset_index()
new

df2 = df1[np.arange(len(df)) % 2 == 0]
df2 = df2.reset_index().reset_index()
df2

df3 = pd.merge(df2, new, how = 'inner', on = 'level_0')
df3 = df3.drop(['level_0', 'index_x', '_EMPLOYMENT__x', 'index_y'], axis = 1)
df3.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Farms = df3

Farms = Farms[(Farms.SYEAR != 2018) & (Farms.SYEAR != 2019)]

Farms

Unnamed: 0,SYEAR,SMTH,LMO_Detailed_Industry,Employment
0,1997,1,Farms,22250
1,1997,2,Farms,23750
2,1997,3,Farms,23750
3,1997,4,Farms,25500
4,1997,5,Farms,27500
...,...,...,...,...
247,2017,8,Farms,25500
248,2017,9,Farms,19750
249,2017,10,Farms,20750
250,2017,11,Farms,19500


## Getting *Food, beverage and tobacco manufacturing* from Food manufacturing[311] & Beverage and tobacco product manufacturing[312]

In [None]:
Food_manufacturing = data.loc[data['NAICS'].str.contains('311', na = False)]
Food_manufacturing

In [None]:
Beverage_and_tobacco_product_manufacturing = data.loc[data['NAICS'].str.contains('312', na = False)]
Beverage_and_tobacco_product_manufacturing

In [None]:
Food_beverage_and_tobacco_manufacturing = pd.concat([Food_manufacturing, Beverage_and_tobacco_product_manufacturing])
Food_beverage_and_tobacco_manufacturing

### Changing all names of the NACIS column to *Food, beverage and tobacco manufacturing*

After changing all column names to *Food, beverage and tobacco manufacturing*, the dataframe was sorted by the **SYEAR and SMTH** columns. Then, the *Employment* column was segregated to add up consecutives even rows to get the sum of empoyments from both [311] and [312].

After that, the dataframe was merged back and the columns names were changed to reflect *Data Output Template*.

In [None]:
Food_beverage_and_tobacco_manufacturing = Food_beverage_and_tobacco_manufacturing.replace(
    'Food manufacturing[311]', 'Food, beverage and tobacco manufacturing')
Food_beverage_and_tobacco_manufacturing = Food_beverage_and_tobacco_manufacturing.replace(
    'Beverage and tobacco product manufacturing[312]', 'Food, beverage and tobacco manufacturing')
Food_beverage_and_tobacco_manufacturing

In [None]:
df1 = Food_beverage_and_tobacco_manufacturing.sort_values(['SYEAR', 'SMTH'])
df1.tail(50)

### Executing all aforementioned steps in one cell 

In [23]:
df = df1.drop(['SYEAR', 'SMTH', 'NAICS'], axis = 1)
df

df.index = np.arange(1,len(df)+1)
df = df.reset_index()
df

df = df.set_index('index')
df_odd = df.loc[df.index.values % 2 == 1]
df_even = df.loc[df.index.values % 2 == 0]
df_even = df_even.set_index(df_even.index.values - 1)
new = df_odd.add(df_even, fill_value = 0)
new = new.reset_index().reset_index()
new

df2 = df1[np.arange(len(df)) % 2 == 0]
df2 = df2.reset_index().reset_index()
df2

df3 = pd.merge(df2, new, how = 'inner', on = 'level_0')
df3 = df3.drop(['level_0', 'index_x', '_EMPLOYMENT__x', 'index_y'], axis = 1)
df3.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Food_beverage_and_tobacco_manufacturing = df3

Food_beverage_and_tobacco_manufacturing =Food_beverage_and_tobacco_manufacturing[
    (Food_beverage_and_tobacco_manufacturing.SYEAR != 2018) & (Food_beverage_and_tobacco_manufacturing.SYEAR != 2019)]

Food_beverage_and_tobacco_manufacturing

Unnamed: 0,SYEAR,SMTH,LMO_Detailed_Industry,Employment
0,1997,1,Farms,22250
1,1997,2,Farms,23750
2,1997,3,Farms,23750
3,1997,4,Farms,25500
4,1997,5,Farms,27500
...,...,...,...,...
247,2017,8,Farms,25500
248,2017,9,Farms,19750
249,2017,10,Farms,20750
250,2017,11,Farms,19500


## Getting  *Business, building and other support services*  from Management of companies and enterprises[55] & Administrative and support, waste management and remediation services[56]

The whole process was repeated to obatin the *Business, building and other support services*. After changing all column names to *Business, building and other support services*, the dataframe was sorted by the **SYEAR and SMTH** columns. Then, the *Employment* column was segregated to add up consecutives even rows to get the sum of empoyments from both [55] and [56].

After that, the dataframe was merged back and the columns names were changed to reflect *Data Output Template*.

In [26]:
MCE = data[data['NAICS'] == 'Management of companies and enterprises [55]']
MCE

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
9,2000,1,Management of companies and enterprises [55],1000
29,2000,2,Management of companies and enterprises [55],1500
49,2000,3,Management of companies and enterprises [55],1500
69,2000,4,Management of companies and enterprises [55],1000
89,2000,5,Management of companies and enterprises [55],1000
...,...,...,...,...
5381,1999,8,Management of companies and enterprises [55],500
5401,1999,9,Management of companies and enterprises [55],250
5421,1999,10,Management of companies and enterprises [55],1250
5441,1999,11,Management of companies and enterprises [55],1500


In [27]:
MCE.SYEAR.value_counts().sort_values()

2015    12
2012    12
2011    12
2010    12
2009    12
2008    12
2007    12
2014    12
2013    12
2006    12
2003    12
2002    12
2001    12
2000    12
1999    12
1998    12
2005    12
2004    12
1997    12
Name: SYEAR, dtype: int64

As we can see, **Management of companies and enterprises** collected data till 2015, as such the *2016-2019* data has to be removed from **Administrative and support, waste management and remediation services** before algorithm can correctly add consecutive even rows to get the required sum. After getting the correct sum, the *2016-2019* will be added.

In [28]:
AWR = data[data['NAICS'] == 'Administrative and support, waste management and remediation services [56]']
AWR

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
1,2000,1,"Administrative and support, waste management a...",59250
21,2000,2,"Administrative and support, waste management a...",64000
41,2000,3,"Administrative and support, waste management a...",64750
61,2000,4,"Administrative and support, waste management a...",66750
81,2000,5,"Administrative and support, waste management a...",63000
...,...,...,...,...
5373,1999,8,"Administrative and support, waste management a...",68750
5393,1999,9,"Administrative and support, waste management a...",65750
5413,1999,10,"Administrative and support, waste management a...",59000
5433,1999,11,"Administrative and support, waste management a...",62250


In [29]:
AWR_reduced = AWR[(data['SYEAR'] != 2016) & (data['SYEAR'] != 2017) & (data['SYEAR'] != 2018) & (data['SYEAR'] != 2019)]
AWR_reduced

  """Entry point for launching an IPython kernel.


Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
1,2000,1,"Administrative and support, waste management a...",59250
21,2000,2,"Administrative and support, waste management a...",64000
41,2000,3,"Administrative and support, waste management a...",64750
61,2000,4,"Administrative and support, waste management a...",66750
81,2000,5,"Administrative and support, waste management a...",63000
...,...,...,...,...
5373,1999,8,"Administrative and support, waste management a...",68750
5393,1999,9,"Administrative and support, waste management a...",65750
5413,1999,10,"Administrative and support, waste management a...",59000
5433,1999,11,"Administrative and support, waste management a...",62250


In [30]:
AWR_removed_data = AWR[(data['SYEAR'] == 2016) | (data['SYEAR'] == 2017) | (data['SYEAR'] == 2018) | (data['SYEAR'] == 2019)]
AWR_removed_data

  """Entry point for launching an IPython kernel.


Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
3841,2016,1,"Administrative and support, waste management a...",92250
3860,2016,2,"Administrative and support, waste management a...",92250
3879,2016,3,"Administrative and support, waste management a...",92750
3898,2016,4,"Administrative and support, waste management a...",100750
3917,2016,5,"Administrative and support, waste management a...",104750
3936,2016,6,"Administrative and support, waste management a...",113500
3955,2016,7,"Administrative and support, waste management a...",115750
3974,2016,8,"Administrative and support, waste management a...",122250
3993,2016,9,"Administrative and support, waste management a...",114000
4012,2016,10,"Administrative and support, waste management a...",109750


In [31]:
Business_building_and_other_support_services = pd.concat([MCE, AWR_reduced])
Business_building_and_other_support_services

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
9,2000,1,Management of companies and enterprises [55],1000
29,2000,2,Management of companies and enterprises [55],1500
49,2000,3,Management of companies and enterprises [55],1500
69,2000,4,Management of companies and enterprises [55],1000
89,2000,5,Management of companies and enterprises [55],1000
...,...,...,...,...
5373,1999,8,"Administrative and support, waste management a...",68750
5393,1999,9,"Administrative and support, waste management a...",65750
5413,1999,10,"Administrative and support, waste management a...",59000
5433,1999,11,"Administrative and support, waste management a...",62250


In [32]:
Business_building_and_other_support_services = Business_building_and_other_support_services.replace(
    'Management of companies and enterprises [55]', 'Business, building and other support services')
Business_building_and_other_support_services = Business_building_and_other_support_services.replace(
    'Administrative and support, waste management and remediation services [56]', 'Business, building and other support services')
Business_building_and_other_support_services

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
9,2000,1,"Business, building and other support services",1000
29,2000,2,"Business, building and other support services",1500
49,2000,3,"Business, building and other support services",1500
69,2000,4,"Business, building and other support services",1000
89,2000,5,"Business, building and other support services",1000
...,...,...,...,...
5373,1999,8,"Business, building and other support services",68750
5393,1999,9,"Business, building and other support services",65750
5413,1999,10,"Business, building and other support services",59000
5433,1999,11,"Business, building and other support services",62250


In [33]:
df1 = Business_building_and_other_support_services.sort_values(['SYEAR', 'SMTH'])
df1.head(50)

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
4761,1997,1,"Business, building and other support services",1750
4753,1997,1,"Business, building and other support services",58500
4781,1997,2,"Business, building and other support services",500
4773,1997,2,"Business, building and other support services",60250
4801,1997,3,"Business, building and other support services",1000
4793,1997,3,"Business, building and other support services",55500
4821,1997,4,"Business, building and other support services",500
4813,1997,4,"Business, building and other support services",61000
4841,1997,5,"Business, building and other support services",500
4833,1997,5,"Business, building and other support services",60500


### Executing all aforementioned steps in one cell 

In [34]:
df = df1.drop(['SYEAR', 'SMTH', 'NAICS'], axis = 1)
df

df.index = np.arange(1,len(df)+1)
df = df.reset_index()
df

df = df.set_index('index')
df_odd = df.loc[df.index.values % 2 == 1]
df_even = df.loc[df.index.values % 2 == 0]
df_even = df_even.set_index(df_even.index.values - 1)
new = df_odd.add(df_even, fill_value = 0)
new = new.reset_index().reset_index()
new

df2 = df1[np.arange(len(df)) % 2 == 0]
df2 = df2.reset_index().reset_index()
df2

df3 = pd.merge(df2, new, how = 'inner', on = 'level_0')
df3 = df3.drop(['level_0', 'index_x', '_EMPLOYMENT__x', 'index_y'], axis = 1)
df3.columns = ['SYEAR', 'SMTH', 'NAICS', '_EMPLOYMENT_']
Business_building_and_other_support_services = df3

Business_building_and_other_support_services = Business_building_and_other_support_services[
    (Business_building_and_other_support_services.SYEAR != 2018) & (Business_building_and_other_support_services.SYEAR != 2019)]

Business_building_and_other_support_services

Unnamed: 0,SYEAR,SMTH,NAICS,_EMPLOYMENT_
0,1997,1,"Business, building and other support services",60250
1,1997,2,"Business, building and other support services",60750
2,1997,3,"Business, building and other support services",56500
3,1997,4,"Business, building and other support services",61500
4,1997,5,"Business, building and other support services",61000
...,...,...,...,...
223,2015,8,"Business, building and other support services",104750
224,2015,9,"Business, building and other support services",95750
225,2015,10,"Business, building and other support services",95250
226,2015,11,"Business, building and other support services",97000


Adding removed data and renaming the coulmns now

In [25]:
Business_building_and_other_support_services = pd.concat([Business_building_and_other_support_services, AWR_removed_data])
Business_building_and_other_support_services.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Business_building_and_other_support_services

NameError: name 'AWR_removed_data' is not defined

## Getting  *Local and Indigenous public administration*  from Local, municipal and regional public administration[913], Aboriginal public administration[914] & International and other extra-territorial public administration[919]

The whole process was repeated to obatin the *Local and Indigenous public administration*. After changing all column names to *Business, building and other support services*, the dataframe was sorted by the **SYEAR and SMTH** columns. Then, the *Employment* column was segregated to add up three consecutives rows to get the sum of empoyments from [913], [914] and [919].

After that, the dataframe was merged back and the columns names were changed to reflect *Data Output Template*.

In [None]:
LMRPA = data.loc[data['NAICS'].str.contains('913', na = False)]
LMRPA

In [None]:
Aboriginal_public_administration = data.loc[data['NAICS'].str.contains('914', na = False)]
Aboriginal_public_administration

In [None]:
IETPA = data.loc[data['NAICS'].str.contains('919', na = False)]
IETPA

In [None]:
Local_and_Indigenous_public_administration = pd.concat([LMRPA, Aboriginal_public_administration, IETPA])
Local_and_Indigenous_public_administration

In [None]:
Local_and_Indigenous_public_administration = Local_and_Indigenous_public_administration.replace(
    'Local, municipal and regional public administration[913]', 'Local and Indigenous public administration')
Local_and_Indigenous_public_administration = Local_and_Indigenous_public_administration.replace(
    'Aboriginal public administration[914]','Local and Indigenous public administration')
Local_and_Indigenous_public_administration = Local_and_Indigenous_public_administration.replace(
    'International and other extra-territorial public administration[919]', 'Local and Indigenous public administration')
Local_and_Indigenous_public_administration

In [None]:
df1 = Local_and_Indigenous_public_administration.sort_values(['SYEAR', 'SMTH'])
df1.head(50)

### Executing the rest of the aforementioned steps in the subsequent cells

In [None]:
df = df1.drop(['SYEAR', 'SMTH', 'NAICS'], axis = 1)
df

df.index = np.arange(1,len(df)+1)
df = df.reset_index()

df = df.set_index('index')
df_first = df.loc[df.index.values % 3 == 1].reset_index()
df_second = df.loc[df.index.values % 3 == 2].reset_index()
df_third = df.loc[df.index.values % 3 == 0].reset_index()

In [None]:
new  = pd.merge(df_first, df_second, left_index=True, right_index=True)
new  = pd.merge(new, df_third, left_index=True, right_index=True)
new['EMPLOYMENT'] = new['_EMPLOYMENT__x'] + new['_EMPLOYMENT__y'] + new['_EMPLOYMENT_']
new = new['EMPLOYMENT']
new.to_frame()

In [None]:
df2 = df1[np.arange(len(df)) % 3 == 0]
df2 = df2.reset_index().reset_index()
df2

In [None]:
df3 = pd.merge(df2, new, left_index=True, right_index=True)
df3 = df3.drop(['level_0', 'index', '_EMPLOYMENT_'], axis = 1)
df3.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Local_and_Indigenous_public_administration = df3
Local_and_Indigenous_public_administration.head(30)

## Getting  *Broadcasting, data processing, and information*  from 	Broadcasting (except Internet)[515], Data processing, hosting, and related services[518] & International and other Other information services[519]

A new method was applied to obatin the *Broadcasting, data processing, and information*. The three extracted dataframes were merged on their indexes. Then, all the *Employment* columns were added to get the sum of empoyments from [515], [518] and [519]. 

After that, the other columns were removed, the NAICS column name was cahnged to *Broadcasting, data processing, and information* and the columns names were changed to reflect *Data Output Template*.

In [None]:
Broadcasting_exceptInternet = data.loc[data['NAICS'].str.contains('515', na = False)].reset_index()
Broadcasting_exceptInternet

In [None]:
Data_processing_hosting = data.loc[data['NAICS'].str.contains('518', na = False)].reset_index()
Data_processing_hosting

In [None]:
other_information_services = data.loc[data['NAICS'].str.contains('519', na = False)].reset_index()
other_information_services

In [None]:
Broadcasting_data_processing_and_information = pd.merge(
    Broadcasting_exceptInternet, Data_processing_hosting, left_index=True, right_index=True)
Broadcasting_data_processing_and_information = pd.merge(
    Broadcasting_data_processing_and_information, other_information_services, left_index=True, right_index=True)
Broadcasting_data_processing_and_information

In [None]:
Broadcasting_data_processing_and_information = Broadcasting_data_processing_and_information.drop([
    'index_x', 'SYEAR_x', 'SMTH_x', 'NAICS_x', 'index_y', 'SYEAR_y', 'SMTH_y', 'NAICS_y', 'index'], axis =1)
Broadcasting_data_processing_and_information

In [None]:
Broadcasting_data_processing_and_information['EMPLOYMENT'] =(Broadcasting_data_processing_and_information['_EMPLOYMENT__x'] + 
Broadcasting_data_processing_and_information['_EMPLOYMENT__y'] + Broadcasting_data_processing_and_information['_EMPLOYMENT_'])
Broadcasting_data_processing_and_information = Broadcasting_data_processing_and_information.drop(
    ['_EMPLOYMENT__x', '_EMPLOYMENT__y', '_EMPLOYMENT_'], axis = 1)
Broadcasting_data_processing_and_information = Broadcasting_data_processing_and_information.replace(
    'Other information services[519]', 'Broadcasting data processing and information')
Broadcasting_data_processing_and_information.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Broadcasting_data_processing_and_information

## Getting  *Transit, sightseeing, and pipeline transportation*  from Transit and ground passenger transportation[485], Pipeline transportation[486] & Scenic and sightseeing transportation[487]	

The new method was applied to obatin the *Transit, sightseeing, and pipeline transportation*. The three extracted dataframes were merged on their indexes. Then, all the *Employment* columns were added to get the sum of empoyments from [485], [486] and [487]. 

After that, the other columns were removed, the NAICS column name was cahnged to *Transit, sightseeing, and pipeline transportation* and the columns names were changed to reflect *Data Output Template*.

In [None]:
Transit_and_ground_passenger_transportation = data.loc[data['NAICS'].str.contains('485', na = False)].reset_index()
Transit_and_ground_passenger_transportation

In [None]:
Pipeline_transportation = data.loc[data['NAICS'].str.contains('486', na = False)].reset_index()
Pipeline_transportation

In [None]:
Scenic_and_sightseeing_transportation = data.loc[data['NAICS'].str.contains('487', na = False)].reset_index()
Scenic_and_sightseeing_transportation

In [None]:
Transit_sightseeing_and_pipeline_transportation = pd.merge(
    Transit_and_ground_passenger_transportation, Pipeline_transportation, left_index=True, right_index=True)
Transit_sightseeing_and_pipeline_transportation = pd.merge(
    Transit_sightseeing_and_pipeline_transportation, Scenic_and_sightseeing_transportation, left_index=True, right_index=True)
Transit_sightseeing_and_pipeline_transportation

In [None]:
Transit_sightseeing_and_pipeline_transportation = Transit_sightseeing_and_pipeline_transportation.drop([
    'index_x', 'SYEAR_x', 'SMTH_x', 'NAICS_x', 'index_y', 'SYEAR_y', 'SMTH_y', 'NAICS_y', 'index'], axis =1)
Transit_sightseeing_and_pipeline_transportation

In [None]:
Transit_sightseeing_and_pipeline_transportation['EMPLOYMENT'] =(Transit_sightseeing_and_pipeline_transportation['_EMPLOYMENT__x'] + 
Transit_sightseeing_and_pipeline_transportation['_EMPLOYMENT__y'] + Transit_sightseeing_and_pipeline_transportation['_EMPLOYMENT_'])
Transit_sightseeing_and_pipeline_transportation = Transit_sightseeing_and_pipeline_transportation.drop(
    ['_EMPLOYMENT__x', '_EMPLOYMENT__y', '_EMPLOYMENT_'], axis = 1)
Transit_sightseeing_and_pipeline_transportation = Transit_sightseeing_and_pipeline_transportation.replace(
    'Scenic and sightseeing transportation[487]', 'Transit, sightseeing, and pipeline transportation')
Transit_sightseeing_and_pipeline_transportation.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Transit_sightseeing_and_pipeline_transportation

## Getting  *Finance*  from Monetary authorities - central bank[521], Credit intermediation and related activities[522]	, International and Blank[523] & Funds and other financial vehicles[526]	

The new method was applied to obatin the *Finance*. The four extracted dataframes were merged on their indexes. Then, all the *Employment* columns were added to get the sum of empoyments from [521], [522], [523], and [526]. **This time around [523] had no rows as proven below. Also, [521] contains data for only 1997-1999, as such a little data manipulation was done to obtain the right results.**

After that, the other columns were removed, the NAICS column name was cahnged to *Finance* and the columns names were changed to reflect *Data Output Template*.

In [None]:
Monetary_authorities_central_bank = data.loc[data['NAICS'].str.contains('521', na = False)].reset_index()
Monetary_authorities_central_bank

In [None]:
Credit_intermediation_and_related_activities = data.loc[data['NAICS'].str.contains('522', na = False)].reset_index()
Credit_intermediation_and_related_activities

In [None]:
blank_for_523 = data.loc[data['NAICS'].str.contains('523', na = False)].reset_index()
blank_for_523

In [None]:
Funds_and_other_financial_vehicles = data.loc[data['NAICS'].str.contains('526', na = False)].reset_index()
Funds_and_other_financial_vehicles.columns = ['index', 'SYEAR', 'SMTH', 'NAICS', '_EMPLOYMENT_']
Funds_and_other_financial_vehicles

In [None]:
add_on = Funds_and_other_financial_vehicles.loc[(Funds_and_other_financial_vehicles['SYEAR'] != 1997) & (Funds_and_other_financial_vehicles['SYEAR'] != 1998) & (Funds_and_other_financial_vehicles['SYEAR'] != 1999)]
add_on = add_on.drop('_EMPLOYMENT_', axis =1)
add_on['_EMPLOYMENT_'] = 0
Monetary_authorities_central_bank = pd.concat([Monetary_authorities_central_bank, add_on])
Monetary_authorities_central_bank

In [None]:
Finance = pd.merge(
   Monetary_authorities_central_bank,Credit_intermediation_and_related_activities, left_index=True, right_index=True)
Finance = pd.merge(
    Finance, Funds_and_other_financial_vehicles, left_index=True, right_index=True)
Finance

In [None]:
Finance = Finance.drop([
    'index_x', 'SYEAR_x', 'SMTH_x', 'NAICS_x', 'index_y', 'SYEAR_y', 'SMTH_y', 'NAICS_y', 'index'], axis =1)
Finance

In [None]:
Finance['EMPLOYMENT'] =(Finance['_EMPLOYMENT__x'] + 
Finance['_EMPLOYMENT__y'] + Finance['_EMPLOYMENT_'])
Finance = Finance.drop(
    ['_EMPLOYMENT__x', '_EMPLOYMENT__y', '_EMPLOYMENT_'], axis = 1)
Finance = Finance.replace(
    'Funds and other financial vehicles[526]', 'Finance')
Finance.columns = ['SYEAR', 'SMTH', 'LMO_Detailed_Industry', 'Employment']
Finance

## Getting  *Private and trades education*  from Business and secretarial schools[6114], Technical and trade schools[6115]	,  Other schools and instruction[6116] & Educational support services[6117]	

A new method was applied to obatin the *Private and trades education*. **The four extracted dataframes' __EMPLOYMENT__ were added to get the accurate results without merge methods.

In [None]:
BSS = data.loc[data['NAICS'] == 6114].reset_index()
BSS

In [None]:
TTS = data.loc[data['NAICS'] == 6115].reset_index()
TTS

In [None]:
OSI = data.loc[data['NAICS'] == 6116].reset_index()
OSI

In [None]:
ESS = data.loc[data['NAICS'] == 6117].reset_index()
ESS

In [None]:
ESS['EMPLOYMENT'] = BSS['_EMPLOYMENT_'] + OSI['_EMPLOYMENT_'] + TTS['_EMPLOYMENT_'] + ESS['_EMPLOYMENT_']
Private_and_trades_education = ESS

In [None]:
Private_and_trades_education = Private_and_trades_education.replace(6117, 'Private and trades education')
Private_and_trades_education = Private_and_trades_education.drop('_EMPLOYMENT_', axis= 1)
Private_and_trades_education = Private_and_trades_education.drop('index', 1)
Private_and_trades_education

## Getting  *Transportation equipment manufacturing (excluding shipbuilding)*  from Motor vehicle manufacturing[3361], Motor vehicle body and trailer manufacturing[3362],  Motor vehicle parts manufacturing[3363],  Aerospace product and parts manufacturing[3364], Railroad rolling stock manufacturing[3365] & Other transportation equipment manufacturing[3369]	

The new method was applied to obatin the **Transportation equipment manufacturing (excluding shipbuilding)**. **The six extracted dataframes' __EMPLOYMENT__ were added to get the accurate results without merge methods.**

In [None]:
Motor_vehicle_manufacturing = data.loc[data['NAICS'] == 3361].reset_index()
Motor_vehicle_manufacturing

In [None]:
Motor_vehicle_body_and_trailer_manufacturing = data.loc[data['NAICS'] == 3362].reset_index()
Motor_vehicle_body_and_trailer_manufacturing

In [None]:
Motor_vehicle_parts_manufacturing = data.loc[data['NAICS'] == 3363].reset_index()
Motor_vehicle_parts_manufacturing 

In [None]:
Aerospace_product_and_parts_manufacturing = data.loc[data['NAICS'] == 3364].reset_index()
Aerospace_product_and_parts_manufacturing

In [None]:
Railroad_rolling_stock_manufacturing = data.loc[data['NAICS'] == 3365].reset_index()
Railroad_rolling_stock_manufacturing

In [None]:
Other_transportation_equipment_manufacturing = data.loc[data['NAICS'] == 3369].reset_index()
Other_transportation_equipment_manufacturing

In [None]:
Other_transportation_equipment_manufacturing['EMPLOYMENT'] = Other_transportation_equipment_manufacturing['_EMPLOYMENT_'] + Railroad_rolling_stock_manufacturing['_EMPLOYMENT_'] + Aerospace_product_and_parts_manufacturing['_EMPLOYMENT_'] + Motor_vehicle_parts_manufacturing['_EMPLOYMENT_'] + Motor_vehicle_body_and_trailer_manufacturing['_EMPLOYMENT_'] + Motor_vehicle_manufacturing['_EMPLOYMENT_']
Transportation_equipment_manufacturing_excluding_shipbuilding = Other_transportation_equipment_manufacturing

In [None]:
Transportation_equipment_manufacturing_excluding_shipbuilding = Transportation_equipment_manufacturing_excluding_shipbuilding.replace(3369, 
                                                    'Transportation equipment manufacturing - excluding shipbuilding')
Transportation_equipment_manufacturing_excluding_shipbuilding = Transportation_equipment_manufacturing_excluding_shipbuilding.drop(
    '_EMPLOYMENT_', axis= 1)
Transportation_equipment_manufacturing_excluding_shipbuilding = Transportation_equipment_manufacturing_excluding_shipbuilding.drop(
'index', 1)
Transportation_equipment_manufacturing_excluding_shipbuilding

## Getting  *Legal, accounting, design, research, and advertising services*  from  Legal services[5411], Accounting, tax preparation, bookkeeping and payroll services[5412],  Specialized design services[5414],   Scientific research and development services[5417], Advertising, public relations, and related services[5418] & Other professional, scientific and technical services[5419]	

The new method was applied to obatin the **Transportation equipment manufacturing (excluding shipbuilding)**. **The six extracted dataframes' __EMPLOYMENT__ were added to get the accurate results without merge methods.**

In [None]:
LS = data.loc[data['NAICS'] == 5411].reset_index()
LS

In [None]:
ATPBP = data.loc[data['NAICS'] == 5412].reset_index()
ATPBP

In [None]:
SDS = data.loc[data['NAICS'] == 5414].reset_index()
SDS

In [None]:
SRDS = data.loc[data['NAICS'] == 5417].reset_index()
SRDS

In [None]:
APRRS = data.loc[data['NAICS'] == 5418].reset_index()
APRRS

In [None]:
OPSTS = data.loc[data['NAICS'] == 5419].reset_index()
OPSTS

In [None]:
OPSTS['EMPLOYMENT'] = LS['_EMPLOYMENT_'] + ATPBP['_EMPLOYMENT_'] + SDS['_EMPLOYMENT_'] + SRDS['_EMPLOYMENT_'] + APRRS['_EMPLOYMENT_'] + OPSTS['_EMPLOYMENT_']
Legal_accounting_design_research_and_advertising_services = OPSTS

In [None]:
Legal_accounting_design_research_and_advertising_services = Legal_accounting_design_research_and_advertising_services.replace(5419, 
                                                    'Legal, accounting, design research and advertising services')
Legal_accounting_design_research_and_advertising_services = Legal_accounting_design_research_and_advertising_services.drop(
    '_EMPLOYMENT_', axis= 1)
Legal_accounting_design_research_and_advertising_services = Legal_accounting_design_research_and_advertising_services.drop(
'index', 1)
Legal_accounting_design_research_and_advertising_services

## Getting  *Other retail trade(excluding cars and personal care)*  from  [442], [443],  [444],  [445], [446], [447], [448],  [451],   [452], [453] & [454]	

The new method was applied to obatin the **Other retail trade(excluding cars and personal care)**. **The eleven extracted dataframes' __EMPLOYMENT__ were added to get the accurate results without merge methods.**

In [None]:
Furniture_and_home_furnishings_stores = data.loc[data['NAICS'].str.contains('442', na = False)].reset_index()
Furniture_and_home_furnishings_stores

In [None]:
Electronics_and_appliance_stores = data.loc[data['NAICS'].str.contains('443', na = False)].reset_index()
Electronics_and_appliance_stores

In [None]:
BMGESD = data.loc[data['NAICS'].str.contains('444', na = False)].reset_index()
BMGESD

In [None]:
Food_and_beverage_stores = data.loc[data['NAICS'].str.contains('445', na = False)].reset_index()
Food_and_beverage_stores

In [None]:
Health_and_personal_care_stores = data.loc[data['NAICS'].str.contains('446', na = False)].reset_index()
Health_and_personal_care_stores

In [None]:
Gasoline_stations = data.loc[data['NAICS'].str.contains('447', na = False)].reset_index()
Gasoline_stations

In [None]:
Clothing_and_clothing_accessories_stores = data.loc[data['NAICS'].str.contains('448', na = False)].reset_index()
Clothing_and_clothing_accessories_stores

In [None]:
SGHBMS = data.loc[data['NAICS'].str.contains('451', na = False)].reset_index()
SGHBMS

In [None]:
General_merchandise_stores = data.loc[data['NAICS'].str.contains('452', na = False)].reset_index()
General_merchandise_stores

In [None]:
Miscellaneous_store_retailers = data.loc[data['NAICS'].str.contains('453', na = False)].reset_index()
Miscellaneous_store_retailers

In [None]:
Non_store_retailers = data.loc[data['NAICS'].str.contains('454', na = False)].reset_index()
Non_store_retailers

In [None]:
SGHBMS['EMPLOYMENT'] = Non_store_retailers['_EMPLOYMENT_'] + Miscellaneous_store_retailers['_EMPLOYMENT_'] + General_merchandise_stores['_EMPLOYMENT_'] + SGHBMS['_EMPLOYMENT_'] + Clothing_and_clothing_accessories_stores['_EMPLOYMENT_'] + Gasoline_stations['_EMPLOYMENT_'] + Health_and_personal_care_stores['_EMPLOYMENT_'] + Food_and_beverage_stores['_EMPLOYMENT_'] + BMGESD['_EMPLOYMENT_'] + Electronics_and_appliance_stores['_EMPLOYMENT_'] + Furniture_and_home_furnishings_stores['_EMPLOYMENT_']
Other_retail_trade = SGHBMS
Other_retail_trade

In [None]:
Other_retail_trade = Other_retail_trade.replace('Sporting goods, hobby, book and music stores[451]', 
                                                    'Other retail trade(excluding cars and personal care)')
Other_retail_trade = Other_retail_trade.drop(['_EMPLOYMENT_', 'index'], axis= 1)
Other_retail_trade

## Getting  *Other manufacturing*  from [313], [314], [315], [316], [323], [324], [325], [326], [327], [334], [335], [337] & [339]

In [None]:
Textile_mills = data.loc[data['NAICS'].str.contains('313', na = False)].reset_index()
Textile_mills

In [None]:
Textile_product_mills = data.loc[data['NAICS'].str.contains('314', na = False)].reset_index()
Textile_product_mills

In [None]:
Clothing_manufacturing = data.loc[data['NAICS'].str.contains('315', na = False)].reset_index()
Clothing_manufacturing

In [None]:
Leather_and_allied_product_manufacturing = data.loc[data['NAICS'].str.contains('316', na = False)].reset_index()
Leather_and_allied_product_manufacturing

In [None]:
Printing_and_related_support_activities = data.loc[data['NAICS'].str.contains('323', na = False)].reset_index()
Printing_and_related_support_activities

In [None]:
Petroleum_and_coal_product_manufacturing = data.loc[data['NAICS'].str.contains('324', na = False)].reset_index()
Petroleum_and_coal_product_manufacturing

In [None]:
Chemical_manufacturing = data.loc[data['NAICS'].str.contains('325', na = False)].reset_index()
Chemical_manufacturing

In [None]:
Plastics_and_rubber_products_manufacturing = data.loc[data['NAICS'].str.contains('326', na = False)].reset_index()
Plastics_and_rubber_products_manufacturing

In [None]:
Non_metallic_mineral_product_manufacturing = data.loc[data['NAICS'].str.contains('327', na = False)].reset_index()
Non_metallic_mineral_product_manufacturing

In [None]:
Computer_and_electronic_product_manufacturing = data.loc[data['NAICS'].str.contains('334', na = False)].reset_index()
Computer_and_electronic_product_manufacturing

In [None]:
EAC = data.loc[data['NAICS'].str.contains('335', na = False)].reset_index()
EAC

In [None]:
Furniture_and_related_product_manufacturing = data.loc[data['NAICS'].str.contains('337', na = False)].reset_index()
Furniture_and_related_product_manufacturing

In [None]:
Miscellaneous_manufacturing = data.loc[data['NAICS'].str.contains('339', na = False)].reset_index()
Miscellaneous_manufacturing

In [None]:
EAC['EMPLOYMENT'] = Miscellaneous_manufacturing['_EMPLOYMENT_'] + Furniture_and_related_product_manufacturing['_EMPLOYMENT_'] + EAC['_EMPLOYMENT_'] + Computer_and_electronic_product_manufacturing['_EMPLOYMENT_'] + Non_metallic_mineral_product_manufacturing['_EMPLOYMENT_'] + Plastics_and_rubber_products_manufacturing['_EMPLOYMENT_'] + Textile_product_mills['_EMPLOYMENT_'] + Chemical_manufacturing['_EMPLOYMENT_'] + Petroleum_and_coal_product_manufacturing['_EMPLOYMENT_'] + Clothing_manufacturing['_EMPLOYMENT_'] + Printing_and_related_support_activities['_EMPLOYMENT_'] + Leather_and_allied_product_manufacturing['_EMPLOYMENT_'] + Textile_mills['_EMPLOYMENT_']
Other_manufacturing = EAC
Other_manufacturing

In [None]:
Other_manufacturing = Other_manufacturing.replace('Electrical equipment, appliance and component manufacturing[335]', 
                                                    'Other manufacturing')
Other_manufacturing = Other_manufacturing.drop(['_EMPLOYMENT_', 'index'], axis= 1)
Other_manufacturing

## Extracting the single NAICS number industries

In [None]:
Fishing_hunting_and_trapping = data.loc[data['NAICS'].str.contains('114', na = False)]
Fishing_hunting_and_trapping

In [None]:
Forestry_and_logging = data.loc[data['NAICS'].str.contains('113', na = False)]
Forestry_and_logging

In [None]:
SPAF = data.loc[data['NAICS'].str.contains('115', na = False)]
SPAF

In [None]:
Oil_and_gas_extraction = data.loc[data['NAICS'].str.contains('211', na = False)]
Oil_and_gas_extraction

In [None]:
SAMOG = data.loc[data['NAICS'].str.contains('213', na = False)]
SAMOG

In [None]:
Mining_and_quarrying  = data.loc[data['NAICS'].str.contains('212', na = False)]
Mining_and_quarrying

In [None]:
Wood_product_manufacturing = data.loc[data['NAICS'].str.contains('321', na = False)]
Wood_product_manufacturing

In [None]:
Paper_manufacturing = data.loc[data['NAICS'].str.contains('322', na = False)]
Paper_manufacturing

In [None]:
Primary_metal_manufacturing = data.loc[data['NAICS'].str.contains('331', na = False)]
Primary_metal_manufacturing

In [None]:
Fabricated_metal_product_manufacturing = data.loc[data['NAICS'].str.contains('332', na = False)]
Fabricated_metal_product_manufacturing

In [None]:
Machinery_manufacturing = data.loc[data['NAICS'].str.contains('333', na = False)]
Machinery_manufacturing

In [None]:
Motor_vehicle_and_parts_dealers = data.loc[data['NAICS'].str.contains('441', na = False)]
Motor_vehicle_and_parts_dealers

In [None]:
Health_and_personal_care_stores = data.loc[data['NAICS'].str.contains('446', na = False)]
Health_and_personal_care_stores

In [None]:
Air_transportation = data.loc[data['NAICS'].str.contains('481', na = False)]
Air_transportation

In [None]:
Rail_transportation = data.loc[data['NAICS'].str.contains('482', na = False)]
Rail_transportation

In [None]:
Water_transportation = data.loc[data['NAICS'].str.contains('483', na = False)]
Water_transportation

In [None]:
Truck_transportation = data.loc[data['NAICS'].str.contains('484', na = False)]
Truck_transportation

In [None]:
Support_activities_for_transportation = data.loc[data['NAICS'].str.contains('488', na = False)]
Support_activities_for_transportation

In [None]:
Warehousing_and_storage = data.loc[data['NAICS'].str.contains('493', na = False)]
Warehousing_and_storage

In [None]:
Insurance_carriers_and_related_activities = data.loc[data['NAICS'].str.contains('524', na = False)]
Insurance_carriers_and_related_activities

In [None]:
Ambulatory_health_care_services = data.loc[data['NAICS'].str.contains('621', na = False)]
Ambulatory_health_care_services

In [None]:
Hospitals = data.loc[data['NAICS'].str.contains('622', na = False)]
Hospitals

In [None]:
Truck_transportation = data.loc[data['NAICS'].str.contains('623', na = False)]
Truck_transportation

In [None]:
Social_assistance = data.loc[data['NAICS'].str.contains('624', na = False)]
Social_assistance

In [None]:
Publishing_industries = data.loc[data['NAICS'].str.contains('511', na = False)]
Publishing_industries

In [None]:
MPSRI = data.loc[data['NAICS'].str.contains('512', na = False)]
MPSRI

In [None]:
Telecommunications = data.loc[data['NAICS'].str.contains('517', na = False)]
Telecommunications

In [None]:
PSSR = data.loc[data['NAICS'].str.contains('711', na = False)]
PSSR

In [None]:
AGRI = data.loc[data['NAICS'].str.contains('713', na = False)]
AGRI

In [None]:
Heritage_institutions = data.loc[data['NAICS'].str.contains('712', na = False)]
Heritage_institutions

In [None]:
Food_services_and_drinking_places = data.loc[data['NAICS'].str.contains('722', na = False)]
Food_services_and_drinking_places

In [None]:
Federal_government_public_administration = data.loc[data['NAICS'].str.contains('911', na = False)]
Federal_government_public_administration

In [None]:
PTPA = data.loc[data['NAICS'].str.contains('912', na = False)]
PTPA

In [None]:
Utilities_22 = data.loc[data['NAICS'].str.contains('Utilities', na = False)]
Utilities_22 = Utilities_22[:276]
Utilities_22

In [None]:
Construction = data.loc[data['NAICS'].str.contains('23', na = False)]
Construction = Construction[:276]
Construction

In [None]:
Wholesale_trade = data.loc[data['NAICS'].str.contains('41', na = False)]
Wholesale_trade = Wholesale_trade[:276]
Wholesale_trade

In [None]:
Real_estate_rental_and_leasing = data.loc[data['NAICS'].str.contains('53', na = False)]
Real_estate_rental_and_leasing = Real_estate_rental_and_leasing[:276]
Real_estate_rental_and_leasing

In [None]:
Other_services = data.loc[data['NAICS'].str.contains('81', na = False)]
Other_services = Other_services[:276]
Other_services

In [None]:
Ship_and_boat_building = data.loc[data['NAICS'] == 3366]
Ship_and_boat_building = Ship_and_boat_building.replace(3366, 'Ship and boat building')
Ship_and_boat_building

In [None]:
AERS = data.loc[data['NAICS'] == 5413]
AERS = AERS.replace(5413, 'Architectural, engineering and related services')
AERS

In [None]:
CSDRS = data.loc[data['NAICS'] == 5415]
CSDRS = CSDRS.replace(5415, 'Computer systems design and related services')
CSDRS

In [None]:
MSTCS = data.loc[data['NAICS'] == 5416]
MSTCS = MSTCS.replace(5416, 'Management, scientific and technical consulting services')
MSTCS

In [None]:
Elementary_and_secondary_schools = data.loc[data['NAICS'] == 6111]
Elementary_and_secondary_schools = Elementary_and_secondary_schools.replace(6111, 'Elementary and secondary schools')
Elementary_and_secondary_schools

In [None]:
CCC = data.loc[data['NAICS'] == 6112]
CCC = CCC.replace(6112, 'Community colleges and C.E.G.E.P.s')
CCC

In [None]:
Universities = data.loc[data['NAICS'] == 6113]
Universities = Universities.replace(6113, 'Universities')
Universities

# EDA

In [None]:
plt.figure(figsize=(18,10))
plt.plot(Construction['SYEAR'], Construction['_EMPLOYMENT_'], color = 'r',marker='o', 
         linestyle='dashed', linewidth=2, markersize=2)

plt.xlabel('Year', fontsize=18)
plt.xticks(rotation=40, fontsize = 14)
plt.ylabel('Construction', fontsize=18)
plt.yticks(fontsize = 14)
plt.title('Evolvement of employment in Construction overtime', fontsize=22)
red = patches.Patch(color='red', label='Construction')
plt.legend(handles=[red], prop = {'size':15})