# Modelar los datos  

Para poder visualizar y documentar tienen que realizar los siguientes pasos de modelado primero: 

1.  Extraer la base de datos “Ask A Manager Salary Survey 2021” de AskAManager.org https://docs.google.com/spreadsheets/d/1IPS5dBSGtwYVbjsfbaMCYIWnOuRmJcbequohNxCyGVw/edit?resourcekey#gid=1625408792

3. Limpiar los campos de “Country” y “City” para homogenizar los nombres de los lugares. 

3. Crear 2 campos nuevos: “salario_anual” y “compensaciones” convirtiendo sueldos y compensaciones a Pesos Colombianos basados en la tasa de cambio del día que hacen el ejercicio. 

4. Crear un campo adicional sumando salario anual y compensaciones en pesos colombianos.  

(etl)=
## ETL

Proceso de extracción (E) curación, transformación (T) y carga o generación de datos a visualizar (E)

In [246]:
# python
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from inspect import cleandoc

# transform
from dataprep.clean import clean_country, clean_address
from geotext import GeoText


# typing
from typing import List, Dict, Tuple, Union, Generator, Optional
from pandas import DataFrame as PandasDF
from pandas import Series

#  warnings
import warnings
warnings.filterwarnings("ignore")

# setup
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('max_colwidth', None)
pd.set_option('display.float_format', lambda x: f'{x:.2f}' )

In [247]:
path:str = './data'
file_name: str = 'Ask A Manager Salary Survey 2021 (Responses) - Form Responses 1.csv'
df: PandasDF = pd.read_csv(f'{path}/{file_name}')

In [248]:
cols_describe: List = df.columns.to_list()

In [249]:
cols_rename: List = cleandoc('''
timestamp
age
industry
job_title
job_describe
annual_salary
monetary_compensation
currency
not_currency_other
salary_context
country
only_us_state
city
work_xp_overall
work_xp_field
education
gender
race
''').split('\n')
# cols_rename

In [250]:
df.columns = cols_rename

In [251]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28006 entries, 0 to 28005
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   timestamp              28006 non-null  object 
 1   age                    28006 non-null  object 
 2   industry               27934 non-null  object 
 3   job_title              28005 non-null  object 
 4   job_describe           7245 non-null   object 
 5   annual_salary          28006 non-null  object 
 6   monetary_compensation  20729 non-null  float64
 7   currency               28006 non-null  object 
 8   not_currency_other     199 non-null    object 
 9   salary_context         3036 non-null   object 
 10  country                28006 non-null  object 
 11  only_us_state          23006 non-null  object 
 12  city                   27929 non-null  object 
 13  work_xp_overall        28006 non-null  object 
 14  work_xp_field          28006 non-null  object 
 15  ed

In [252]:
df.annual_salary = df.annual_salary.str.replace(',','').astype(float).astype(int).fillna(0)
#

In [253]:
df.loc[:,'monetary_compensation'] = df.loc[:,'monetary_compensation'].fillna('0').replace(',','').astype(float).astype(int).fillna(0)

In [254]:
for col in 'work_xp_overall work_xp_field'.split(' '):
    df.loc[:, col] = df.loc[:, col].str.replace(' - ','-')

In [255]:
df.country.fillna('NotAvailable', inplace=True)
ndf = clean_country(df, "country")

  0%|          | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	14513 values cleaned (51.82%)
	1143 values unable to be parsed (4.08%), set to NaN
Result contains 26863 (95.92%) values in the correct format and 1143 null values (4.08%)


In [256]:
df['country']=ndf['country_clean'].fillna('NotAvailable')
df

Unnamed: 0,timestamp,age,industry,job_title,job_describe,annual_salary,monetary_compensation,currency,not_currency_other,salary_context,country,only_us_state,city,work_xp_overall,work_xp_field,education,gender,race
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,,55000,0,USD,,,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,,54600,4000,GBP,,,United Kingdom,,Cambridge,8-10 years,5-7 years,College degree,Non-binary,White
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,,34000,0,USD,,,United States,Tennessee,Chattanooga,2-4 years,2-4 years,College degree,Woman,White
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,,62000,3000,USD,,,United States,Wisconsin,Milwaukee,8-10 years,5-7 years,College degree,Woman,White
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,,60000,7000,USD,,,United States,South Carolina,Greenville,8-10 years,5-7 years,College degree,Woman,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28001,1/3/2024 11:52:01,35-44,Health care,Senior Director of Data and Analytics,leader of analytics,172000,10320,USD,,,United States,Illinois,Chicago,21-30 years,11-20 years,Master's degree,Man,White
28002,1/12/2024 19:50:31,35-44,Engineering or Manufacturing,Quality Engineer,,105000,0,USD,,,United States,Georgia,Atlanta,11-20 years,8-10 years,Master's degree,,Another option not listed here or prefer not to answer
28003,1/28/2024 18:28:31,35-44,Business or Consulting,Data Analyst Intern,,60000,0,USD,,,United States,Massachusetts,Boston,11-20 years,8-10 years,Master's degree,Woman,Black or African American
28004,1/31/2024 15:02:32,25-34,Government and Public Administration,Customer service representative,,61000,0,USD,,,United States,Maryland,Baltimore,11-20 years,2-4 years,College degree,Man,"Hispanic, Latino, or Spanish origin, Native American or Alaska Native, White"


In [257]:
len(df.industry.unique())

1218

In [258]:
mask = (df.country=='United States').to_list()

In [259]:
df.only_us_state.fillna('NotAvailable', inplace=True)
df.loc[mask,'only_us_state'] = df.loc[mask,'only_us_state'].apply(lambda x:str(x).split(',')[0])

In [260]:
re.sub(r'd.c|dc|d.c.|dc.|washington dc','Washington', 'Washington DC'.lower())

'Washington'

In [261]:
re.sub(re.compile(r'd\.c|dc|d\.c\.|dc\.|washington dc'),'Washington', 'Washington DC'.lower().strip())

'Washington'

In [262]:
def get_cities(city):
    """"""
    pattern = r'\w+\.?\s?\w+'

    fixcity = (GeoText(str(city).title().strip()).cities+[''])[0]

    if fixcity=='':
        #fixcity = re.sub(r'd\.c|dc|d\.c\.|dc\.|washington dc','Washington', city.lower().strip())
        #fixcity = re.sub(r'ny|ny metro|ny','New York', city.lower())
        recity = (re.findall(pattern, str(city)) + [''])[0]
        fixcity = 'NotAvailable' if recity=='' else recity

    return fixcity

In [263]:
df.city.fillna('NotAvailable')
df.loc[:,'city'] = df.loc[:,'city'].fillna('NotAvailable').apply(lambda x:get_cities(x))

In [264]:
# drop columns con nulos por encima del 70
df = df.drop(df.columns[df.isna().sum()/df.shape[0]*100>70],axis=1)

In [265]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28006 entries, 0 to 28005
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   timestamp              28006 non-null  object
 1   age                    28006 non-null  object
 2   industry               27934 non-null  object
 3   job_title              28005 non-null  object
 4   annual_salary          28006 non-null  int64 
 5   monetary_compensation  28006 non-null  int64 
 6   currency               28006 non-null  object
 7   country                28006 non-null  object
 8   only_us_state          28006 non-null  object
 9   city                   28006 non-null  object
 10  work_xp_overall        28006 non-null  object
 11  work_xp_field          28006 non-null  object
 12  education              27790 non-null  object
 13  gender                 27838 non-null  object
 14  race                   27834 non-null  object
dtypes: int64(2), object

In [266]:
df[df.columns[df.isna().sum()/df.shape[0]*100!=0]].fillna('NotAvailable')

Unnamed: 0,industry,job_title,education,gender,race
0,Education (Higher Education),Research and Instruction Librarian,Master's degree,Woman,White
1,Computing or Tech,Change & Internal Communications Manager,College degree,Non-binary,White
2,"Accounting, Banking & Finance",Marketing Specialist,College degree,Woman,White
3,Nonprofits,Program Manager,College degree,Woman,White
4,"Accounting, Banking & Finance",Accounting Manager,College degree,Woman,White
...,...,...,...,...,...
28001,Health care,Senior Director of Data and Analytics,Master's degree,Man,White
28002,Engineering or Manufacturing,Quality Engineer,Master's degree,NotAvailable,Another option not listed here or prefer not to answer
28003,Business or Consulting,Data Analyst Intern,Master's degree,Woman,Black or African American
28004,Government and Public Administration,Customer service representative,College degree,Man,"Hispanic, Latino, or Spanish origin, Native American or Alaska Native, White"


In [267]:
df.gender.fillna('Prefer not to answer', inplace=True)

In [268]:
df.race.fillna('Another option not listed here or prefer not to answer', inplace=True)

In [269]:
df.education.fillna('NotAvailable', inplace=True)

In [270]:
df.job_title.fillna('NotAvailable', inplace=True)

In [271]:
df.industry.fillna('NotAvailable', inplace=True)

In [272]:
# https://gist.githubusercontent.com/HarishChaudhari/4680482/raw/b61a5bdf5f3d5c69399f9d9e592c4896fd0dc53c/country-code-to-currency-code-mapping.csv
currency_type = pd.read_csv(f'{path}/country-code-to-currency-code-mapping.csv')
currency_type.drop(columns=['CountryCode', 'Currency'], inplace=True)
currency_type

Unnamed: 0,Country,Code
0,New Zealand,NZD
1,Cook Islands,NZD
2,Niue,NZD
3,Pitcairn,NZD
4,Tokelau,NZD
...,...,...
245,Saint Helena,GBP
246,Saint Martin (French part),ANG
247,Saint Pierre and Miquelon,EUR
248,Serbia,RSD


In [273]:
map_currency = {'Australia' if 'Australian' in k else k:v for k, v in zip(currency_type.Country.tolist(), currency_type.Code.tolist())}

In [274]:
mask = (df.currency=='Other') | (df.currency=='AUD/NZD')

In [275]:

pd.DataFrame(df.loc[mask,['country']]).replace({'country': map_currency})


Unnamed: 0,country
139,AUD
434,INR
470,AUD
603,ARS
1188,AUD
...,...
27923,TRY
27951,AUD
27959,INR
27962,AUD


In [276]:
df.loc[:, 'currency'] = df.loc[:, 'country'].map(map_currency)

In [277]:
df.loc[mask,['currency']]

Unnamed: 0,currency
139,AUD
434,INR
470,AUD
603,ARS
1188,AUD
...,...
27923,TRY
27951,AUD
27959,INR
27962,AUD


In [278]:
df.loc[:,'currency'].fillna('NotAvailable', inplace=True)

In [279]:
# https://github.com/kmamykin/askamanager_salary_survey?tab=readme-ov-file
# https://www.xe.com/currencyconverter/convert/?Amount=1&From=BDT&To=COP
# https://github.com/XenonLab/xecd-rates-client-python
convert: Dict[str, float] = dict(
  AED=1068.4036,
  AFN=53.20455,
  ARS=4.7207235,
  AUD=2561.5425,
  BAM=2162.6799,
  BDT=35.814958,
  BGN=2162.6799,
  BMD=3923.7121,
  BRL=792.1496,
  BSD=3923.7121,
  CAD=2929.1812,
  CHF=4512.6942,
  CLP=4.041755,
  CNY=545.20318,
  COP=1,
  CRC=7.5643848,
  CUP=163.50608,
  CYP=4230.1789,        # CYP replaced by EUR (Cypriot Pound - Obsolete)
  CZK=167.69382,
  DKK=567.45688,
  ECS=0.0123,           # not available
  EEK=4230.1789,        # Replace by Euro (Estonian Kroon)
  ETB=69.38377,
  EUR=4230.1789,
  GBP=4976.1942,
  GHS=318.97499,
  HKD=503.81704,
  HUF=10.934688,
  IDR=0.25083243,
  ILS=1067.405,
  INR=47.25346,
  ISK=28.514512,
  JMD=25.171052,
  JOD=5534.1497,
  JPY=26.409106,
  KES=24.658405,
  KHR=0.96428705,
  KWD=12698.431,
  KYD=4784.9943,
  LKR=12.571414,
  LTL=1225.1445,
  LVL=6019.0366,
  MAD=391.09236,
  MTL=9853.6662,
  MXN=229.63798,
  MYR=823.89182,
  NGN=2.6610031,
  NOK=369.76439,
  NZD=2416.3446,
  NotAvailable=1,
  PEN=1013.6312,
  PHP=70.226835,
  PKR=14.0435,
  PLN=978.1491,
  QAR=1077.9429,
  RON=849.69936,
  RSD=36.103858,
  RWF=3.0845636,
  SAR=1046.3232,
  SEK=376.5956,
  SGD=2914.7139,
  SLL=0.17213585,
  SOS=6.8661531,
  THB=109.2599,
  TRY=128.14431,
  TTD=577.98764,
  TWD=124.92834,
  UAH=104.31023,
  UGX=1.0298142,
  USD=3943.1921,
  UYU=100.094,
  VND=0.16079545,
  ZAR=207.87933,
  ZWD=10.841979
)

In [280]:
df.loc[:,'convert'] = df.loc[:,'currency'].map(convert).fillna(1)

In [281]:
df.loc[:,['currency', 'convert']]

Unnamed: 0,currency,convert
0,USD,3943.19
1,GBP,4976.19
2,USD,3943.19
3,USD,3943.19
4,USD,3943.19
...,...,...
28001,USD,3943.19
28002,USD,3943.19
28003,USD,3943.19
28004,USD,3943.19


In [282]:
# https://www.xe.com/currencyconverter/

In [283]:
# Jupyter book
# https://www.youtube.com/watch?v=lZ2FHTkyaMU

In [284]:
us_states_df = pd.read_csv(
    'https://raw.githubusercontent.com/petewarden/geodict/master/source_data/us_statenames.csv',
    header=None
)

In [285]:
df.head()

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,monetary_compensation,currency,country,only_us_state,city,work_xp_overall,work_xp_field,education,gender,race,convert
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,55000,0,USD,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White,3943.19
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,54600,4000,GBP,United Kingdom,NotAvailable,Cambridge,8-10 years,5-7 years,College degree,Non-binary,White,4976.19
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,34000,0,USD,United States,Tennessee,Chattanooga,2-4 years,2-4 years,College degree,Woman,White,3943.19
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,62000,3000,USD,United States,Wisconsin,Milwaukee,8-10 years,5-7 years,College degree,Woman,White,3943.19
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,60000,7000,USD,United States,South Carolina,Greenville,8-10 years,5-7 years,College degree,Woman,White,3943.19


In [286]:
df.annual_salary = df.annual_salary * df.convert
df.monetary_compensation = df.monetary_compensation * df.convert

In [287]:
df

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,monetary_compensation,currency,country,only_us_state,city,work_xp_overall,work_xp_field,education,gender,race,convert
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,216875565.50,0.00,USD,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White,3943.19
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,271700203.32,19904776.80,GBP,United Kingdom,NotAvailable,Cambridge,8-10 years,5-7 years,College degree,Non-binary,White,4976.19
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,134068531.40,0.00,USD,United States,Tennessee,Chattanooga,2-4 years,2-4 years,College degree,Woman,White,3943.19
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,244477910.20,11829576.30,USD,United States,Wisconsin,Milwaukee,8-10 years,5-7 years,College degree,Woman,White,3943.19
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,236591526.00,27602344.70,USD,United States,South Carolina,Greenville,8-10 years,5-7 years,College degree,Woman,White,3943.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28001,1/3/2024 11:52:01,35-44,Health care,Senior Director of Data and Analytics,678229041.20,40693742.47,USD,United States,Illinois,Chicago,21-30 years,11-20 years,Master's degree,Man,White,3943.19
28002,1/12/2024 19:50:31,35-44,Engineering or Manufacturing,Quality Engineer,414035170.50,0.00,USD,United States,Georgia,Atlanta,11-20 years,8-10 years,Master's degree,Prefer not to answer,Another option not listed here or prefer not to answer,3943.19
28003,1/28/2024 18:28:31,35-44,Business or Consulting,Data Analyst Intern,236591526.00,0.00,USD,United States,Massachusetts,Boston,11-20 years,8-10 years,Master's degree,Woman,Black or African American,3943.19
28004,1/31/2024 15:02:32,25-34,Government and Public Administration,Customer service representative,240534718.10,0.00,USD,United States,Maryland,Baltimore,11-20 years,2-4 years,College degree,Man,"Hispanic, Latino, or Spanish origin, Native American or Alaska Native, White",3943.19


In [288]:
df.gender.unique()

array(['Woman', 'Non-binary', 'Man', 'Prefer not to answer',
       'Other or prefer not to answer'], dtype=object)

In [289]:
df.loc[:, 'race'] = df.race.apply(lambda x: f'{x}, '.split(',')[0])

In [290]:
df

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,monetary_compensation,currency,country,only_us_state,city,work_xp_overall,work_xp_field,education,gender,race,convert
0,4/27/2021 11:02:10,25-34,Education (Higher Education),Research and Instruction Librarian,216875565.50,0.00,USD,United States,Massachusetts,Boston,5-7 years,5-7 years,Master's degree,Woman,White,3943.19
1,4/27/2021 11:02:22,25-34,Computing or Tech,Change & Internal Communications Manager,271700203.32,19904776.80,GBP,United Kingdom,NotAvailable,Cambridge,8-10 years,5-7 years,College degree,Non-binary,White,4976.19
2,4/27/2021 11:02:38,25-34,"Accounting, Banking & Finance",Marketing Specialist,134068531.40,0.00,USD,United States,Tennessee,Chattanooga,2-4 years,2-4 years,College degree,Woman,White,3943.19
3,4/27/2021 11:02:41,25-34,Nonprofits,Program Manager,244477910.20,11829576.30,USD,United States,Wisconsin,Milwaukee,8-10 years,5-7 years,College degree,Woman,White,3943.19
4,4/27/2021 11:02:42,25-34,"Accounting, Banking & Finance",Accounting Manager,236591526.00,27602344.70,USD,United States,South Carolina,Greenville,8-10 years,5-7 years,College degree,Woman,White,3943.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28001,1/3/2024 11:52:01,35-44,Health care,Senior Director of Data and Analytics,678229041.20,40693742.47,USD,United States,Illinois,Chicago,21-30 years,11-20 years,Master's degree,Man,White,3943.19
28002,1/12/2024 19:50:31,35-44,Engineering or Manufacturing,Quality Engineer,414035170.50,0.00,USD,United States,Georgia,Atlanta,11-20 years,8-10 years,Master's degree,Prefer not to answer,Another option not listed here or prefer not to answer,3943.19
28003,1/28/2024 18:28:31,35-44,Business or Consulting,Data Analyst Intern,236591526.00,0.00,USD,United States,Massachusetts,Boston,11-20 years,8-10 years,Master's degree,Woman,Black or African American,3943.19
28004,1/31/2024 15:02:32,25-34,Government and Public Administration,Customer service representative,240534718.10,0.00,USD,United States,Maryland,Baltimore,11-20 years,2-4 years,College degree,Man,Hispanic,3943.19


In [291]:
industrias = df.industry.unique().tolist()
industrias = sorted(industrias)

In [292]:
(
    df.loc[:,'industry']
)

0                Education (Higher Education)
1                           Computing or Tech
2               Accounting, Banking & Finance
3                                  Nonprofits
4               Accounting, Banking & Finance
                         ...                 
28001                             Health care
28002            Engineering or Manufacturing
28003                  Business or Consulting
28004    Government and Public Administration
28005                       Computing or Tech
Name: industry, Length: 28006, dtype: object

In [293]:
def clean_industry(industry):
    string = (industry.strip()
        .lower()
        .replace('/', ' ')
        .replace('-', ' ')
        .replace('&', '')
        .replace('(', '')
        .replace(')', '')
        .replace('"','')
        .replace("'","")
        .replace(':','')
        .replace('and', ' ')
        .replace('  ',' ')
        .replace('  ',' ')
        .replace(', ',' ')
        .replace('.','')
    )

    string = (
        ' '.join(string.split(' ')[:3])
        .replace('pharma ', 'pharmaceutical ')
        .replace('pharmaceuticals', 'pharmaceutical')
        .replace('pharmacuticals', 'pharmaceutical')
        .replace('biotechnology', 'biotech')
        .replace('biotech pharmaceutical', 'pharmaceutical biotech ')
        .replace('biotech pharma', 'pharmaceutical biotech ')
    )

    if 'pharmaceutical' in string or 'pharma' in string:
        string = 'pharmaceitical'

    if 'academi' in string or 'universit' in string or 'education' in string:
        string = 'academic'

    if 'administration' in string or 'administrative' in string:
        string = 'administration'

    if 'administrat' in string or 'office admin' in string or 'admin' in string or 'management' in string:
        string = 'administration'

    if 'architect' in string:
        string = 'architect'

    if 'animal ' in string:
        string = 'veterinary'
    
    if 'agricul' in string:
        string = 'agriculture'
    
    if 'commercial' in string:
        string = 'commercial'

    if 'human ' in string or 'recruitment' in string:
        string = 'human resources'

    if 'museum' in string:
        string = 'museum'
    
    if  string.startswith('i ') or 'my company' in string or 'freelance' in string or string.startswith('im '):
        string = 'freelance'

    if 'sciences' in string or 'scientific' in string or 'science qc' in string or 'scientist' in string:
        string = string.replace(string,'science').replace(string,'science').replace(string,'science')

    if 'religion' in string or 'religious' in string:
        string = 'religion'

    if 'rd in' == string:
        string = 'rd'

    if 'retail' in string:
        string = 'retail'

    if 'law' in string:
        string = 'legal services'

    if 'nonprofit' in string or 'non profit' in string or 'not for' in string:
        string = 'nonprofit'

    string = string.replace('politics', 'political')

    if 'political' in string:
        string = 'political'
    
    if 'science' in string:
        string = 'science'

    if 'manufacturing' in string:
        string = 'manufacturing'

    if 'mining' in string:
        string = 'mining'
    
    if 'medical' in string or 'mental health' in string or 'clinical' in string or string.endswith('hospital'):
        string = 'medical'

    if 'e comm' in string or 'digital commerce' in string or 'digital marketing' in string or 'ecommerce' in string:
        string = 'e commerce'

    string = string.replace('librarian','library').replace('libraries','library')

    if 'library' in string:
        string = 'library'

    if string.endswith('oil'):
        string = 'oil gas'
    
    if 'oil gas' in string:
        string = 'oil gas'

    if 'wholesale' in string or 'importing' in string:
        string = 'wholesale'
    
    if 'vet'==string or 'veterinary' in string or 'veterinarian' in string or string.startswith('pet'):
        string = 'veterinary'

    if 'translation' in string or 'language interpreter' in string:
        string = 'translation'

    if 'print ' in string:
        string = 'printing'

    if 'telecommunication' in string:
        string = 'telecommunication'

    if 'supply' in string or 'logitict' in string or 'transport' in string or 'procurement' in string:
        string = 'supply'

    if 'video game' in string or 'gambling' in string or 'gaming' in string or 'arts' in string:
        string = 'arts entertainment and Recreation'

    if string.startswith('it') or 'saas' in string  or 'information technology' in string or 'cybersecurity' in string or string.endswith(' it'):
        string = 'it'

    if 'trade' in string:
        string = 'trade'

    if string.startswith('research'):
        string = 'research'

    if string.startswith('l scap') or 'high end outdoor' in string:
        string = 'l scape'

    if string.startswith('market'):
        string = 'marketing'

    if 'government contract' in string:
        string = 'government contractor'

    if 'real estate' in string:
        string = 'real estate'

    if 'health' in string:
        string = 'health care'

    if 'international organization' in string:
        string = 'international organization'

    if 'professional' in string:
        string = 'professional'

    if 'training' in string:
        string = 'training'

    return string

In [294]:
#for i in industrias[300:]:
#    clean_industry(i)
    

In [295]:
df.loc[:,'industry'] = df.loc[:,'industry'].apply(clean_industry)

In [296]:
len(df.loc[:,'industry'].unique())

491

In [297]:
#df.to_csv('salary_output_clean.csv', index=False)

In [298]:
df.to_csv('industry_salary_output_clean.csv', index=False)