### Importing the relevant libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Reading data

In [4]:
raw_data = pd.read_csv('Glassdoor_DA_Jobs.csv')

In [5]:
df = raw_data.copy()
df

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,₹4L - ₹6L (Glassdoor Est.),201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,₹6L - ₹7L (Glassdoor Est.),10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD)
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,₹8L - ₹10L (Glassdoor Est.),10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD)
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,₹6L - ₹10L (Glassdoor Est.),10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD)
4,Global Planning Analyst,BP Energy,3.9,Pune,₹7L - ₹10L (Glassdoor Est.),10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD)
...,...,...,...,...,...,...,...,...,...,...,...
419,Senior Associate - Data Change Analyst,Macquarie Group Limited,3.8,Gurgaon,₹8L - ₹10L (Glassdoor Est.),10000+ Employees,1969,Company - Public,Investment & Asset Management,Finance,$5 to $10 billion (USD)
420,Senior Analyst - Data Analytics,Verint Systems Inc.,4.0,Bengaluru,₹10L (Glassdoor Est.),1001 to 5000 Employees,1994,Company - Public,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable
421,Manager - Data Analytics,1 Finance,4.9,Mumbai,₹5L - ₹6L (Glassdoor Est.),51 to 200 Employees,2022,Company - Private,Investment & Asset Management,Finance,Unknown / Non-Applicable
422,Business Analyst ( 2+ years ),Bhanzu( formerly Exploring Infinities Edtech P...,3.9,Bengaluru,₹5L - ₹8L (Employer Est.),501 to 1000 Employees,2020,Company - Private,Primary & Secondary Schools,Education,Unknown / Non-Applicable


### Going through data 

In [7]:
df.columns.array

<NumpyExtensionArray>
[       'Job_Title',     'Company_Name',          'Ratings',
         'Location', 'Salary_Estimates',     'Company_Size',
    'Founding_Year',             'Type',         'Industry',
           'Sector',          'Revenue']
Length: 11, dtype: object

In [8]:
df.isna().sum()

Job_Title           0
Company_Name        0
Ratings             0
Location            0
Salary_Estimates    0
Company_Size        0
Founding_Year       0
Type                0
Industry            0
Sector              0
Revenue             0
dtype: int64

In [9]:
df.describe(include='O')

Unnamed: 0,Job_Title,Company_Name,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue
count,424,424,424,424,424,424,424,424,424,424
unique,269,257,37,76,8,82,9,40,19,10
top,Data Analyst,BP Energy,Pune,₹7L - ₹10L (Glassdoor Est.),10000+ Employees,1908,Company - Public,Energy & Utilities,Information Technology,$10+ billion (USD)
freq,62,71,105,39,229,72,230,74,105,171


### Cleaning data
- Salary Parsing
- Age of the company
- Removing company part from Type column

### Salary Parsing
1. Removing `(Glassdoor Est.)` part from `Salary_Estimates` column
2. Remving `L` and `T` part
3. There are some columns with `per hour` values in `Salary_Estimates` column removing them
4. Creating 3 new columns (Min_Salary, Max_Salary, and Avg_Salary) columns

In [12]:
df['Salary_Estimates'] = df['Salary_Estimates'].apply(lambda x:x.split('(')[0])

In [13]:
df.head()

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,₹4L - ₹6L,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,₹6L - ₹7L,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD)
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,₹8L - ₹10L,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD)
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,₹6L - ₹10L,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD)
4,Global Planning Analyst,BP Energy,3.9,Pune,₹7L - ₹10L,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD)


In [14]:
df['Salary_Estimates'] = df['Salary_Estimates'].apply(lambda x:x.replace('L','').replace('₹','').replace('T',''))

In [15]:
df['Salary_Estimates'].unique()

array(['4 - 6 ', '6 - 7 ', '8 - 10 ', '6 - 10 ', '7 - 10 ', '5 - 8 ',
       '3 - 7 ', '6 - 9 ', '4 - 10 ', '7 - 9 ', '3 - 6 ', '5 - 12 ',
       '3 - 8 ', '4 - 7 ', '2 - 10 ', '1 ', '6 - 8 ', '4 - 8 ', '5 - 7 ',
       '7 - 12 ', '5 - 6 ', '2 - 3 ', '8 - 9 ', '5 - 9 ', '1 - 2 ',
       '3 - 10 ', '3 - 5 ', '2 - 4 ', '3 - 4 ', '4 ', '7 - 8 ',
       '15 - 25 ', '3 - 9 ', '6 - 15 ', '20 - 30 ',
       '516.00 - 682.00 Per hour ', '10 - 12 ', '10 ', '5 - 10 ', '6 ',
       '9 - 11 ', '4 - 9 ', '4 - 5 ', '9 - 10 ', '15 - 20 ', '25 - 50 ',
       '50 - 80 ', '3 ', '7 ', '25 - 60 ', '25.00 Per hour ', '2 - 7 ',
       '12 ', '2 - 5 ', '1 - 7 ', '2 ', '5 ', '2 - 6 ', '96 - 1 ',
       '11 - 12 ', '6 - 11 ', '8 - 15 ', '35 '], dtype=object)

In [16]:
df = df[~df['Salary_Estimates'].str.contains('Per hour', na=False)]

In [17]:
df['Salary_Estimates'].unique()

array(['4 - 6 ', '6 - 7 ', '8 - 10 ', '6 - 10 ', '7 - 10 ', '5 - 8 ',
       '3 - 7 ', '6 - 9 ', '4 - 10 ', '7 - 9 ', '3 - 6 ', '5 - 12 ',
       '3 - 8 ', '4 - 7 ', '2 - 10 ', '1 ', '6 - 8 ', '4 - 8 ', '5 - 7 ',
       '7 - 12 ', '5 - 6 ', '2 - 3 ', '8 - 9 ', '5 - 9 ', '1 - 2 ',
       '3 - 10 ', '3 - 5 ', '2 - 4 ', '3 - 4 ', '4 ', '7 - 8 ',
       '15 - 25 ', '3 - 9 ', '6 - 15 ', '20 - 30 ', '10 - 12 ', '10 ',
       '5 - 10 ', '6 ', '9 - 11 ', '4 - 9 ', '4 - 5 ', '9 - 10 ',
       '15 - 20 ', '25 - 50 ', '50 - 80 ', '3 ', '7 ', '25 - 60 ',
       '2 - 7 ', '12 ', '2 - 5 ', '1 - 7 ', '2 ', '5 ', '2 - 6 ',
       '96 - 1 ', '11 - 12 ', '6 - 11 ', '8 - 15 ', '35 '], dtype=object)

#### Creating Min_Salary, Max_Salary, and Avg_Salary columns

In [19]:
has_hyphen = df['Salary_Estimates'].str.contains('-')
df_range = df[has_hyphen].copy()

In [20]:
salary_split = df_range['Salary_Estimates'].str.split('-', expand=True)

In [21]:
salary_split

Unnamed: 0,0,1
0,4,6
1,6,7
2,8,10
3,6,10
4,7,10
...,...,...
418,4,9
419,8,10
421,5,6
422,5,8


In [22]:
df_range['Min_Salary'] = pd.to_numeric(salary_split[0], errors='coerce')
df_range['Max_Salary'] = pd.to_numeric(salary_split[1], errors='coerce')

In [23]:
df_single = df[~has_hyphen].copy()

In [24]:
df_single['Min_Salary'] = pd.to_numeric(df_single['Salary_Estimates'].str.strip(), errors='coerce')
df_single['Max_Salary'] = df_single['Min_Salary'].copy()

In [25]:
df_preprocessed = pd.concat([df_range, df_single], ignore_index=True)

In [26]:
df_preprocessed['Avg_Salary'] = (df_preprocessed['Min_Salary'] + df_preprocessed['Max_Salary']) / 2

In [27]:
df = df.merge(df_preprocessed[['Min_Salary', 'Max_Salary', 'Avg_Salary']], left_index=True, right_index=True, how='left')

In [28]:
df

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,4 - 6,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,4.0,6.0,5.0
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,6 - 7,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),6.0,7.0,6.5
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,8 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),8.0,10.0,9.0
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,6 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),6.0,10.0,8.0
4,Global Planning Analyst,BP Energy,3.9,Pune,7 - 10,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),7.0,10.0,8.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,Senior Associate - Data Change Analyst,Macquarie Group Limited,3.8,Gurgaon,8 - 10,10000+ Employees,1969,Company - Public,Investment & Asset Management,Finance,$5 to $10 billion (USD),35.0,35.0,35.0
420,Senior Analyst - Data Analytics,Verint Systems Inc.,4.0,Bengaluru,10,1001 to 5000 Employees,1994,Company - Public,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,10.0,10.0,10.0
421,Manager - Data Analytics,1 Finance,4.9,Mumbai,5 - 6,51 to 200 Employees,2022,Company - Private,Investment & Asset Management,Finance,Unknown / Non-Applicable,10.0,10.0,10.0
422,Business Analyst ( 2+ years ),Bhanzu( formerly Exploring Infinities Edtech P...,3.9,Bengaluru,5 - 8,501 to 1000 Employees,2020,Company - Private,Primary & Secondary Schools,Education,Unknown / Non-Applicable,,,


In [29]:
df.isnull().sum()

Job_Title           0
Company_Name        0
Ratings             0
Location            0
Salary_Estimates    0
Company_Size        0
Founding_Year       0
Type                0
Industry            0
Sector              0
Revenue             0
Min_Salary          2
Max_Salary          2
Avg_Salary          2
dtype: int64

In [30]:
df.dropna(axis=0, inplace=True)

### Age of Company

In [32]:
df

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,4 - 6,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,4.0,6.0,5.0
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,6 - 7,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),6.0,7.0,6.5
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,8 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),8.0,10.0,9.0
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,6 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),6.0,10.0,8.0
4,Global Planning Analyst,BP Energy,3.9,Pune,7 - 10,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),7.0,10.0,8.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,Business Intelligence- MIS,5paisa Capital,3.5,Mumbai,4 - 7,201 to 500 Employees,--,Company - Public,--,--,Unknown / Non-Applicable,3.0,3.0,3.0
418,Financial Reporting Analyst,UPS India,3.5,Chennai,4 - 9,10000+ Employees,1907,Company - Public,Shipping & Trucking,Transportation and logistics,$10+ billion (USD),3.0,3.0,3.0
419,Senior Associate - Data Change Analyst,Macquarie Group Limited,3.8,Gurgaon,8 - 10,10000+ Employees,1969,Company - Public,Investment & Asset Management,Finance,$5 to $10 billion (USD),35.0,35.0,35.0
420,Senior Analyst - Data Analytics,Verint Systems Inc.,4.0,Bengaluru,10,1001 to 5000 Employees,1994,Company - Public,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,10.0,10.0,10.0


In [33]:
df.Founding_Year.unique()

array(['1999', '1908', '1973', '1869', '1850', '2005', '2015', '2014',
       '1997', '1988', '2012', '--', '1872', '1982', '2008', '2017',
       '1967', '2000', '1939', '2013', '1935', '2018', '1984', '1985',
       '1974', '2011', '1887', '1945', '2016', '2003', '1928', '1932',
       '1799', '1860', '1969', '1954', '1994', '1991', '2009', '1963',
       '1930', '2004', '1870', '2006', '1989', '1998', '1966', '1996',
       '1987', '1906', '1983', '2019', '1690', '1880', '2007', '2002',
       '2020', '1698', '1923', '1913', '1921', '2010', '1947', '1781',
       '1812', '1904', '1760', '1900', '1891', '1990', '1981', '1902',
       '1857', '1919', '2022', '1968', '1865', '2001', '1986', '1916',
       '1907'], dtype=object)

In [34]:
df[df.Founding_Year == '--']

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary
23,Data Analyst,HNM Solutions,4.3,Pune,3 - 6,51 to 200 Employees,--,Company - Private,--,--,Unknown / Non-Applicable,3.0,6.0,4.5
31,Associate Data Analyst,Optum,3.5,Hyderābād,2 - 10,10000+ Employees,--,Company - Public,Healthcare Services & Hospitals,Healthcare,$10+ billion (USD),2.0,10.0,6.0
33,"Sustainability Analyst, Data and Reporting",JLL,3.8,Bengaluru,6 - 8,10000+ Employees,--,Company - Public,Real estate,Real estate,$5 to $10 billion (USD),4.0,8.0,6.0
44,Data Analyst,Optum,3.5,Hyderābād,2 - 10,10000+ Employees,--,Company - Public,Healthcare Services & Hospitals,Healthcare,$10+ billion (USD),5.0,6.0,5.5
46,Junior Data Analyst (F),coolboots,2.6,Gurgaon,4 - 8,51 to 200 Employees,--,Company - Private,--,--,Unknown / Non-Applicable,4.0,8.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,Senior Finance Data Analyst,Keyloop,4.3,Hyderābād,2 - 6,Unknown,--,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable,5.0,8.0,6.5
397,Data Visualisation Analyst,Pulsus Group,4.2,Hyderābād,2 - 4,Unknown,--,Company - Private,Publishing,Media and communication,Unknown / Non-Applicable,4.0,8.0,6.0
408,AI Data Analyst,Techwyse IT Solutions Private Limited,3.8,Cochin,35,Unknown,--,Company - Public,--,--,Unknown / Non-Applicable,4.0,4.0,4.0
410,Business Analyst (MSI),Maruti Suzuki India Ltd,3.8,Gurgaon,4 - 7,10000+ Employees,--,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$10+ billion (USD),7.0,7.0,7.0


In [35]:
df['Founding_Year'] = df['Founding_Year'].replace('--', -1)

In [36]:
df['Founding_Year'] = df['Founding_Year'].astype(int)

In [37]:
# df['Age'] = 2025 - df['Founding_Year']
df['Age'] = df['Founding_Year'].apply(lambda x:x if x<1 else 2025-x)

In [79]:
df.head()

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary,Age
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,4 - 6,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,4.0,6.0,5.0,26
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,6 - 7,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),6.0,7.0,6.5,117
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,8 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),8.0,10.0,9.0,52
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,6 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),6.0,10.0,8.0,52
4,Global Planning Analyst,BP Energy,3.9,Pune,7 - 10,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),7.0,10.0,8.5,117


In [39]:
df['Age'].sort_values()

91      -1
109     -1
369     -1
107     -1
367     -1
      ... 
210    327
218    335
278    335
177    335
371    335
Name: Age, Length: 420, dtype: int64

In [77]:
df[df['Age'] == 335]

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary,Age
177,Business Intelligence Analyst,Barclays,3.9,Pune,9 - 11,10000+ Employees,1690,Company - Public,Banking & Lending,Finance,$10+ billion (USD),4.0,5.0,4.5,335
218,Business Analyst,Barclays,3.9,Pune,7 - 8,10000+ Employees,1690,Company - Public,Banking & Lending,Finance,$10+ billion (USD),5.0,8.0,6.5,335
278,Data Analyst,Barclays,3.9,Noida,8 - 9,10000+ Employees,1690,Company - Public,Banking & Lending,Finance,$10+ billion (USD),4.0,5.0,4.5,335
371,Research Analyst - BA4,Barclays,3.9,Mumbai,8 - 9,10000+ Employees,1690,Company - Public,Banking & Lending,Finance,$10+ billion (USD),15.0,20.0,17.5,335


### 

In [83]:
df.head()

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary,Age
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,4 - 6,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,4.0,6.0,5.0,26
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,6 - 7,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),6.0,7.0,6.5,117
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,8 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),8.0,10.0,9.0,52
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,6 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),6.0,10.0,8.0,52
4,Global Planning Analyst,BP Energy,3.9,Pune,7 - 10,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),7.0,10.0,8.5,117


In [90]:
df['Company_Size'].unique()

array(['201 to 500 Employees', '10000+ Employees',
       '1001 to 5000 Employees', '501 to 1000 Employees',
       '51 to 200 Employees', '5001 to 10000 Employees',
       '1 to 50 Employees', 'Unknown'], dtype=object)

In [85]:
df['Type'].unique()

array(['Company - Private', 'Company - Public',
       'Subsidiary or Business Segment', 'Unknown',
       'Private Practice / Firm', 'Self-employed', 'Contract',
       'College / University', 'Non-profit Organisation'], dtype=object)

In [87]:
df['Industry'].unique()

array(['Information Technology Support Services', 'Energy & Utilities',
       'Biotech & Pharmaceuticals', 'Investment & Asset Management',
       'Business consulting', 'Real estate', 'Internet & Web Services',
       '--', 'Consumer Product Manufacturing',
       'Healthcare Services & Hospitals',
       'Enterprise Software & Network Solutions', 'Software Development',
       'Stock Exchanges', 'Financial Transaction Processing',
       'Computer Hardware Development', 'Research and development',
       'HR Consulting', 'Transportation Equipment Manufacturing',
       'Accounting & Tax', 'Rail Transportation', 'Banking & Lending',
       'Motor Vehicle Dealers', 'Architectural & Engineering Services',
       'Food & Beverage Manufacturing', 'Electronics Manufacturing',
       'Education Support & Training Services', 'Chemical Manufacturing',
       'Insurance Carriers', 'Machinery Manufacturing',
       'Video Game Publishing', 'Advertising & Public Relations',
       'Primary & Se

In [94]:
df['Sector'].unique()

array(['Information Technology', 'Energy, mining, utilities',
       'Pharmaceutical and biotechnology', 'Finance',
       'Management and consulting', 'Real estate', '--', 'Manufacturing',
       'Healthcare', 'Human resources and staffing',
       'Transportation and logistics', 'Retail and wholesale',
       'Construction, repair and maintenance', 'Education', 'Insurance',
       'Media and communication', 'Arts, entertainment and recreation',
       'Hotel and travel accommodation', 'Telecommunications'],
      dtype=object)

In [96]:
df['Revenue'].unique()

array(['Unknown / Non-Applicable', '$10+ billion (USD)',
       '$100 to $500 million (USD)', '$500 million to $1 billion (USD)',
       '$25 to $50 million (USD)', '$5 to $10 billion (USD)',
       '$1 to $5 million (USD)', 'Less than $1 million (USD)',
       '$2 to $5 billion (USD)', '$5 to $25 million (USD)'], dtype=object)

In [98]:
df.to_csv('Cleaned_Data.csv', index=False)

In [104]:
pd.read_csv('Cleaned_Data.csv')

Unnamed: 0,Job_Title,Company_Name,Ratings,Location,Salary_Estimates,Company_Size,Founding_Year,Type,Industry,Sector,Revenue,Min_Salary,Max_Salary,Avg_Salary,Age
0,"Data Analyst (Kafka, API)",ITI Data,4.0,Hyderābād,4 - 6,201 to 500 Employees,1999,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,4.0,6.0,5.0,26
1,FBT Data Operations - Document Control & Engin...,BP Energy,3.9,Pune,6 - 7,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),6.0,7.0,6.5,117
2,Business system owner Order management – Senio...,Sanofi EU,4.0,Hyderābād,8 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),8.0,10.0,9.0,52
3,Vendor Master Data Management Analyst,Sanofi EU,4.0,Hyderābād,6 - 10,10000+ Employees,1973,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical and biotechnology,$10+ billion (USD),6.0,10.0,8.0,52
4,Global Planning Analyst,BP Energy,3.9,Pune,7 - 10,10000+ Employees,1908,Company - Public,Energy & Utilities,"Energy, mining, utilities",$10+ billion (USD),7.0,10.0,8.5,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,Business Intelligence- MIS,5paisa Capital,3.5,Mumbai,4 - 7,201 to 500 Employees,-1,Company - Public,--,--,Unknown / Non-Applicable,3.0,3.0,3.0,-1
416,Financial Reporting Analyst,UPS India,3.5,Chennai,4 - 9,10000+ Employees,1907,Company - Public,Shipping & Trucking,Transportation and logistics,$10+ billion (USD),3.0,3.0,3.0,118
417,Senior Associate - Data Change Analyst,Macquarie Group Limited,3.8,Gurgaon,8 - 10,10000+ Employees,1969,Company - Public,Investment & Asset Management,Finance,$5 to $10 billion (USD),35.0,35.0,35.0,56
418,Senior Analyst - Data Analytics,Verint Systems Inc.,4.0,Bengaluru,10,1001 to 5000 Employees,1994,Company - Public,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,10.0,10.0,10.0,31
