In [1]:
import pandas as pd
import numpy as np
import re

data = pd.read_csv("alldata.csv")

In [2]:
data

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
...,...,...,...,...,...
6959,Data Developer / Machine Learning Analyst,NetApp,Are you data-driven? We at NetApp believe in t...,574.0,"Sunnyvale, CA"
6960,Scientist I,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,26.0,"Sunnyvale, CA"
6961,Intern Scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...",5.0,"Sunnyvale, CA"
6962,Senior Data & Applied Scientist,Microsoft,We are the Bing Core Relevance team responsibl...,4618.0,"Sunnyvale, CA"


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6964 entries, 0 to 6963
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   position     6953 non-null   object 
 1   company      6953 non-null   object 
 2   description  6953 non-null   object 
 3   reviews      5326 non-null   float64
 4   location     6953 non-null   object 
dtypes: float64(1), object(4)
memory usage: 272.2+ KB


In [4]:
data = data.dropna(how = 'all')

## 1. Location

In [5]:
data['location'].unique

<bound method Series.unique of 0        Atlanta, GA 30301 
1               Atlanta, GA
2               Atlanta, GA
3        Atlanta, GA 30303 
4               Atlanta, GA
               ...         
6959          Sunnyvale, CA
6960          Sunnyvale, CA
6961          Sunnyvale, CA
6962          Sunnyvale, CA
6963    Sunnyvale, CA 94089
Name: location, Length: 6953, dtype: object>

In [6]:
data['location'].str.extract(r'([\D]+ ?[\D]+, \D\D)')

Unnamed: 0,0
0,"Atlanta, GA"
1,"Atlanta, GA"
2,"Atlanta, GA"
3,"Atlanta, GA"
4,"Atlanta, GA"
...,...
6959,"Sunnyvale, CA"
6960,"Sunnyvale, CA"
6961,"Sunnyvale, CA"
6962,"Sunnyvale, CA"


In [7]:
data['location_clean'] = data['location'].str.extract(r'([\D]+ ?[\D]+, \D\D)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['location_clean'] = data['location'].str.extract(r'([\D]+ ?[\D]+, \D\D)')


In [8]:
data.head()

Unnamed: 0,position,company,description,reviews,location,location_clean
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301","Atlanta, GA"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA","Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA","Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303","Atlanta, GA"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA","Atlanta, GA"


## Cleaning the set

### Clean Position

In [9]:
def position_mapper(x):
    x = x.lower()
    if 'data scientist' in x or 'scientist' in x or 'data science' in x:
        return 'data scientist'
    elif 'data engineer'in x or 'data engineering' in x:
        return 'data engineer'
    elif 'data analyst' in x or 'research analyst' in x or 'analyst' in x or 'data manager' in x:
        return 'data analyst'
    elif 'software engineer' in x or 'software developer' in x:
        return 'software engineer'
    else:
        return 'other role'

data['clean_position'] = data['position'].apply(position_mapper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_position'] = data['position'].apply(position_mapper)


In [10]:
data['clean_position'].value_counts()

data scientist       2823
other role           2759
data analyst          851
software engineer     343
data engineer         177
Name: clean_position, dtype: int64

### Clean Location

In [11]:
def get_city(x): 
    if x == x:
        return x.split(",")[0]
    else: 
        return 'none'

In [12]:
data['location'].apply(get_city)

0         Atlanta
1         Atlanta
2         Atlanta
3         Atlanta
4         Atlanta
          ...    
6959    Sunnyvale
6960    Sunnyvale
6961    Sunnyvale
6962    Sunnyvale
6963    Sunnyvale
Name: location, Length: 6953, dtype: object

In [13]:
data['city'] = data['location'].apply(get_city)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['city'] = data['location'].apply(get_city)


In [14]:
def get_state(x): 
    if x == x:
        return x.split(",")[1].strip()[:2]
    else: 
        return 'none'
    
data['location'].apply(get_state)

0       GA
1       GA
2       GA
3       GA
4       GA
        ..
6959    CA
6960    CA
6961    CA
6962    CA
6963    CA
Name: location, Length: 6953, dtype: object

In [15]:
data['state'] = data['location'].apply(get_state)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['state'] = data['location'].apply(get_state)


### Clean Descriptions

We will be looking for these skills:
- python
- statistics
- modeling
- programming
- SQL
- excel
- java

We will be looking for these characteristics:
- Bachelor's Degree
- Master's Degree
- PhD
- Experience in years

In [16]:
data['description'].head(20)

0     Development Director\nALS Therapy Development ...
1     Job Description\n\n"The road that leads to acc...
2     Growing company located in the Atlanta, GA are...
3     DEPARTMENT: Program OperationsPOSITION LOCATIO...
4     DESCRIPTION\nThe Emory University Department o...
5     Qualifications\nBachelor’s degree in Computer ...
6     Qualifications\nBachelor’s degree\n5-7 years o...
7     Overview / Responsibilities\nWood Environment ...
8     Works closely with senior CIB professionals. P...
9     Known for being a great place to work and buil...
10    :\n\nThe Statistical Forecast Analyst position...
11    Chenega Professional &amp; Technical Services,...
12    Innovate. Collaborate. Shine. Lighthouse — KPM...
13    DESCRIPTION\nUnder minimal supervision, the As...
14    Data Science Analyst– Business Intelligence\nL...
15    Cotiviti is looking for an industry leading Da...
16    Description\nExecutes complex assignments requ...
17    Overview\n\n\n\nAt Perficient you’ll deliv

### Use regex to extract different skills like python from description

For each column I have used .extract first to check the value groups e.g. Python, python, PYTHON.
After that I used .contains to create a column that returns True or False

In [17]:
data['python'] = data['description'].str.extract(r'(\bpython\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['python'] = data['description'].str.extract(r'(\bpython\b)', flags = re.IGNORECASE)


In [18]:
data['python'].value_counts()

Python    2707
python     114
PYTHON       2
Name: python, dtype: int64

In [19]:
data['python'] = data['description'].str.contains(r'(\bpython\b)', flags = re.IGNORECASE)

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['python'] = data['description'].str.contains(r'(\bpython\b)', flags = re.IGNORECASE)


In [20]:
data['statistics'] = data['description'].str.extract(r'(\bstatistics\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['statistics'] = data['description'].str.extract(r'(\bstatistics\b)', flags = re.IGNORECASE)


In [21]:
data['statistics'].value_counts()

statistics    1143
Statistics     869
Name: statistics, dtype: int64

In [22]:
data['statistics'] = data['description'].str.contains(r'(\bstatistics\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['statistics'] = data['description'].str.contains(r'(\bstatistics\b)', flags = re.IGNORECASE)


In [23]:
data['mathematics'] = data['description'].str.extract(r'(\bmathematic)', flags = re.IGNORECASE)
data['mathematics'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mathematics'] = data['description'].str.extract(r'(\bmathematic)', flags = re.IGNORECASE)


mathematic    750
Mathematic    699
MATHEMATIC      1
Name: mathematics, dtype: int64

In [24]:
data['mathematics'] = data['description'].str.contains(r'(\bmathematic)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['mathematics'] = data['description'].str.contains(r'(\bmathematic)', flags = re.IGNORECASE)


In [25]:
data['java'] = data['description'].str.extract(r'(\bjava\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['java'] = data['description'].str.extract(r'(\bjava\b)', flags = re.IGNORECASE)


In [26]:
data['java'].value_counts()

Java    1209
JAVA      17
java       9
Name: java, dtype: int64

In [27]:
data['java'] = data['description'].str.contains(r'(\bjava\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['java'] = data['description'].str.contains(r'(\bjava\b)', flags = re.IGNORECASE)


In [28]:
data['sql'] = data['description'].str.extract(r'(\bSQL\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sql'] = data['description'].str.extract(r'(\bSQL\b)', flags = re.IGNORECASE)


In [29]:
data['sql'].value_counts()

SQL    1676
sql       9
Sql       5
SQl       1
Name: sql, dtype: int64

In [30]:
data['sql'] = data['description'].str.contains(r'(\bsql\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sql'] = data['description'].str.contains(r'(\bsql\b)', flags = re.IGNORECASE)


In [31]:
data['excel'] = data['description'].str.extract(r'(\bexcel\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['excel'] = data['description'].str.extract(r'(\bexcel\b)', flags = re.IGNORECASE)


In [32]:
data['excel'].value_counts()

Excel    862
excel    122
EXCEL      7
Name: excel, dtype: int64

In [33]:
data['excel'] = data['description'].str.contains(r'(\bexcel\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['excel'] = data['description'].str.contains(r'(\bexcel\b)', flags = re.IGNORECASE)


In [34]:
data['modeling'] = data['description'].str.extract(r'(\bmodeling\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['modeling'] = data['description'].str.extract(r'(\bmodeling\b)', flags = re.IGNORECASE)


In [35]:
data['modeling'] = data['description'].str.contains(r'(\bmodeling\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['modeling'] = data['description'].str.contains(r'(\bmodeling\b)', flags = re.IGNORECASE)


In [36]:
data['statistics'] = data['description'].str.extract(r'(\bstatistics\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['statistics'] = data['description'].str.extract(r'(\bstatistics\b)', flags = re.IGNORECASE)


In [37]:
data['statistics'] = data['description'].str.contains(r'(\bstatistics\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['statistics'] = data['description'].str.contains(r'(\bstatistics\b)', flags = re.IGNORECASE)


In [38]:
data.head()

Unnamed: 0,position,company,description,reviews,location,location_clean,clean_position,city,state,python,statistics,mathematics,java,sql,excel,modeling
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301","Atlanta, GA",other role,Atlanta,GA,False,False,False,False,False,False,False
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA","Atlanta, GA",data scientist,Atlanta,GA,False,True,True,False,False,False,True
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA","Atlanta, GA",data scientist,Atlanta,GA,True,False,False,True,True,False,True
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303","Atlanta, GA",data analyst,Atlanta,GA,True,False,False,False,True,False,True
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA","Atlanta, GA",other role,Atlanta,GA,False,False,False,False,False,False,False


In [39]:
data['c++'] = data['description'].str.extract(r'([.C]\+\+)', flags = re.IGNORECASE)
data['c++'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['c++'] = data['description'].str.extract(r'([.C]\+\+)', flags = re.IGNORECASE)


C++    966
c++      1
Name: c++, dtype: int64

In [40]:
data['c++'] = data['description'].str.contains(r'([.C]\+\+)', flags = re.IGNORECASE)

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['c++'] = data['description'].str.contains(r'([.C]\+\+)', flags = re.IGNORECASE)


In [41]:
data['r'] = data['description'].str.extract(r'(\s[R]\s)', flags = re.IGNORECASE)
data['r'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['r'] = data['description'].str.extract(r'(\s[R]\s)', flags = re.IGNORECASE)


 R      515
 R\n     93
\nR       6
 r        3
Name: r, dtype: int64

In [42]:
data['r'] = data['description'].str.contains(r'(\s[.R]\s)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['r'] = data['description'].str.contains(r'(\s[.R]\s)', flags = re.IGNORECASE)


In [43]:
data['visualization'] = data['description'].str.extract(r'(\bvisualization\b)', flags = re.IGNORECASE)
data['visualization'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['visualization'] = data['description'].str.extract(r'(\bvisualization\b)', flags = re.IGNORECASE)


visualization    798
Visualization     68
Name: visualization, dtype: int64

In [44]:
data['visualization'] = data['description'].str.contains(r'(\bvisualization\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['visualization'] = data['description'].str.contains(r'(\bvisualization\b)', flags = re.IGNORECASE)


In [45]:
data['ml'] = data['description'].str.extract(r'(\bmachine\slearning\b)', flags = re.IGNORECASE)
data['ml'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ml'] = data['description'].str.extract(r'(\bmachine\slearning\b)', flags = re.IGNORECASE)


machine learning     1778
Machine Learning      621
Machine learning       44
MACHINE LEARNING        3
machine\nlearning       1
Name: ml, dtype: int64

In [46]:
data['ml'] = data['description'].str.contains(r'(\bmachine\slearning\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ml'] = data['description'].str.contains(r'(\bmachine\slearning\b)', flags = re.IGNORECASE)


In [47]:
data['ai'] = data['description'].str.extract(r'(\bartificial\sintelligence\b)', flags = re.IGNORECASE)
data['ai'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ai'] = data['description'].str.extract(r'(\bartificial\sintelligence\b)', flags = re.IGNORECASE)


artificial intelligence    286
Artificial Intelligence    192
Artificial intelligence      6
Name: ai, dtype: int64

In [48]:
data['ai'] = data['description'].str.contains(r'(\bartificial\sintelligence\b)', flags = re.IGNORECASE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ai'] = data['description'].str.contains(r'(\bartificial\sintelligence\b)', flags = re.IGNORECASE)


### Get the different degrees (graduation titles) from the description

Could have done this with regex probably, but decided to try a different mechanism:
1. Replaced null values in the description column, so there are no more floats
2. Created a function that loops through the column and checks for degree titles saved in some lists
3. Apply this as a lamda function to create a new column that says true or false if the description includes a degree

In [49]:
data['description'] = data['description'].fillna('No description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['description'] = data['description'].fillna('No description')


In [50]:
data['reviews'] = data['reviews'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviews'] = data['reviews'].fillna(0)


In [51]:
data.shape

(6953, 21)

In [52]:
#Create a function that takes a column and a list as input and returns true or false if it finds the value or not

phd_degrees = ["phd", "ph.d.", "dr"]
bachelor_degrees = ["bachelor", "bachelors's", "BA", "B.Sc", "BSc"]
master_degrees = ["master", "masters's", "MA", "M.Sc", "MSc"]

def degree_mapper(x, degree_list):
    x = x.lower()
    for degree in degree_list:
        return degree in x
            

In [53]:
#Use a lambda function to apply the function to the column per list
#Just using the function with .apply would not work, since it would only accept one input, but the function has 2

data['has_phd'] = data['description'].apply(lambda x: degree_mapper(x, phd_degrees))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['has_phd'] = data['description'].apply(lambda x: degree_mapper(x, phd_degrees))


In [54]:
data['has_phd'].value_counts()

False    5229
True     1724
Name: has_phd, dtype: int64

In [55]:
data['has_bachelor'] = data['description'].apply(lambda x: degree_mapper(x, bachelor_degrees))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['has_bachelor'] = data['description'].apply(lambda x: degree_mapper(x, bachelor_degrees))


In [56]:
data['has_bachelor'].value_counts()

False    4833
True     2120
Name: has_bachelor, dtype: int64

In [57]:
data['has_master'] = data['description'].apply(lambda x: degree_mapper(x, master_degrees))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['has_master'] = data['description'].apply(lambda x: degree_mapper(x, master_degrees))


In [58]:
data['has_master'].value_counts()

False    4904
True     2049
Name: has_master, dtype: int64

In [59]:
data.head()

Unnamed: 0,position,company,description,reviews,location,location_clean,clean_position,city,state,python,...,excel,modeling,c++,r,visualization,ml,ai,has_phd,has_bachelor,has_master
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,0.0,"Atlanta, GA 30301","Atlanta, GA",other role,Atlanta,GA,False,...,False,False,False,False,False,False,False,False,True,False
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",0.0,"Atlanta, GA","Atlanta, GA",data scientist,Atlanta,GA,False,...,False,True,False,False,False,False,False,False,False,True
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",0.0,"Atlanta, GA","Atlanta, GA",data scientist,Atlanta,GA,True,...,False,True,True,False,True,True,False,False,False,True
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303","Atlanta, GA",data analyst,Atlanta,GA,True,...,False,True,False,False,True,False,False,False,True,False
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA","Atlanta, GA",other role,Atlanta,GA,False,...,False,False,False,False,False,True,False,True,False,False


In [60]:
data_clean = data.drop(['description','location_clean','location'],axis=1)

In [61]:
data_clean.reset_index(drop=True, inplace=True)

In [62]:
data_clean

Unnamed: 0,position,company,reviews,clean_position,city,state,python,statistics,mathematics,java,...,excel,modeling,c++,r,visualization,ml,ai,has_phd,has_bachelor,has_master
0,Development Director,ALS TDI,0.0,other role,Atlanta,GA,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,0.0,data scientist,Atlanta,GA,False,True,True,False,...,False,True,False,False,False,False,False,False,False,True
2,Data Scientist,Xpert Staffing,0.0,data scientist,Atlanta,GA,True,False,False,True,...,False,True,True,False,True,True,False,False,False,True
3,Data Analyst,Operation HOPE,44.0,data analyst,Atlanta,GA,True,False,False,False,...,False,True,False,False,True,False,False,False,True,False
4,Assistant Professor -TT - Signal Processing & ...,Emory University,550.0,other role,Atlanta,GA,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6948,Data Developer / Machine Learning Analyst,NetApp,574.0,data analyst,Sunnyvale,CA,True,False,False,True,...,True,False,False,False,False,True,False,False,True,False
6949,Scientist I,"Pharmacyclics, an Abbvie Company",26.0,data scientist,Sunnyvale,CA,False,False,False,False,...,True,False,False,True,False,False,False,False,True,True
6950,Intern Scientist,Oath Inc,5.0,data scientist,Sunnyvale,CA,True,True,True,True,...,False,False,True,False,False,True,False,True,False,False
6951,Senior Data & Applied Scientist,Microsoft,4618.0,data scientist,Sunnyvale,CA,False,False,False,True,...,False,False,True,False,False,True,False,False,False,False


In [65]:
data_clean.to_csv('data_cleaned.csv',index=False)

In [66]:
!ls

DataJobs Cleaning.ipynb    alldata.csv
GroupLab1_city&state.ipynb alldata.xlsx
README.md                  data_cleaned.csv
alldata.cleaning.xlsx
