In [1]:
import pandas as pd

In [2]:
# Load the raw job-postings CSV; the later cells derive their frames from df_raw.
raw_csv_path = 'data/gsearch_jobs.csv'
df_raw = pd.read_csv(raw_csv_path)

In [None]:
# First look at the raw frame.
# A notebook cell only renders its LAST expression automatically, so the preview
# has to go through display() or it is computed and silently thrown away.
display(df_raw.head())
df_raw.info()  # info() prints its summary itself
df_raw.columns

Creating a clean data frame that contains only the job descriptions and their `date_time`.
It will be used for tokenization later.

In [3]:
# Cleaning section for skill mapping.
# Select the columns to keep instead of listing every column to drop: the result no
# longer depends on the exact set of other columns present in the raw export.
# `title` is kept because the analyst filter in a later cell needs it; `index` is
# carried along unchanged.
token_prep_columns = ["index", "title", "description", "date_time"]

# .copy() gives an independent frame, so later column assignments do not touch df_raw
# and do not trigger SettingWithCopyWarning.
df_prep_token = df_raw[token_prep_columns].copy()

In [4]:
# Preview the first five rows of the reduced frame.
df_prep_token.head(n=5)

Unnamed: 0,index,title,description,date_time
0,0,Data Analyst,"As the leader in cloud-managed IT, Cisco Merak...",2023-08-02 03:00:13.054897
1,1,Entry Level - Business Data Analyst (Remote),As a Senior Business Analyst you will contribu...,2023-08-02 03:00:13.054897
2,2,Data Analyst/Researcher,Overview:\n\nAmyx is seeking to hire a Data An...,2023-08-02 03:00:13.054897
3,3,Data analyst/AI expert to help build a website...,I am looking for someone to help me build an A...,2023-08-02 03:00:13.054897
4,4,Data Analyst,Position Vacancy – Data Analyst to support the...,2023-08-02 03:00:13.054897


In [5]:
# Narrow the prep frame to analyst postings and normalise the text for tokenization.

# Lowercase the titles once with the vectorised .str accessor; the substring checks
# below therefore need no case-insensitive matching.
titles_lower = df_prep_token["title"].astype("string").str.lower()

# Analyst roles, excluding business analysts and BI analysts.
# na=False: a missing title counts as "no match" and yields a clean boolean mask.
is_analyst = titles_lower.str.contains("analyst", regex=False, na=False)
is_business_or_bi = (
    titles_lower.str.contains("business analyst", regex=False, na=False)
    | titles_lower.str.contains("bi analyst", regex=False, na=False)
)

# The title is only needed for the filter, so it is dropped afterwards.
# Row labels of the original frame are kept (no reset_index).
df_prep_token = (
    df_prep_token.loc[is_analyst & ~is_business_or_bi]
    .drop(columns=["title"])
    .assign(
        description=lambda frame: frame["description"].astype("string").str.lower(),
        date_time=lambda frame: pd.to_datetime(frame["date_time"]),
    )
)

df_prep_token.info()
df_prep_token.head()

<class 'pandas.core.frame.DataFrame'>
Index: 32022 entries, 0 to 42150
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   index        32022 non-null  int64         
 1   description  32022 non-null  string        
 2   date_time    32022 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), string(1)
memory usage: 1000.7 KB


Unnamed: 0,index,description,date_time
0,0,"as the leader in cloud-managed it, cisco merak...",2023-08-02 03:00:13.054897
1,1,as a senior business analyst you will contribu...,2023-08-02 03:00:13.054897
2,2,overview: amyx is seeking to hire a data anal...,2023-08-02 03:00:13.054897
3,3,i am looking for someone to help me build an a...,2023-08-02 03:00:13.054897
4,4,position vacancy – data analyst to support the...,2023-08-02 03:00:13.054897


In [None]:
# Persist the tokenization input. to_csv() also writes the row labels as the
# first, unnamed column (pandas default index=True).
token_csv_path = "data/df_clean_for_token.csv"
df_prep_token.to_csv(token_csv_path)

In [None]:
# Cleaning section: build the working frame `df` by dropping the listed columns
# from the raw frame.
columns_to_drop = [
    "job_id",
    "thumbnail",
    "commute_time",
    "salary_pay",  # as we have min-max
    "salary_hourly",
    "Unnamed: 0",
    "posted_at",
]
df = df_raw.drop(columns=columns_to_drop)
df.tail()

In [None]:
# Convert columns to explicit dtypes; "description_tokens" is left as object.
text_columns = [
    "title", "description", "company_name", "location", "via", "extensions",
    "schedule_type", "search_term", "search_location", "salary", "salary_rate",
]
df[text_columns] = df[text_columns].astype("string")

# A plain astype("bool") converts missing values to True (NaN is truthy), which would
# mark every posting without a work_from_home entry as remote. Missing is treated
# as False instead; rows that already hold True/False are unaffected.
work_from_home = df["work_from_home"]
df["work_from_home"] = work_from_home.notna() & work_from_home.astype("bool")

df["date_time"] = pd.to_datetime(df["date_time"])
df.info()

In [None]:
# Convert text columns to lower case, e.g. "Data Analyst II" becomes "data analyst ii".
# The vectorised .str accessor keeps the "string" dtype and passes missing values
# through, so no per-row lambda and no conversion back to "string" is needed.
for column in ["title", "description", "via"]:
    df[column] = df[column].astype("string").str.lower()

# The SKILLNER package can't handle typographic apostrophes (’), so they are replaced
# by a space. Series.replace() only swaps cells whose WHOLE value equals the pattern,
# which left the descriptions unchanged; .str.replace() substitutes the character
# inside each description.
df["description_cleaned"] = df["description"].str.replace("’", " ", regex=False)

df.head()

In [None]:
# Getting to know the dataset: number of postings per location and per title.
location_counts = df["location"].value_counts()
display(location_counts)

# Postings whose location is exactly 'Germany'; .get() yields None if there are none.
germany = location_counts.get('Germany')
print(germany)

title_counts = df["title"].value_counts()
display(title_counts)

In [None]:
# Checking how many data analysts and scientists there are.
# The filter patterns were adjusted manually.
titles = df["title"]

# Analyst, but not business analyst / BI analyst.
# na=False: a missing title counts as "no match".
df["is_analyst"] = titles.str.contains("analyst", case=False, na=False) & ~(
    titles.str.contains("business analyst", case=False, na=False)
    | titles.str.contains("bi analyst", case=False, na=False)
)
print(df["is_analyst"].sum())

df["is_bi_analyst"] = titles.str.contains("business data analyst", case=False, na=False)
print(df["is_bi_analyst"].sum())

# The Python expression `"scient" or "scienc"` evaluates to just "scient", so the
# second pattern was never searched. A regex alternation matches either stem.
df["is_scien"] = titles.str.contains("scient|scienc", case=False, na=False)
print(df["is_scien"].sum())
df.tail()

In [None]:
# Testing the filters: boolean indexing keeps the rows where df["is_analyst"] is true.
analyst_mask = df["is_analyst"]
filtered_df = df[analyst_mask]

# Show each title next to its flag for manual inspection.
display(filtered_df.loc[:, ["title", "is_analyst"]])

In [None]:
# Where the postings come from (the "via" column). An earlier run found 0 glassdoor rows.
# Only the last expression of a cell is rendered, so the counts are printed explicitly.
# Previously the "indeed" count was computed but never shown, which is likely why it
# seemed to be missing.
# na=False: rows without a "via" value count as "no match" and cannot break the mask.
via = df["via"]
print("indeed:", via.str.contains("indeed", na=False).sum())
print("glassdoor:", via.str.contains("glassdoor", na=False).sum())

indexes_with_glassdoor = df[via.str.contains("glassdoor", na=False)].index
print(indexes_with_glassdoor)

# Manual spot checks of two hand-picked row labels.
print(df["via"].loc[12579])
df["date_time"].loc[12444]

In [None]:
# FINAL FILTER: build df_cleaned first, then df_main from it.
# Decision: include every title that is an analyst but not a business analyst or
# BI analyst. Such titles can still overlap with other roles named in the same title.
analyst_rows = df["is_analyst"].eq(True)
df_cleaned = df[analyst_rows]

print(df_cleaned.shape)

# Independent copy, so columns added to df_main later do not touch df / df_cleaned.
df_main = df_cleaned.copy()

In [None]:
# Starting tokenization.
# A hand-picked list of skills to check that tokenization works and to get a first
# impression of the data.
# TODO: add "r" as its own entity; it needs a stand-alone match such as r'\b(r)\b'.
skills_list = ["python", "sql", "tableau", "bi tool", "power bi", "aws", "azure", "excel", "powerpoint"]

# Lowercase the descriptions once instead of once per skill and per row.
descriptions_lower = df_main["description"].str.lower()

# Create one 0/1 indicator column per skill.
# na=False: a missing description counts as "skill not mentioned" instead of raising.
# NOTE(review): this is a plain substring match, as before, so "excel" also hits
# "excellent" and "aws" hits "laws". Consider word-boundary matching once the
# skill list is final.
for skill in skills_list:
    df_main[skill] = descriptions_lower.str.contains(
        skill.lower(), regex=False, na=False
    ).astype(int)

In [None]:
# Number of postings mentioning each skill, most frequent first.
skill_columns = ["python", "sql", "tableau", "aws", "azure", "excel", "powerpoint", "power bi"]
skill_totals = df_main[skill_columns].sum()
skill_totals.sort_values(ascending=False)

In [None]:
# Persist the cleaned analyst frame, including the skill indicator columns.
cleaned_csv_path = "data/gsearch_cleaned.csv"
df_main.to_csv(cleaned_csv_path)