## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import string
import re
import emoji

import warnings
warnings.filterwarnings('ignore')

## Read files

In [2]:
articles_df = pd.read_csv("tds.csv", index_col=0)

In [3]:
articles_df.head()

Unnamed: 0,publish_date,title,author,url,claps,responses,reading_time,paid,content
0,2021-04-30,Better Software Writing Skills in Data Science...,Pascal Potvin,https://towardsdatascience.com/better-software...,1,0,5,0,Pascal Potvin Apr 30·5 min read This week’s me...
1,2021-04-30,Forecasting at scale with Facebook Prophet — a...,Konrad Hoppe,https://towardsdatascience.com/forecasting-at-...,12,0,5,0,Konrad Hoppe Apr 30·5 min read Time series for...
2,2021-04-30,Five Subtle Pitfalls 99% Of Junior Python Deve...,Mohammed Ayar,https://towardsdatascience.com/five-subtle-pit...,366,5,6,1,Mohammed Ayar Apr 30·6 min read If data scienc...
3,2021-04-30,Four Deep Learning Papers to Read in May 2021,Robert Lange,https://towardsdatascience.com/four-deep-learn...,371,2,6,1,Robert Lange Apr 30·6 min read Welcome to the ...
4,2021-04-30,Lessons from the First Two Data Scientists at ...,Matt Sosna,https://towardsdatascience.com/lessons-from-th...,140,0,10,1,Matt Sosna Apr 30·10 min read Data science in ...


In [4]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42660 entries, 0 to 42659
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publish_date  42660 non-null  object
 1   title         42660 non-null  object
 2   author        42660 non-null  object
 3   url           42660 non-null  object
 4   claps         42660 non-null  int64 
 5   responses     42660 non-null  int64 
 6   reading_time  42660 non-null  int64 
 7   paid          42660 non-null  int64 
 8   content       42322 non-null  object
dtypes: int64(4), object(5)
memory usage: 3.3+ MB


In [5]:
jobs_listing_df = pd.read_csv("DataScientist.csv", index_col=0)

In [6]:
jobs_listing_df.head()

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1
1,1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1
2,2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True
3,3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1
4,4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1


In [7]:
jobs_listing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3909 entries, 0 to 3908
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              3909 non-null   int64  
 1   Job Title          3909 non-null   object 
 2   Salary Estimate    3909 non-null   object 
 3   Job Description    3909 non-null   object 
 4   Rating             3909 non-null   float64
 5   Company Name       3909 non-null   object 
 6   Location           3909 non-null   object 
 7   Headquarters       3909 non-null   object 
 8   Size               3909 non-null   object 
 9   Founded            3909 non-null   int64  
 10  Type of ownership  3909 non-null   object 
 11  Industry           3909 non-null   object 
 12  Sector             3909 non-null   object 
 13  Revenue            3909 non-null   object 
 14  Competitors        3909 non-null   object 
 15  Easy Apply         3909 non-null   object 
dtypes: float64(1), int64(2),

## Common methods to clean data

In [8]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [9]:
def remove_escape_special_chars(text):
    for ch in ['\'','\"','\\','\n','\r', '\t','\b','\f','\ooo', '·', '•']:
        if ch in text:
            text = text.replace(ch, "")
    return text

In [10]:
def remove_emojis(df_col):
    df_col = df_col.str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
    return df_col

In [11]:
def remove_col_content(df, col_to_remove, ref_col):
    return df.apply(lambda row : row[col_to_remove].replace(str(row[ref_col]), ''), axis=1)

## Clean data for articles dataframe

In [12]:
# remove rows that contain missing content data
articles_df = articles_df[articles_df['content'].notna()]

In [13]:
# make all words in content to be lowercase
articles_df['cleaned_content'] = articles_df['content'].str.lower()

In [14]:
# make all words in author to be lowercase
articles_df['cleaned_author'] = articles_df['author'].str.lower()

In [15]:
# remove punctuations in content
articles_df["cleaned_content"] = articles_df["cleaned_content"].apply(remove_punctuations)

In [16]:
# remove escape and special characters in content
articles_df["cleaned_content"] = articles_df["cleaned_content"].apply(remove_escape_special_chars)

In [17]:
# remove emojis in content
articles_df['cleaned_content'] = remove_emojis(articles_df['cleaned_content'])

In [18]:
# remove author names inside articles content
articles_df['cleaned_content'] = remove_col_content(articles_df, 'cleaned_content', 'cleaned_author')

In [19]:
# remove the first few words until the word read
# this phrase only states how long a person will take to read the article
# therefore this phrase is irrelevant to the keyword

articles_df['cleaned_content'] = articles_df['cleaned_content'].str.split(r'read').str[1:].str.join(" ")

In [20]:
# remove duplicated articles content
duplicated_articles = articles_df[articles_df.duplicated(['cleaned_content'])]
articles_df.drop(duplicated_articles.index, inplace=True)
articles_df

Unnamed: 0,publish_date,title,author,url,claps,responses,reading_time,paid,content,cleaned_content,cleaned_author
0,2021-04-30,Better Software Writing Skills in Data Science...,Pascal Potvin,https://towardsdatascience.com/better-software...,1,0,5,0,Pascal Potvin Apr 30·5 min read This week’s me...,this weeks mentoring and post is based on the...,pascal potvin
1,2021-04-30,Forecasting at scale with Facebook Prophet — a...,Konrad Hoppe,https://towardsdatascience.com/forecasting-at-...,12,0,5,0,Konrad Hoppe Apr 30·5 min read Time series for...,time series forecasting is usually a complex ...,konrad hoppe
2,2021-04-30,Five Subtle Pitfalls 99% Of Junior Python Deve...,Mohammed Ayar,https://towardsdatascience.com/five-subtle-pit...,366,5,6,1,Mohammed Ayar Apr 30·6 min read If data scienc...,if data science is the sexiest job of the 21s...,mohammed ayar
3,2021-04-30,Four Deep Learning Papers to Read in May 2021,Robert Lange,https://towardsdatascience.com/four-deep-learn...,371,2,6,1,Robert Lange Apr 30·6 min read Welcome to the ...,welcome to the end of april edition of the ma...,robert lange
4,2021-04-30,Lessons from the First Two Data Scientists at ...,Matt Sosna,https://towardsdatascience.com/lessons-from-th...,140,0,10,1,Matt Sosna Apr 30·10 min read Data science in ...,data science in startups is notorious for bei...,matt sosna
...,...,...,...,...,...,...,...,...,...,...,...
42655,2014-11-25,Lessons Learned From The Higgs Boson Kaggle Ch...,John Wittenauer,https://towardsdatascience.com/lessons-learned...,6,0,6,0,"John Wittenauer Nov 25, 2014·6 min read This c...",this content originally appeared on curious i...,john wittenauer
42656,2014-11-11,What makes a good data scientist-engineer?,Nikhil Dandekar,https://towardsdatascience.com/what-makes-a-go...,166,0,3,0,"Nikhil Dandekar Nov 11, 2014·3 min read The te...",the term data scientist has been used lately ...,nikhil dandekar
42657,2013-10-10,Avoiding an Epic Data Visualization Fail,Tricia Aanderud,https://towardsdatascience.com/avoiding-an-epi...,180,1,5,1,"Tricia Aanderud Oct 10, 2013·5 min read Effect...",effective data visualizations have some commo...,tricia aanderud
42658,2011-01-16,Ockham’s Spatula,Charlie Kufs,https://towardsdatascience.com/ockhams-spatula...,5,0,7,1,"Charlie Kufs Jan 16, 2011·7 min read Model bui...",model building is like climbing a mountain it...,charlie kufs


### check articles dataframe after cleaning

In [21]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42220 entries, 0 to 42659
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publish_date     42220 non-null  object
 1   title            42220 non-null  object
 2   author           42220 non-null  object
 3   url              42220 non-null  object
 4   claps            42220 non-null  int64 
 5   responses        42220 non-null  int64 
 6   reading_time     42220 non-null  int64 
 7   paid             42220 non-null  int64 
 8   content          42220 non-null  object
 9   cleaned_content  42220 non-null  object
 10  cleaned_author   42220 non-null  object
dtypes: int64(4), object(7)
memory usage: 3.9+ MB


## Clean data for Data Science jobs dataframe

In [22]:
# drop column index 0 that contains index
jobs_listing_df.drop(jobs_listing_df.columns[0], axis=1, inplace=True)

In [23]:
jobs_listing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3909 entries, 0 to 3908
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          3909 non-null   object 
 1   Salary Estimate    3909 non-null   object 
 2   Job Description    3909 non-null   object 
 3   Rating             3909 non-null   float64
 4   Company Name       3909 non-null   object 
 5   Location           3909 non-null   object 
 6   Headquarters       3909 non-null   object 
 7   Size               3909 non-null   object 
 8   Founded            3909 non-null   int64  
 9   Type of ownership  3909 non-null   object 
 10  Industry           3909 non-null   object 
 11  Sector             3909 non-null   object 
 12  Revenue            3909 non-null   object 
 13  Competitors        3909 non-null   object 
 14  Easy Apply         3909 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 488.6+ KB


In [24]:
# rename columns
jobs_listing_df.columns = ['job_title', 'salary_estimate', 'job_description', 'rating', 'company_name', 
                           'location', 'headquarters', 'size', 'founded', 'type_of_ownership', 'industry', 'sector',
                           'revenue', 'competitors', 'easy_apply']

In [25]:
# make all words in job description to be lowercase
jobs_listing_df['cleaned_job_description'] = jobs_listing_df['job_description'].str.lower()

In [26]:
# make all words in company name to be lowercase
jobs_listing_df['cleaned_company_name'] = jobs_listing_df['company_name'].str.lower()

In [27]:
# remove punctuations in job description
jobs_listing_df["cleaned_job_description"] = jobs_listing_df["cleaned_job_description"].apply(remove_punctuations)

In [28]:
# remove escape and special characters in job description
jobs_listing_df["cleaned_job_description"] = jobs_listing_df["cleaned_job_description"].apply(remove_escape_special_chars)

In [29]:
# remove emojis in job description 
jobs_listing_df["cleaned_job_description"] = remove_emojis(jobs_listing_df["cleaned_job_description"])

In [30]:
# extract company name before \n appears
jobs_listing_df["cleaned_company_name"] = jobs_listing_df["cleaned_company_name"].str.split('\n').str[0]

In [31]:
# remove company names inside job description
jobs_listing_df["cleaned_job_description"] = remove_col_content(jobs_listing_df, 'cleaned_job_description', 'cleaned_company_name')

In [32]:
# remove duplicated job description
duplicated_job = jobs_listing_df[jobs_listing_df.duplicated(["cleaned_job_description"])]
jobs_listing_df.drop(duplicated_job.index, inplace=True)
jobs_listing_df

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,easy_apply,cleaned_job_description,cleaned_company_name
0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ...",3.5,Hopper\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1,about at were on a mission to make booking tr...,hopper
1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1,at noom we use scientifically proven methods t...,noom us
2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,True,decodemhttpswwwdecodemcomdata science manager ...,decode_m
3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1,seeks a dynamic and driven midlevel data anal...,sapphire digital
4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription...",3.4,United Entertainment Group\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1,director data science 200537descriptionedelma...,united entertainment group
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3903,Cyber Security Data Engineer,$55K-$112K (Glassdoor est.),ApTask is a leading staffing and recruitment c...,3.9,ApTask\n3.9,"Dublin, OH","Iselin, NJ",201 to 500 employees,2010,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),"Collabera, Mitchell Martin, The Judge Group",-1,is a leading staffing and recruitment company...,aptask
3904,AWS Data Engineer,$55K-$112K (Glassdoor est.),About Us\n\nTachyon Technologies is a Digital ...,4.4,Tachyon Technologies\n4.4,"Dublin, OH","Irving, TX",201 to 500 employees,2011,Company - Private,IT Services,Information Technology,$10 to $25 million (USD),-1,-1,about us is a digital transformation consultin...,tachyon technologies
3906,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),Job DescriptionThe Security Analytics Data Eng...,3.8,"PDS Tech, Inc.\n3.8","Dublin, OH","Irving, TX",5001 to 10000 employees,1977,Company - Private,Staffing & Outsourcing,Business Services,$100 to $500 million (USD),-1,-1,job descriptionthe security analytics data eng...,"pds tech, inc."
3907,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),The Security Analytics Data Engineer will inte...,4.0,Data Resource Technologies\n4.0,"Dublin, OH","Omaha, NE",1 to 50 employees,-1,Company - Private,Accounting,Accounting & Legal,Less than $1 million (USD),-1,-1,the security analytics data engineer will inte...,data resource technologies


### check jobs listing dataframe after cleaning

In [33]:
jobs_listing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3690 entries, 0 to 3908
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   job_title                3690 non-null   object 
 1   salary_estimate          3690 non-null   object 
 2   job_description          3690 non-null   object 
 3   rating                   3690 non-null   float64
 4   company_name             3690 non-null   object 
 5   location                 3690 non-null   object 
 6   headquarters             3690 non-null   object 
 7   size                     3690 non-null   object 
 8   founded                  3690 non-null   int64  
 9   type_of_ownership        3690 non-null   object 
 10  industry                 3690 non-null   object 
 11  sector                   3690 non-null   object 
 12  revenue                  3690 non-null   object 
 13  competitors              3690 non-null   object 
 14  easy_apply              

## Export both dataframes to csv file (without index column)

In [34]:
articles_df.to_csv('cleaned_articles.csv', index=False)

In [35]:
jobs_listing_df.to_csv('cleaned_jobs_listings.csv', index=False)