In [None]:
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): a blanket "ignore" also hides any deprecation or pandas
# chained-assignment warnings later cells might raise — consider narrowing it.
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default font size for all matplotlib text elements.
plt.rcParams['font.size'] = 12

In [None]:
# Search-results page that the single-page walkthrough below is built on.
url = 'https://www.shine.com/job-search/jobs?job_type=2&top_companies_boost=true&sort=1'
# requests applies no timeout by default, so an unresponsive server would hang
# the notebook indefinitely; bound the wait instead.
response = requests.get(url, timeout=30)
# Echo the Response object so the HTTP status is visible in the cell output.
response

In [None]:
# Parse the fetched markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')
# Every <div> in the document (calling the soup object is shorthand for find_all).
html = soup('div')

In [None]:
req = soup.select('div h2[itemprop="name"]')
titles = [r.text for r in req]
titles[:5]

In [None]:
titles1 = [t.replace("|","") for t in titles]
titles = [t.replace("  ", "") for t in titles1]
titles[:5]

In [None]:
orgs = soup.find_all('div', class_='jobCard_jobCard_cName__mYnow')
orgs1 = [o.text for o in orgs]
[o for o in orgs1][:5]

In [None]:
sub_str = "Hiring"
[o.split(sub_str)[0] for o in orgs1][:5]

In [None]:
companies = [o.split(sub_str)[0] for o in orgs1]
companies[:5]

In [None]:
loc = soup.find_all('div', class_='jobCard_jobCard_lists__fdnsc')

In [None]:
[l.text[10:] for l in loc]

In [None]:
loc1 = [l.text[10:] for l in loc]

In [None]:
[l.replace("+", ",") for l in loc1][:5]

In [None]:
loc2 = [l.replace("+", ",") for l in loc1]

In [None]:
pattern  = r'[0-9]'
strpattern = r'[a-z]'

In [None]:
[re.sub(pattern, '', l) for l in loc2][:5]

In [None]:
location = [re.sub(pattern, '', l) for l in loc2]

In [None]:
location[:5]

In [None]:
[l.text[:10] for l in loc][:5]

In [None]:
experience = [l.text[:10] for l in loc]

In [None]:
experience[:5]

In [None]:
vacancies = soup.find_all('ul', class_='jobCard_jobCard_jobDetail__jD82J')

In [None]:
[v.text.split("Positions")[0] for v in vacancies ][:10]

In [None]:
[v.text.split("Positions")[0][-3:-1] for v in vacancies ][:10]

In [None]:
vac = [v.text.split("Positions")[0][-3:] for v in vacancies ]
vac = [v.replace('la', '1') for v in vac]
vac = [re.sub(strpattern, '', l) for l in vac]

In [None]:
vacancies= [v.replace(' ','') for v in vac]

In [None]:
vacancies[:5]

In [None]:
df = pd.DataFrame({'Titles':titles, 'Firm Name': companies, 'Job Location':location, 'Experience':experience,
                   'Positions': vacancies})

In [None]:
df.head()

In [None]:
df['Titles'].duplicated().sum()

In [None]:
df = df.drop_duplicates(subset=['Titles'])

In [None]:
df.head()

In [None]:

# Vacancy counts were scraped as strings; store them as 32-bit integers.
df['Positions'] = df['Positions'].astype('int32')

# A posting is "Fresher" only when its experience range *starts* at 0.
# Checking for the character '0' anywhere in the string would also match
# ranges such as "5 to 10 Yrs" or "10 to 15 Yrs".
numpattern = r'\s*0(?!\d)'
df['Category'] = ['Fresher' if re.match(numpattern, i) else 'Experienced' for i in df['Experience']]

In [None]:
# Display the frame ordered from most to fewest open positions.
df.sort_values(by=['Positions'], ascending=[False])

In [None]:
# Accumulators for the fields collected across all result pages.
TITLES = []
COMPANIES = []
LOCATIONS = []
EXPERIENCE = []
VACANCIES = []

# Parsing constants do not change between pages, so define them once.
sub_str = "Hiring"
pattern  = r'[0-9]'
strpattern = r'[a-z]'
# Digits immediately before "Positions"; captures counts of any width, whereas a
# fixed three-character window would truncate values such as "100 ".
positions_re = re.compile(r'(\d+)\s*Positions')

for i in range(1,6):
    link = f'https://www.shine.com/job-search/jobs-{i}?job_type=2&top_companies_boost=true&sort=1'
    try:
        # requests has no default timeout; bound the wait so one stalled page
        # cannot hang the whole crawl.
        response = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print('Invalid Response', link, err)
        continue

    if response.status_code != 200:
        print('Invalid Response', response.status_code, link)
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Titles: drop pipe separators, then every run of two spaces.
    req = soup.select('div h2[itemprop="name"]')
    titles = [r.text.replace("|", "").replace("  ", "") for r in req]

    # Company names: text before the first "Hiring" marker.
    orgs = soup.find_all('div', class_='jobCard_jobCard_cName__mYnow')
    companies = [o.text.split(sub_str)[0] for o in orgs]

    # The same <div> text supplies experience (first 10 chars) and location (the rest,
    # with "+" turned into "," and digits removed).
    loc = soup.find_all('div', class_='jobCard_jobCard_lists__fdnsc')
    location = [re.sub(pattern, '', l.text[10:].replace("+", ",")) for l in loc]
    experience = [l.text[:10] for l in loc]

    # Vacancy counts: prefer the digits directly before "Positions"; otherwise fall
    # back to the legacy heuristic (last three characters, 'la' -> '1', lowercase
    # letters and spaces removed).
    # NOTE(review): the 'la' -> '1' step presumably maps a trailing "Regular" with no
    # explicit count to one vacancy — confirm against the page.
    vacancies = []
    for card in soup.find_all('ul', class_='jobCard_jobCard_jobDetail__jD82J'):
        card_text = card.text
        match = positions_re.search(card_text)
        if match:
            count = match.group(1)
        else:
            tail = card_text.split("Positions")[0][-3:].replace('la', '1')
            count = re.sub(strpattern, '', tail).replace(' ', '')
        vacancies.append(count)

    # The fields are scraped as independent lists. If a page yields different counts,
    # extending the accumulators would pair values from different job cards (or make
    # the DataFrame constructor fail later), so skip such a page explicitly.
    field_sizes = [len(titles), len(companies), len(location), len(experience), len(vacancies)]
    if len(set(field_sizes)) != 1:
        print(f'Skipping page {i}: scraped field counts differ {field_sizes}')
        continue

    TITLES.extend(titles)
    COMPANIES.extend(companies)
    LOCATIONS.extend(location)
    EXPERIENCE.extend(experience)
    VACANCIES.extend(vacancies)

df = pd.DataFrame({'Job Title': TITLES, 
                   'Employer': COMPANIES,
                   'Job Location': LOCATIONS, 
                   'Experience': EXPERIENCE, 
                   'Positions': VACANCIES})

In [None]:
df.head()

In [None]:
df = df.drop_duplicates(subset=['Job Title'])

In [None]:
df.Positions = pd.to_numeric(df.Positions)

In [None]:
# Bar chart of open positions for the first ten rows, with the overall mean as a
# reference line.
fig, ax = plt.subplots(figsize=(22, 5), dpi=100)
sns.barplot(x=df['Employer'][:10], y=df['Positions'][:10], ax=ax)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_title('Firm Wise vacancy count', fontsize=20)

# The mean is taken over the whole frame (not only the plotted rows); compute it
# once instead of once per bar.
mean_positions = df.Positions.mean()
ax.axhline(mean_positions, color='red')

# Annotate each bar with its value and colour it by whether it is greater than
# the mean number of positions.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(format(height, '.0f'), (bar.get_x() + bar.get_width() / 2, height),
                ha='center', va='center', size=15, xytext=(0, 8), textcoords='offset points')
    bar.set_facecolor('#38A7D0' if height > mean_positions else '#F67088')

plt.xticks(rotation=45)
plt.show()

In [None]:
# Count plot of Fresher vs Experienced postings.
# The frame rebuilt from the paginated crawl has no 'Category' column, and the
# two-colour palette and the title both describe a two-level category, so derive it
# here from the experience text instead of counting raw experience strings.
palette = ['#38A7D0', '#F67088']
category_order = ['Fresher', 'Experienced']

# Only the first ten postings are counted, the same slice the employer bar chart uses.
experience_sample = df['Experience'][:10]
# "Fresher" means the experience range starts at 0; a leading-zero match avoids
# treating ranges like "5 to 10 Yrs" as fresher roles.
category = pd.Series(
    ['Fresher' if re.match(r'\s*0(?!\d)', str(exp)) else 'Experienced' for exp in experience_sample],
    index=experience_sample.index,
    name='Category',
)

fig, ax = plt.subplots(figsize=(22, 5), dpi=100)
sns.countplot(x=category, hue=category, order=category_order, hue_order=category_order,
              dodge=False, palette=palette, ax=ax)
for bar in ax.patches:
    height = bar.get_height()
    # Skip empty or placeholder patches (zero or NaN height) so no "0"/"nan" label is drawn.
    if not height > 0:
        continue
    ax.annotate(format(height, '.0f'), (bar.get_x() + bar.get_width() / 2, height),
                ha='center', va='center', size=15, xytext=(0, 8), textcoords='offset points')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_title('Category Count Plot', fontsize=20)
plt.show()