In [None]:
# One-time environment setup, kept commented out: install the Nanum font
# package, rebuild the system font cache, and delete matplotlib's cache
# directory. Uncomment and run once if the Korean font used below is missing.
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from IPython.display import set_matplotlib_formats
# Render inline figures in 'retina' format.
# NOTE(review): set_matplotlib_formats from IPython.display is reported as
# deprecated in newer IPython releases — confirm against the installed version.
set_matplotlib_formats('retina')
# Use the NanumBarunGothic font family for all figure text (Korean labels below).
plt.rc('font', family='NanumBarunGothic') 
# Turn off the Unicode minus sign on axes (axes.unicode_minus = False).
mpl.rc('axes', unicode_minus=False)

In [None]:
# Mount Google Drive at /content/drive; the CSV loaded in the next cell is read
# from under this mount point. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the job dataset from the mounted Drive folder and show it.
DATA_PATH = '/content/drive/My Drive/data_job_group.csv'
job_clean = pd.read_csv(DATA_PATH)
job_clean

In [None]:
# Summary statistics for every column of job_clean (include="all" covers
# non-numeric columns as well as numeric ones).
job_clean.describe(include="all")

In [None]:
# Distribution of the "Founded" year, with its mean marked by a red vertical line
founded = job_clean["Founded"]
founded_mean = founded.mean()

sns.displot(founded)
plt.gcf().set_size_inches(15, 8)

print("설립시기 평균 연도 :", round(founded_mean),"년")
plt.axvline(founded_mean, color='red');

In [None]:
# Distribution of "Average_Salary", with its mean marked by a red vertical line
avg_salary = job_clean["Average_Salary"]
avg_salary_mean = avg_salary.mean()

sns.displot(avg_salary)
plt.gcf().set_size_inches(15, 8)

print("평균 연봉($) :", round(avg_salary_mean))
plt.axvline(avg_salary_mean, color='red');

In [None]:
# Ten most frequent "Sector" values as a horizontal bar chart
top_sectors = job_clean["Sector"].value_counts().head(10)
ax = top_sectors.plot.barh()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Flip the y axis so the most frequent value is drawn at the top.
ax.invert_yaxis()

In [None]:
# Ten most frequent "Industry" values as a horizontal bar chart
top_industries = job_clean["Industry"].value_counts().head(10)
ax = top_industries.plot.barh()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Flip the y axis so the most frequent value is drawn at the top.
ax.invert_yaxis()

In [None]:
# Ten most frequent "Type of ownership" values as a horizontal bar chart
top_ownership = job_clean["Type of ownership"].value_counts().head(10)
ax = top_ownership.plot.barh()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Flip the y axis so the most frequent value is drawn at the top.
ax.invert_yaxis()

In [None]:
# Share of rows where the work location matches / differs from headquarters
LH = job_clean["Location=Headquarters"].value_counts()
# Take the counts as they are. De-duplicating them (the previous .unique())
# collapses the two slices into one whenever both categories are equally
# frequent, which then no longer matches the number of labels.
value = LH.values

# Label each slice from the category it actually represents instead of from its
# position: value_counts() orders by frequency, so positional labels would swap
# silently if the majority category changed.
# NOTE(review): assumes a truthy / 1 flag means "location is the headquarters"
# — confirm the column's encoding. Unrecognised values fall back to their text.
label_by_flag = {False: "본사 ≠ 근무지", True: "본사 = 근무지"}
labels = [label_by_flag.get(flag, str(flag)) for flag in LH.index]

# This changes the default figure size globally, so it also applies to figures
# drawn by later cells that do not set their own size.
plt.rcParams['figure.figsize'] = [12, 8] 
plt.pie(value, labels=labels, autopct="%1.2f%%", textprops={'fontsize': 20});

In [None]:
# Rating for headquarters vs. non-headquarters work locations: one bar per value
# of "Location=Headquarters", with error bars disabled (ci=None).
sns.barplot(data=job_clean, x="Location=Headquarters", y="Rating", ci=None);

In [None]:
# Average salary per rating value; the red line is the overall mean salary
overall_salary_mean = job_clean["Average_Salary"].mean()

ax = sns.barplot(data=job_clean, x="Rating", y="Average_Salary", ci=None)
ax.figure.set_size_inches(15, 8)
ax.axhline(overall_salary_mean, color='red');

In [None]:
# Rating per job division; the red line is the overall mean rating
overall_rating_mean = job_clean["Rating"].mean()

ax = sns.barplot(data=job_clean, x="division", y="Rating", ci=None)
ax.figure.set_size_inches(15, 8)
ax.axhline(overall_rating_mean, color='red');

In [None]:
# Mean rating by type of ownership
group_col = "Type of ownership"
value_cols = ["Rating", "Average_Salary", "Founded", "Revenue", "Size"]

# Per-group means of the reported columns, highest mean rating first.
# Naming `values` explicitly keeps text columns out of the aggregation instead
# of relying on pandas silently dropping columns it cannot average.
job_own = (
    pd.pivot_table(job_clean, index=[group_col], values=value_cols, aggfunc="mean")
    .reset_index()
    .sort_values(by="Rating", ascending=False)
    .round(1)
    .reset_index(drop=True)
)

# Number of rows per group. rename_axis + reset_index(name=...) yields the same
# two columns however value_counts() happens to name its result.
count_own = (
    job_clean[group_col]
    .value_counts()
    .rename_axis(group_col)
    .reset_index(name="count")
)

job_own = pd.merge(job_own, count_own, on=group_col)

# Percentage of rows per group, computed column-wise. The previous element-wise
# chained assignment (df[col][i] = ...) is not guaranteed to write back to the
# frame and left the column with object dtype.
job_own["count%"] = (job_own["count"] / job_own["count"].sum() * 100).round(2)

job_own = job_own[[group_col, *value_cols, "count%"]]

sns.barplot(data=job_own, x="Rating", y=group_col, ci=None)
plt.gcf().set_size_inches(15, 8)
# Red line: overall mean rating across all rows, for reference.
plt.axvline(job_clean["Rating"].mean(), color='red');

job_own

In [None]:
# Top 20 industries by mean rating
group_col = "Industry"
value_cols = ["Rating", "Average_Salary", "Founded", "Revenue", "Size"]

# Per-group means of the reported columns, highest mean rating first.
# Naming `values` explicitly keeps text columns out of the aggregation instead
# of relying on pandas silently dropping columns it cannot average.
job_ind = (
    pd.pivot_table(job_clean, index=[group_col], values=value_cols, aggfunc="mean")
    .reset_index()
    .sort_values(by="Rating", ascending=False)
    .round(1)
    .reset_index(drop=True)
)

# Number of rows per group. rename_axis + reset_index(name=...) yields the same
# two columns however value_counts() happens to name its result.
count_ind = (
    job_clean[group_col]
    .value_counts()
    .rename_axis(group_col)
    .reset_index(name="count")
)

job_ind = pd.merge(job_ind, count_ind, on=group_col)

# Percentage of rows per group (relative to all industries, not just the top
# 20), computed column-wise. The previous element-wise chained assignment
# (df[col][i] = ...) is not guaranteed to write back to the frame and left the
# column with object dtype.
job_ind["count%"] = (job_ind["count"] / job_ind["count"].sum() * 100).round(2)

job_ind = job_ind[[group_col, *value_cols, "count%"]]

sns.barplot(data=job_ind[:20], x="Rating", y=group_col, ci=None)
plt.gcf().set_size_inches(15, 8)
# Red line: overall mean rating across all rows, for reference.
plt.axvline(job_clean["Rating"].mean(), color='red');

job_ind[:20]

In [None]:
# Mean rating by sector
group_col = "Sector"
value_cols = ["Rating", "Average_Salary", "Founded", "Revenue", "Size"]

# Per-group means of the reported columns, highest mean rating first.
# Naming `values` explicitly keeps text columns out of the aggregation instead
# of relying on pandas silently dropping columns it cannot average.
job_sec = (
    pd.pivot_table(job_clean, index=[group_col], values=value_cols, aggfunc="mean")
    .reset_index()
    .sort_values(by="Rating", ascending=False)
    .round(1)
    .reset_index(drop=True)
)

# Number of rows per group. rename_axis + reset_index(name=...) yields the same
# two columns however value_counts() happens to name its result.
count_sec = (
    job_clean[group_col]
    .value_counts()
    .rename_axis(group_col)
    .reset_index(name="count")
)

job_sec = pd.merge(job_sec, count_sec, on=group_col)

# Percentage of rows per group, computed column-wise. The previous element-wise
# chained assignment (df[col][i] = ...) is not guaranteed to write back to the
# frame and left the column with object dtype.
job_sec["count%"] = (job_sec["count"] / job_sec["count"].sum() * 100).round(2)

job_sec = job_sec[[group_col, *value_cols, "count%"]]

sns.barplot(data=job_sec, x="Rating", y=group_col, ci=None)
plt.gcf().set_size_inches(15, 8)
# Red line: overall mean rating across all rows, for reference.
plt.axvline(job_clean["Rating"].mean(), color='red');

job_sec

In [None]:
# Mean rating by employee-size group
group_col = "Size"
value_cols = ["Rating", "Average_Salary", "Founded", "Revenue"]

# Per-group means of the reported columns, highest mean rating first.
# Naming `values` explicitly keeps text columns out of the aggregation instead
# of relying on pandas silently dropping columns it cannot average.
# round(1) runs after reset_index(), so it also applies to the "Size" key
# column, exactly as before.
job_size = (
    pd.pivot_table(job_clean, index=[group_col], values=value_cols, aggfunc="mean")
    .reset_index()
    .sort_values(by="Rating", ascending=False)
    .round(1)
    .reset_index(drop=True)
)

# Number of rows per group. rename_axis + reset_index(name=...) yields the same
# two columns however value_counts() happens to name its result.
count_size = (
    job_clean[group_col]
    .value_counts()
    .rename_axis(group_col)
    .reset_index(name="count")
)

job_size = pd.merge(job_size, count_size, on=group_col)

# Percentage of rows per group, computed column-wise. The previous element-wise
# chained assignment (df[col][i] = ...) is not guaranteed to write back to the
# frame and left the column with object dtype.
job_size["count%"] = (job_size["count"] / job_size["count"].sum() * 100).round(2)

job_size = job_size[[group_col, *value_cols, "count%"]]

sns.barplot(data=job_size, x=group_col, y="Rating", ci=None)
plt.gcf().set_size_inches(15, 8)
# Red line: overall mean rating across all rows, for reference.
plt.axhline(job_clean["Rating"].mean(), color='red');

job_size

In [None]:
# Mean rating by revenue group
group_col = "Revenue"
value_cols = ["Rating", "Average_Salary", "Founded", "Size"]

# Per-group means of the reported columns, highest mean rating first.
# Naming `values` explicitly keeps text columns out of the aggregation instead
# of relying on pandas silently dropping columns it cannot average.
# round(1) runs after reset_index(), so it also applies to the "Revenue" key
# column, exactly as before.
job_rev = (
    pd.pivot_table(job_clean, index=[group_col], values=value_cols, aggfunc="mean")
    .reset_index()
    .sort_values(by="Rating", ascending=False)
    .round(1)
    .reset_index(drop=True)
)

# Number of rows per group. rename_axis + reset_index(name=...) yields the same
# two columns however value_counts() happens to name its result.
count_rev = (
    job_clean[group_col]
    .value_counts()
    .rename_axis(group_col)
    .reset_index(name="count")
)

job_rev = pd.merge(job_rev, count_rev, on=group_col)

# Percentage of rows per group, computed column-wise. The previous element-wise
# chained assignment (df[col][i] = ...) is not guaranteed to write back to the
# frame and left the column with object dtype.
job_rev["count%"] = (job_rev["count"] / job_rev["count"].sum() * 100).round(2)

job_rev = job_rev[[group_col, *value_cols, "count%"]]

sns.barplot(data=job_rev, x=group_col, y="Rating", ci=None)
plt.gcf().set_size_inches(15, 8)
# Red line: overall mean rating across all rows, for reference.
plt.axhline(job_clean["Rating"].mean(), color='red');

job_rev