# Imports

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import missingno
from sklearn.impute import KNNImputer

# Pre-processing

In [None]:
# Read the raw job-postings export into the notebook-wide frame and
# preview its first rows.
df = pd.read_csv(filepath_or_buffer="DataEngineer.csv")
df.head(n=5)

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Turn the placeholder values used in the raw export ('-1', -1 and
# 'Unknown') into real NaN, one marker at a time and in the same order.
for missing_marker in ('-1', -1, 'Unknown'):
    df.replace(missing_marker, np.nan, inplace=True)

In [None]:
# Fraction of missing entries in each column.
df.isna().mean()

In [None]:
# Strip the "Company -" prefix from the ownership labels.  The result is
# assigned back to the column: calling replace(..., inplace=True) on
# df['col'] operates on a temporary Series and does not reliably update df
# (it is a no-op under pandas Copy-on-Write).  The trailing \s* also removes
# the blank that follows the dash so no label is left with a leading space.
df['Type of ownership'] = df['Type of ownership'].replace(
    regex=r'Company -\s*', value=''
)
# Remove carriage returns and turn newlines into single spaces in every
# text cell of the frame.
df.replace(regex=r'\r', value='', inplace=True)
df.replace(regex=r'\n', value=' ', inplace=True)
df

In [None]:
# Collect every column whose dtype is "object" and convert each one to
# the pandas "category" dtype.
list_str_obj_cols = [
    col_name for col_name, col_dtype in df.dtypes.items() if col_dtype == "object"
]
for str_obj_col in list_str_obj_cols:
    df[str_obj_col] = df[str_obj_col].astype(dtype="category")

In [None]:
# Display the dtype of every column (checks the category conversion).
df.dtypes

In [None]:
# Count rows that are exact duplicates of another row; keep=False flags
# every member of a duplicate group, not only the repeats.
df.duplicated(subset=None, keep=False).sum()


In [None]:
# Per-column bar chart of populated (non-NaN) entries for the full frame.
missingno.bar(df, figsize=(10, 5), fontsize=12, color="green")
plt.title(label='Non nan values')


In [None]:
# Nullity matrix of the full frame (sparkline column disabled).
missingno.matrix(df, figsize=(10, 5), fontsize=12, sparkline=False)
plt.title(label='Nan matrix')

In [None]:
# Derive a numeric 'Salary Estimate mean' column from the textual
# 'Salary Estimate' range (e.g. "$37K-$66K (Glassdoor est.)").
#
# The cleaning is done on a plain object Series and the result is assigned
# back in one step; calling replace(..., inplace=True) on df['col'] works on
# a temporary and does not reliably update df.  Raw strings keep the regex
# escapes valid, and the dot in "est." is escaped so it only matches a dot.
salary_text = (
    df['Salary Estimate']
    .astype(object)
    .str.replace(r'\((?:Glassdoor|Employer) est\.\)', '', regex=True)
    .str.replace('K', '000', regex=False)
    .str.replace('$', '', regex=False)
)
# Split "low-high" into separate columns and parse each bound.  Rows with a
# missing salary, or a bound that is not a plain number, become NaN instead
# of raising as astype(int) would.
salary_bounds = salary_text.str.split('-', expand=True).apply(
    lambda bound: pd.to_numeric(bound.str.strip(), errors='coerce')
)
# Row-wise mean of the available bounds (NaN when no bound could be parsed).
df['Salary Estimate mean'] = salary_bounds.mean(axis=1)
px.box(df, x='Salary Estimate mean', title='Boxplot of Salary Estimate\'s mean')


In [None]:
# Drop columns that none of the following cells read.  Each label is
# listed once ('Competitors' used to appear twice).
df.drop(
    columns=['Job Description', 'Company Name', 'Competitors', 'Easy Apply'],
    inplace=True,
)

In [None]:
# Per-column bar chart of populated (non-NaN) entries after the column
# drop.  The bars show non-missing counts, so the title says so; it matches
# the title of the same chart drawn before the drop.
missingno.bar(df, color="green", figsize=(10, 5), fontsize=12)
plt.title('Non nan values')


In [None]:
# Nullity matrix of the frame after the column drop (sparkline disabled).
missingno.matrix(df, figsize=(10, 5), fontsize=12, sparkline=False)
plt.title(label='Nan matrix')

In [None]:
# Fill missing numeric values with a 3-nearest-neighbours imputer.
#
# KNNImputer needs numeric input, so only the numeric features are passed
# in rather than the whole frame (which also holds category/text columns).
# Passing exactly three columns also makes the result match the three
# output labels.  The salary feature is the numeric 'Salary Estimate mean';
# its imputed version keeps the output label 'Salary Estimate'.
knn_imp = KNNImputer(n_neighbors=3)
# fit and transform the imputer on the numeric columns
df_knn = pd.DataFrame(
    knn_imp.fit_transform(df[['Rating', 'Founded', 'Salary Estimate mean']]),
    columns=['Rating', 'Founded', 'Salary Estimate'],
    index=df.index,  # keep rows aligned with df for side-by-side comparison
)


In [None]:
# Number of missing entries left in each imputed column.
df_knn.isna().sum()

In [None]:
# Overlay the 'Rating' density before (red) and after (yellow) imputation.
df['Rating'].plot(kind='kde', color='r')
df_knn['Rating'].plot(kind='kde', color='y')


In [None]:
# Overlay the 'Founded' density before (red) and after (yellow) imputation.
# df_knn only carries the imputed columns 'Rating', 'Founded' and
# 'Salary Estimate', so 'Size' cannot be looked up in it.
df['Founded'].plot.kde(color='r')
df_knn["Founded"].plot.kde(color='y')


In [None]:
# Overlay the salary density before (red) and after (yellow) imputation.
# df_knn has no 'Player_Score_3' column (it holds 'Rating', 'Founded' and
# 'Salary Estimate'), and the cell was present twice; a single salary
# comparison replaces both copies.
# NOTE(review): df_knn['Salary Estimate'] is presumably the imputed
# counterpart of the numeric df['Salary Estimate mean'] -- confirm against
# the imputation cell.
df['Salary Estimate mean'].plot.kde(color='r')
df_knn["Salary Estimate"].plot.kde(color='y')


# Visualizations

In [None]:
# Count postings per industry, most frequent first, and plot the counts.
industry = (
    df.groupby(by=['Industry'])
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)

px.bar(data_frame=industry, x='Industry', y='Count')
