# Salary Predictions: Exploratory Data Analysis (EDA)

## Import Python Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install Seaborn

In [None]:
import numpy as np
from numpy.random import randn
import pandas as pd

from pandas import Series, DataFrame
import matplotlib.pyplot as plt

from matplotlib import rcParams

import seaborn as sb

## Load Data

### Read the data files from the local `data/` directory

In [None]:
# Load data/test_features.csv into data1; the trailing bare name makes the
# notebook render the DataFrame.
data1 = pd.read_csv('data/test_features.csv')
data1

In [None]:
# Load data/train_features.csv into data2; the trailing bare name makes the
# notebook render the DataFrame.
data2 = pd.read_csv('data/train_features.csv')
data2

In [None]:
# Load data/train_salaries.csv into data3; the trailing bare name makes the
# notebook render the DataFrame.
data3 = pd.read_csv('data/train_salaries.csv')
data3

## Examine Data

In [None]:
# Check how many rows data1 and data2 have.
# A notebook cell only echoes its last bare expression, so both lengths are
# printed explicitly instead of silently dropping the first one.
print("data1 rows:", len(data1))
print("data2 rows:", len(data2))

In [None]:
# Check the columns of data1 and data2.
# Both are printed because a cell only echoes its last bare expression.
print(data1.columns)
print(data2.columns)

In [None]:
# Check how many columns data1 and data2 have.
# Both are printed because a cell only echoes its last bare expression.
print("data1 columns:", len(data1.columns))
print("data2 columns:", len(data2.columns))

In [None]:
# Preview the first 10 rows of data1 with .head()
data1.head(10)

In [None]:
# Preview the last 10 rows of data2 with .tail()
data2.tail(10)

In [None]:
# Preview the first 10 rows of data3 with .head()
data3.head(10)

In [None]:
# Print the per-column data types of each DataFrame via the .dtypes attribute
# (an attribute, not a method call).
for _frame in (data1, data2, data3):
    print(_frame.dtypes)

In [None]:
# Print a concise summary of each DataFrame; .info() writes its report itself,
# so no print() call is needed.
for _frame in (data1, data2, data3):
    _frame.info()

In [None]:
# Using .describe(include = 'all') to provide full summary statistics.
# Each summary is printed: as bare expressions only the last one (data3) would
# be shown and the data1/data2 summaries would be computed and discarded.
print(data1.describe(include = 'all'))
print(data2.describe(include = 'all'))
print(data3.describe(include = 'all'))

In [None]:
# Using .transpose() to show the data2 and data3 statistics with one row per column.
# Both are printed: as bare expressions only the data3 table would be shown.
print(data2.describe(include = 'all').transpose())
print(data3.describe(include = 'all').transpose())

## Data Formatting

In [None]:
# Correcting data types (convert column "salary" to integer).
# astype() returns a new Series and leaves data3 untouched, so the result has
# to be assigned back for the conversion to take effect.
data3['salary'] = data3['salary'].astype('int')
data3['salary']

## Descriptive Statistical Analysis: Using "value counts" to count each feature

### companyId as variable

In [None]:
# Count how many rows of data2 fall under each distinct companyId value
data2['companyId'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['companyId'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'companyId'.
# The column and index names are set explicitly: renaming a column called
# 'companyId' is a silent no-op on pandas >= 2.0, where value_counts() names
# the counts 'count'.
companyId_counts = (
    data2['companyId']
    .value_counts()
    .rename_axis('companyId')
    .to_frame('value_counts')
)
companyId_counts

### jobType as variable

In [None]:
# Count how many rows of data2 fall under each distinct jobType value
data2['jobType'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['jobType'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'jobType'.
# Names are set explicitly so the result does not depend on how the installed
# pandas version names the value_counts() output.
jobType_counts = (
    data2['jobType']
    .value_counts()
    .rename_axis('jobType')
    .to_frame('value_counts')
)
jobType_counts.head(10)

### degree as variable

In [None]:
# Count how many rows of data2 fall under each distinct degree value
data2['degree'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['degree'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'degree'.
# Stored under its own name so the jobType counts are not overwritten, and
# named explicitly so the result does not depend on the pandas version.
degree_counts = (
    data2['degree']
    .value_counts()
    .rename_axis('degree')
    .to_frame('value_counts')
)
degree_counts.tail(10)

### major as variable

In [None]:
# Count how many rows of data2 fall under each distinct major value
data2['major'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['major'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'major'.
# Stored under its own name so the jobType counts are not overwritten, and
# named explicitly so the result does not depend on the pandas version.
major_counts = (
    data2['major']
    .value_counts()
    .rename_axis('major')
    .to_frame('value_counts')
)
major_counts.tail(10)

### industry as variable

In [None]:
# Count how many rows of data2 fall under each distinct industry value
data2['industry'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['industry'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'industry'.
# The column name is set directly in to_frame(); the previous rename looked up
# the misspelled key 'industy' and therefore never renamed anything.
# Stored under its own name so the jobType counts are not overwritten.
industry_counts = (
    data2['industry']
    .value_counts()
    .rename_axis('industry')
    .to_frame('value_counts')
)
industry_counts.head(10)

### yearsExperience as variable

In [None]:
# Count how many rows of data2 fall under each distinct yearsExperience value
data2['yearsExperience'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['yearsExperience'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'yearsExperience'.
# Stored under its own name so the jobType counts are not overwritten, and
# named explicitly so the result does not depend on the pandas version.
yearsExperience_counts = (
    data2['yearsExperience']
    .value_counts()
    .rename_axis('yearsExperience')
    .to_frame('value_counts')
)
yearsExperience_counts.head(10)

### milesFromMetropolis as variable

In [None]:
# Count how many rows of data2 fall under each distinct milesFromMetropolis value
data2['milesFromMetropolis'].value_counts()

In [None]:
# Same counts, converted from a Series to a one-column DataFrame
data2['milesFromMetropolis'].value_counts().to_frame()

In [None]:
# Save the counts as a DataFrame with column 'value_counts' and index 'milesFromMetropolis'.
# Stored under its own name so the jobType counts are not overwritten, and
# named explicitly so the result does not depend on the pandas version.
milesFromMetropolis_counts = (
    data2['milesFromMetropolis']
    .value_counts()
    .rename_axis('milesFromMetropolis')
    .to_frame('value_counts')
)
milesFromMetropolis_counts.head(10)

## Let's work with some potential outliers

In [None]:
# Boolean mask marking the rows of data2 whose jobType is VICE_PRESIDENT,
# then show only those rows.
Filter1 = data2['jobType'].eq('VICE_PRESIDENT')
data2.loc[Filter1]

In [None]:
# Boolean mask marking the rows of data2 whose degree is DOCTORAL,
# then show only those rows.
Filter2 = data2['degree'].eq('DOCTORAL')
data2.loc[Filter2]

In [None]:
# Boolean mask marking the rows of data2 with more than 15 yearsExperience,
# then show only those rows.
Filter3 = data2['yearsExperience'].gt(15)
data2.loc[Filter3]

In [None]:
# Combine Filter2 and Filter3 element-wise to keep only the DOCTORAL rows that
# also have more than 15 years of experience.
data2.loc[Filter2 & Filter3]

In [None]:
# Boolean mask marking the rows of data3 whose salary is at least 150,
# then show only those rows.
Filter4 = data3['salary'].ge(150)
data3.loc[Filter4]

In [None]:
# Show only the rows of data2 whose major is MATH
data2.loc[data2['major'].eq('MATH')]

In [None]:
# Show only the rows of data3 whose salary is less than or equal to 200
data3.loc[data3['salary'].le(200)]

## Visualization

In [None]:
## Distribution of salary
sns.set_style('darkgrid')
# rcParams only affect figures created afterwards, so the size is set before
# the plot is drawn rather than after it.
rcParams['figure.figsize'] = 30,5
# distplot is deprecated; histplot with a KDE overlay on a density scale is
# its documented replacement.
Vis1 = sns.histplot(data3['salary'], bins = 20, kde = True, stat = 'density')
plt.show()

## Histogram of salary
sns.set_style('dark')
rcParams['figure.figsize'] = 30,5
plt.hist(data3.salary, bins = 20, color = 'Green')
plt.show()

# Horizontal bar chart of the first 10 salaries, one bar per jobId, with a legend.
# A histogram of a single value only draws a unit-height spike, so each salary
# is drawn as a bar instead; this also makes the axis labels (salary on x,
# jobId on y) describe what is actually plotted.
sns.set_style('dark')
rcParams['figure.figsize'] = 15,10

# head(10) selects the same rows as labels 0..9 as long as data3 keeps the
# default integer index it gets from read_csv.
first_ten = data3.head(10)
# One distinct colour per bar so every legend entry is distinguishable.
bar_colors = ['Black', 'Red', 'Green', 'Blue', 'Magenta',
              'Gray', 'Yellow', 'Pink', 'Orange', 'Cyan']
for job_id, job_salary, bar_color in zip(first_ten['jobId'], first_ten['salary'], bar_colors):
    plt.barh(job_id, job_salary, color = bar_color, label = job_id)

plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('salary', fontsize = 20, color = 'Blue')
plt.ylabel('jobId', fontsize = 20, color = 'Green')
plt.title('First 10th Salaries Analysis', fontsize = 30, color = 'DarkBlue', fontname = 'DejaVu Sans')
plt.legend(loc = 'upper left', bbox_to_anchor = (1,1))

plt.show()

### Let's plot each feature

In [None]:
## companyId
sns.set_style('dark')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 10,5
plt.hist(data2.companyId)
plt.show()

## companyId with the salary
# Overlaying a salary histogram on the companyId histogram puts two unrelated
# variables on one axis; the mean salary per companyId is plotted instead so
# the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 10,5
salary_by_companyId = data3['salary'].groupby(data2['companyId']).mean()
plt.bar(salary_by_companyId.index, salary_by_companyId.values, color = 'Orange')
plt.xlabel('companyId', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
# Rotated so that a large number of company labels stays readable.
plt.xticks(fontsize = 20, rotation = 90)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## jobType
sns.set_style('dark')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 15,8
plt.hist(data2.jobType)
plt.show()

## jobType with the salary
# Overlaying a salary histogram on the jobType histogram puts two unrelated
# variables on one axis; the mean salary per jobType is plotted instead so
# the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('dark')
rcParams['figure.figsize'] = 15,8
salary_by_jobType = data3['salary'].groupby(data2['jobType']).mean()
plt.bar(salary_by_jobType.index, salary_by_jobType.values, color = 'Red')
plt.xlabel('jobType', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## degree
sns.set_style('darkgrid')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 30,15
plt.hist(data2.degree)
plt.show()

## degree with the salary
# Overlaying a salary histogram on the degree histogram puts two unrelated
# variables on one axis; the mean salary per degree is plotted instead so
# the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 30,15
salary_by_degree = data3['salary'].groupby(data2['degree']).mean()
plt.bar(salary_by_degree.index, salary_by_degree.values, color = 'Purple')
plt.xlabel('degree', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## major
sns.set_style('darkgrid')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 10,5
plt.hist(data2.major)
plt.show()

## major with the salary
# Overlaying a salary histogram on the major histogram puts two unrelated
# variables on one axis; the mean salary per major is plotted instead so
# the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('white')
rcParams['figure.figsize'] = 30,5
salary_by_major = data3['salary'].groupby(data2['major']).mean()
plt.bar(salary_by_major.index, salary_by_major.values, color = 'Purple')
plt.xlabel('major', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## industry
sns.set_style('white')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 10,5
plt.hist(data2.industry)
plt.show()

## industry with the salary
# Overlaying a salary histogram on the industry histogram puts two unrelated
# variables on one axis; the mean salary per industry is plotted instead so
# the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 15,10
salary_by_industry = data3['salary'].groupby(data2['industry']).mean()
plt.bar(salary_by_industry.index, salary_by_industry.values, color = 'Red')
plt.xlabel('industry', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## yearsExperience
sns.set_style('darkgrid')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 15,8
plt.hist(data2.yearsExperience)
plt.show()

## yearsExperience with the salary
# Two histograms sharing one axis show counts of two differently scaled
# variables, not how salary varies with experience; the mean salary per
# yearsExperience value is plotted instead so the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 15,10
salary_by_yearsExperience = data3['salary'].groupby(data2['yearsExperience']).mean()
plt.bar(salary_by_yearsExperience.index, salary_by_yearsExperience.values, color = 'Purple')
plt.xlabel('yearsExperience', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## milesFromMetropolis
sns.set_style('darkgrid')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 10,5
plt.hist(data2.milesFromMetropolis)
plt.show()

## milesFromMetropolis with the salary
# Two histograms sharing one axis show counts of two differently scaled
# variables, not how salary varies with distance; the mean salary per
# milesFromMetropolis value is plotted instead so the axis labels describe the chart.
# NOTE(review): assumes data2 and data3 rows are aligned by index, as the
# correlation and regression cells of this notebook also do - confirm.
sns.set_style('dark')
rcParams['figure.figsize'] = 15,10
salary_by_milesFromMetropolis = data3['salary'].groupby(data2['milesFromMetropolis']).mean()
plt.bar(salary_by_milesFromMetropolis.index, salary_by_milesFromMetropolis.values, color = 'Red')
plt.xlabel('milesFromMetropolis', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
## Jointplots with degree and major
# jointplot creates its own square figure and does not read
# rcParams['figure.figsize'], so the size is requested through `height`.
# The rcParams default is still set so later figures keep the same default size.
rcParams['figure.figsize'] = 20,15
j = sns.jointplot(data = data2, x = 'degree', y = 'major', height = 15)
plt.show()

In [None]:
## Violinplots with degree and yearsExperience
# rcParams only affect figures created afterwards, so size the figure before
# the violin plot creates it.
rcParams['figure.figsize'] = 10,5
v = sns.violinplot(data = data2, x = 'degree', y = 'yearsExperience')
plt.show()

In [None]:
## Boxplots with major and yearsExperience
# rcParams only affect figures created afterwards, so size the figure before
# the box plot creates it.
rcParams['figure.figsize'] = 20,15
w = sns.boxplot(data = data2, x = 'major', y = 'yearsExperience')
plt.show()

## Correlation

In [None]:
from scipy import stats

### Let's calculate the Pearson Correlation Coefficient and P-value of some variables

In [None]:
# Pearson correlation between yearsExperience (data2) and salary (data3);
# pearsonr yields the coefficient and its p-value, which are unpacked and reported.
correlation_result = stats.pearsonr(data2['yearsExperience'], data3['salary'])
pearson_coef, p_value = correlation_result
print("The Pearson Correlation Coefficient is", pearson_coef, "with a P-value of P =", p_value)

In [None]:
# Pearson correlation between milesFromMetropolis (data2) and salary (data3);
# pearsonr yields the coefficient and its p-value, which are unpacked and reported.
correlation_result = stats.pearsonr(data2['milesFromMetropolis'], data3['salary'])
pearson_coef, p_value = correlation_result
print("The Pearson Correlation Coefficient is", pearson_coef, "with a P-value of P =", p_value)

## Model Development and Evaluation

### Simple Linear Regression with 'yearsExperience' as a variable

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the (not yet fitted) linear regression object; the bare name echoes it
lm = LinearRegression()
lm

In [None]:
# Set up a simple linear model with "yearsExperience" as the predictor and "salary" as the target.
# X: the "yearsExperience" column of data2, selected with double brackets so it stays a DataFrame.
# Y: the "salary" column of data3.
X = data2[['yearsExperience']]
Y = data3['salary']

In [None]:
# Fit the linear model on X (yearsExperience) against Y
lm.fit(X,Y)

In [None]:
# Calculate the R^2 (R-squared) of the fitted model on the same X and Y it was trained on
lm.score(X,Y)

In [None]:
# Output a prediction.
# The predictions get their own name: rebinding Y would replace the salary
# target that the later Ridge cell still passes to fit() as Y.
Yhat = lm.predict(X)
Yhat[0]

In [None]:
# Intercept (b) of the fitted line
lm.intercept_

In [None]:
# Slope (a) of the fitted line, returned as an array of coefficients
lm.coef_

In [None]:
## Final estimated linear model:
## X = Predictor Variable (yearsExperience)
## Y = Target Variable (salary)
## Y = aX + b

## Y = 2.01316292X + 91.91919121041059   ## Fitted slope and intercept
## Y ≈ 2.01X + 91.92                     ## Rounded (91.92 rounds to 92, not 91)
## salary ≈ 2.01*yearsExperience + 91.92

## Predictions from the fitted line
## For X = 0        Y ≈ 91.9
## For X = 5        Y ≈ 102.0
## For X = 10       Y ≈ 112.1
## For X = 15       Y ≈ 122.1

### Model Evaluation using Visualization

In [None]:
# Regression Plot
sns.set_style('dark')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 12,8
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and the keyword form works on older releases too.
sns.regplot(x = data2['yearsExperience'], y = data3['salary'], color = 'purple')
plt.xlabel('yearsExperience', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
# Residual Plot
sns.set_style('white')
# rcParams only affect figures created afterwards, so size the figure first.
rcParams['figure.figsize'] = 10,5
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and the keyword form works on older releases too.
sns.residplot(x = data2['yearsExperience'], y = data3['salary'], color = 'Orange')
plt.xlabel('yearsExperience', fontsize = 20, color = 'Blue')
plt.ylabel('salary', fontsize = 20, color = 'Green')
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

### Calculating Ridge Regression (RR)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression of salary on yearsExperience with regularisation strength 0.1.
# The target is taken straight from data3['salary'] so the model is trained on
# the observed salaries and cannot pick up model predictions stored in a
# reused variable.
RidgeModel = Ridge(alpha = 0.1)
RidgeModel.fit(X, data3['salary'])

In [None]:
# Predict with the fitted Ridge model. The result gets its own name so the
# variable used as the regression target is not replaced by predictions.
Yhat_ridge = RidgeModel.predict(X)
Yhat_ridge