# Use-case: Salary Prediction
## We need to create a model that can predict an employee's salary based on the employee's years of experience (`YearsExperience`)

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Load the raw salary records; the path is relative to the working directory.
salaryDataDF = pd.read_csv(filepath_or_buffer="Salary_Data.csv")

In [6]:
salaryDataDF.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [8]:
salaryDataDF.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [9]:
salaryDataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 624.0 bytes


In [10]:
salaryDataDF.isnull().sum()

YearsExperience    1
Salary             1
dtype: int64

In [14]:
# Imputation is applied only to features, never to the label:
# we cannot know which answer should be filled in for a given question.
# The machine has to learn the answers from the questions, so if we put wrong
# answers in, the machine also learns wrong answers and the model will NOT be
# a good model for future predictions.
# If data in the label is missing, it is better to remove the row.
# Note: dropna() with default arguments removes every row that has a missing
# value in any column. The result is reassigned instead of using inplace=True,
# which pandas discourages and which prevents method chaining.
salaryDataDF = salaryDataDF.dropna()

In [15]:
salaryDataDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 720.0 bytes


In [16]:
salaryDataDF.isnull().sum()

YearsExperience    0
Salary             0
dtype: int64

In [18]:
salaryDataDF.describe()

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


In [10]:
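# Rules (not guidelines) for regression with scikit-learn
# 1. Features and label must be in the form of NumPy arrays
# 2. Features must be a 2D array of shape (n_samples, n_features)
# 3. Label may be a 1D array (n_samples,) or a 2D array (n_samples, n_targets); this notebook uses 2D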

In [11]:
# Assuming we completed all Inappropriate, Outlier, Data Preprocessing Steps
# the Next STep is:

In [19]:
# Separate the data into features (X) and label (y).
# Columns are selected by name rather than by position, so a reordered CSV
# cannot silently swap the feature and the label. The list selector ([[...]])
# keeps each result two-dimensional, i.e. shape (n_samples, 1).
featuresDF = salaryDataDF[["YearsExperience"]].to_numpy()
labelDF = salaryDataDF[["Salary"]].to_numpy()

In [7]:
featuresDF

NameError: name 'featuresDF' is not defined

In [2]:
labelDF

NameError: name 'labelDF' is not defined

ML Coding Begins

In [16]:
# Before you initiate the coding, you must know two things from your data scientist
# 1. Approved Significance Level for the Project
# 2. Timeline to Develop and Deploy the Model

Process:

In [16]:
# Step 1. Create Train Test Split
# Step 2. Build the Model
# Step 3. Check the Quality of Model w.r.t. CL
# Step 4. If CL is satisfied, perform DEPLOYMENT; otherwise go back to Step 2 with some CHANGES.

In [None]:
# Building a machine learning model is an iterative process; the iteration stops (breaks) once the CL value is met.

### 1. Create Train Test Split
#### scikit-learn works on NumPy arrays

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    featuresDF, labelDF, test_size=0.2, random_state=10
)

In [24]:
# 2. Build the Model

from sklearn.linear_model import LinearRegression

# .fit() trains the model on the training data: it calculates the best values
# of the coefficients and the intercept. The number of coefficients depends on
# the number of features; there is a single intercept term.
# No transform step is applied while building the model, because this step is
# only about learning the coefficients and the intercept.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

# 3. Check the Quality of Model w.r.t. CL

In [34]:
# 1. We need to check whether the trained model is a generalized model or not
# 2. Generalization is all about how the model performs with UNKNOWN DATA
# 3. Given Model --> Underfitted Model, Generalized Model , Overfitted Model
# 4. SL ----> 0.02 for this problem
# 5. test_score > train_score and test_score >= CL ----> Model is Generalized
# 6. train score > test_score -------------------------> Model is Overfitted (acc > 20 or acc_train > CL)

In [25]:
#Training Score (R^2, the coefficient of determination - not classification accuracy)
model.score(X_train,y_train) #0-1

0.9494673013344644

In [26]:
#Testing Score
model.score(X_test,y_test)

0.9816423482070253

In [28]:
# check for quality
# 1. test_score > train_score then 
# 2. test_score >= CL, where CL = 1 - SL = 1 - 0.02 = 0.98 (98%)
# This is Generalised model.

In [29]:
#Coef
model.coef_

array([[9356.86299354]])

In [30]:
#Intercept
model.intercept_

array([26089.09663242])

In [31]:
#Equation of Line?
# salary = 26089.09663242 + (9356.86299354 * YearsExperience)
# What is the salary of a fresher (exp = 0)? It is the intercept: 26089.09663242

# Deployment of Model for Prediction 
### (Production Environment at user place)
#### 1. Pickle is a package (library) which will convert the memory object into file.
#### 2. The model is an in-memory object that we need to persist.
#### 3. In-memory objects are volatile in nature: they vanish when the process ends, so we want to persist this object.
#### 4. To persist this object we need to store in the form of file.
#### 5. pickle will help to do this.

# file handling in python

In [33]:
# The open() function in Python opens a file; in write mode it creates the
# file if it does not exist.
# Syntax: open(filename, mode)
#   wb - write bytes to the file
#   rb - read bytes from the file
# Common model file formats: pkl, mdl, tf2, hdf, custom
import pickle

# The with-block closes the file handle as soon as the dump finishes, so the
# serialized model is flushed to disk and the handle is not leaked.
with open('SalaryPredictor.model', 'wb') as model_file:
    pickle.dump(model, model_file)

In [34]:
# we can use this model in the software
# file format may vary from one user to another, .pkl also can be used
# In the market, people are presently using .model
# standard formats of the model is pkl,mdl,tf2,hdf,custom
# .model is the part of custom format