## Multiple Linear Regression Basics

#### 1. Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

#### 2. Load Dataset

In [2]:
# Load the hiring dataset from the notebook-relative data folder.
hiring_csv_path = './data/hiring.csv'
df = pd.read_csv(hiring_csv_path)

In [3]:
# Preview the first rows to sanity-check what was loaded.
df.head()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(8, 4)

#### 3. Data Cleaning

In [5]:
# Strip the "(out of 10)" / "($)" suffixes so the columns have short,
# identifier-friendly names.
column_renames = {
    'test_score(out of 10)': 'test_score',
    'interview_score(out of 10)': 'interview_score',
    'salary($)': 'salary',
}
df = df.rename(columns=column_renames)

In [6]:
# Confirm the renamed column headers.
df.head(2)

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000


In [7]:
# List the columns that contain at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing]

Index(['experience', 'test_score'], dtype='object')

In [8]:
# Inspect the distinct experience values (NaN included) before imputing them.
df['experience'].unique()

array([nan, 'five', 'two', 'seven', 'three', 'ten', 'eleven'],
      dtype=object)

In [9]:
# Pick the most frequent experience label to use as the fill value.
# NOTE(review): every non-null label in this dataset appears exactly once, so
# idxmax() is only breaking a tie (it returns the first label holding the
# maximum count) -- confirm this is an acceptable default.
experience_counts = df['experience'].value_counts()
freq_exp = experience_counts.idxmax()
freq_exp

'five'

In [10]:
# Replace the missing experience entries with the most frequent label,
# then display the frame to verify the fill.
df['experience'] = df['experience'].fillna(freq_exp)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,five,8.0,9,50000
1,five,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [11]:
# Confirm no NaN remains among the experience labels after the fill.
df['experience'].unique()

array(['five', 'two', 'seven', 'three', 'ten', 'eleven'], dtype=object)

In [12]:
# Translate the spelled-out experience labels into integers so the column
# can be used as a numeric feature.
mapping = {
    'two': 2,
    'three': 3,
    'five': 5,
    'seven': 7,
    'ten': 10,
    'eleven': 11,
}

In [13]:
# Convert the experience labels to integers via `mapping`.
# Series.map() yields NaN for every value that is not a key of the dict, so a
# label missing from `mapping` -- or a second run of this cell, once the column
# already holds integers -- would silently blank the data. Validate first and
# fail loudly; the column is only overwritten when every value was mapped.
experience_numeric = df['experience'].map(mapping)
if experience_numeric.isna().any():
    unmapped = df.loc[experience_numeric.isna(), 'experience'].unique().tolist()
    raise ValueError(f"experience values missing from mapping: {unmapped}")
df['experience'] = experience_numeric

In [14]:
# Show the frame with the experience column now numeric.
df

Unnamed: 0,experience,test_score,interview_score,salary
0,5,8.0,9,50000
1,5,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [15]:
# Impute null values for test_score: compute the median of the observed
# scores to use as the fill value.
ts_median = df['test_score'].median()

In [16]:
# Fill the missing test_score with the median, then display the frame to
# verify that no gaps remain.
df['test_score'] = df['test_score'].fillna(ts_median)
df

Unnamed: 0,experience,test_score,interview_score,salary
0,5,8.0,9,50000
1,5,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,8.0,7,72000
7,11,7.0,8,80000


#### 4. Fit Regression Model

In [17]:
# Fit a linear regression of salary on the three cleaned candidate features.
feature_columns = ['experience', 'test_score', 'interview_score']
reg = linear_model.LinearRegression()
reg.fit(df[feature_columns], df['salary'])

In [18]:
# Fitted coefficients, one per feature column passed to fit(), in that order.
reg.coef_

array([3221.39134934, 1617.86554643, 3176.24086827])

In [19]:
# Fitted intercept term of the linear model.
reg.intercept_

5918.063888238721

#### 5. Predictions

In [20]:
# One sample as a (1, 3) array, in the feature order used at fit time:
# experience=6, test_score=9, interview_score=8.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# the later predict() cells reference it.
input = np.array([[6, 9, 8]])
input

array([[6, 9, 8]])

In [21]:
# Predict the salary for the single sample held in `input`.
reg.predict(input)



array([65217.12884826])

#### 6. Saving Model

In [27]:
import os
import pickle

In [28]:
# Create the output directory for serialized models. `exist_ok=True` makes
# this a no-op when the folder already exists, so the cell is safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)

In [29]:
# Serialize the fitted regressor to disk with pickle.
with open('./models/regmodel', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [30]:
# Read the pickled regressor back to verify the save/load round trip.
# pickle.load can execute arbitrary code embedded in the file, so only load
# pickles from a trusted source (here: the path written by this notebook).
with open('./models/regmodel', 'rb') as model_file:
    model = pickle.load(model_file)

In [31]:
# Predict with the reloaded model; the result should match reg.predict(input).
model.predict(input)



array([65217.12884826])