In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Salary = pd.read_csv("Salary_Data.csv") #Importation of data
Salary[:5]

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [90]:
#Split the data into dependent(y) and independent(x)

x = Salary.iloc[:, 0:-1].values
y = Salary.iloc[:, -1].values
x[:3]

array([[1.1],
       [1.3],
       [1.5]])

In [91]:
#Split into Train and Test data

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 12)
x_test

array([[3.2],
       [7.9],
       [5.9],
       [4.5],
       [1.3],
       [7.1]])

In [92]:
#Standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)
x_test[:3]

array([[-0.78699876],
       [ 1.28714751],
       [ 0.40453207]])

### USING LINEAR REGRESSION

In [93]:
from sklearn.linear_model import LinearRegression
model = LinearRegression() #To create  an instance of the LinearRegression

# Training your model
model.fit(x_train, y_train)

# Making prediction on your Test data
y_predict = model.predict(x_test)
y_predict

array([ 54848.51172093, 112064.06260595,  87717.01967616,  70674.0896253 ,
        31718.82093763, 102325.24543403])

In [94]:
# To check coefficient

model.coef_

array([27585.10890795])

In [95]:
# To check intercept

model.intercept_

76557.95833333333

In [96]:
final = np.concatenate((y_predict.reshape(len(y_predict),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 54848.51172093,  54445.        ],
       [112064.06260595, 101302.        ],
       [ 87717.01967616,  81363.        ],
       [ 70674.0896253 ,  61111.        ],
       [ 31718.82093763,  46205.        ],
       [102325.24543403,  98273.        ]])

### USING DECISION TREE REGRESSION

In [97]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(criterion="squared_error",splitter="best", min_samples_split=3, min_samples_leaf=2)

# Training your model
model.fit(x_train, y_train)

# Making prediction on your Test data
y_predict2 = model.predict(x_test)
y_predict2

array([ 61617.33333333, 109608.33333333,  89588.66666667,  66983.5       ,
        38537.        , 109608.33333333])

In [98]:
final = np.concatenate((y_predict2.reshape(len(y_predict2),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 61617.33333333,  54445.        ],
       [109608.33333333, 101302.        ],
       [ 89588.66666667,  81363.        ],
       [ 66983.5       ,  61111.        ],
       [ 38537.        ,  46205.        ],
       [109608.33333333,  98273.        ]])

### USING RANDOM FOREST REGRESSION 

In [99]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, criterion="squared_error", min_samples_split=3, min_samples_leaf=2)

# Training your model
model.fit(x_train, y_train)

# Making prediction on your Test data
y_predict3 = model.predict(x_test)
y_predict3

array([ 60667.36666667, 111149.68166667,  92775.33333333,  69879.89595238,
        39654.51666667, 109079.07333333])

In [100]:
final = np.concatenate((y_predict3.reshape(len(y_predict3),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 60667.36666667,  54445.        ],
       [111149.68166667, 101302.        ],
       [ 92775.33333333,  81363.        ],
       [ 69879.89595238,  61111.        ],
       [ 39654.51666667,  46205.        ],
       [109079.07333333,  98273.        ]])

### SUPPORT VECTOR MACHINE

In [101]:
from sklearn.svm import LinearSVC
model = LinearSVC(penalty='l2',loss='squared_hinge',dual = 'auto')

# Training your model
model.fit(x_train, y_train)

# Making prediction on your Test data
y_predict4 = model.predict(x_test)
y_predict4

array([ 37731., 121872., 105582.,  60150.,  39343., 122391.])

In [102]:
final = np.concatenate((y_predict4.reshape(len(y_predict4),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 37731.,  54445.],
       [121872., 101302.],
       [105582.,  81363.],
       [ 60150.,  61111.],
       [ 39343.,  46205.],
       [122391.,  98273.]])

### USING NAIVE BAYES

In [103]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

# Training your model
model.fit(x_train, y_train)

# Making prediction on your Test data
y_predict5 = model.predict(x_test)
y_predict5

array([ 64445., 105582.,  91738.,  67938.,  39343., 113812.])

In [104]:
final = np.concatenate((y_predict5.reshape(len(y_predict5),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 64445.,  54445.],
       [105582., 101302.],
       [ 91738.,  81363.],
       [ 67938.,  61111.],
       [ 39343.,  46205.],
       [113812.,  98273.]])

### USING KNN

In [105]:
# Elbow method to know the value for k, by default k is 5
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

test_error_rates = []

for k in range(1, 20):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(x_train, y_train)
    y_pred_test = knn_model.predict(x_test)
    test_error = mean_squared_error(y_test, y_pred_test)
    test_error_rates.append(test_error)
test_error_rates #It seems the error is least at 11 

[93519086.5,
 94049661.16666667,
 80293616.38888893,
 44737894.28125,
 55228187.786666654,
 51977291.20370371,
 57960669.97619045,
 56146579.677083336,
 51595215.73456792,
 31083282.983333338,
 15317759.831955917,
 10879362.85185185,
 20752307.924063124,
 25976450.897959176,
 43396342.91925925,
 64019929.546875,
 92799514.85986157,
 126069771.59516461,
 165061441.97414577]

In [106]:
from sklearn.neighbors import KNeighborsRegressor
model6 = KNeighborsRegressor(n_neighbors=11, weights='uniform', algorithm='auto',leaf_size=30,p=2,metric='minkowski')
model6.fit(x_train, y_train)
y_predict6 = model6.predict(x_test)
y_predict6

array([ 53874.81818182, 103407.90909091,  81944.54545455,  65984.45454545,
        52262.27272727, 103407.90909091])

In [107]:
final = np.concatenate((y_predict6.reshape(len(y_predict6),1), y_test.reshape(len(y_test),1)),1)
final

array([[ 53874.81818182,  54445.        ],
       [103407.90909091, 101302.        ],
       [ 81944.54545455,  81363.        ],
       [ 65984.45454545,  61111.        ],
       [ 52262.27272727,  46205.        ],
       [103407.90909091,  98273.        ]])

### Checking for best model

In [108]:
#mean_absolute_error for Linear Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_predict)

#mean_absolute_error for Decision Tree Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae2 = mean_absolute_error(y_test, y_predict2)

#mean_absolute_error for Random Forest Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae3 = mean_absolute_error(y_test, y_predict3)

#mean_absolute_error for SVM
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae4 = mean_absolute_error(y_test, y_predict4)

#mean_absolute_error for Naives Bayes
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae5 = mean_absolute_error(y_test, y_predict5)

#mean_absolute_error for KNN
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae6 = mean_absolute_error(y_test, y_predict6)

print(f"The mean_absolute_error for Linear Regression is {mae}")
print(f"The mean_absolute_error for Decision Tree Regression is {mae2}")
print(f"The mean_absolute_error for Random Forest Regression is {mae3}")
print(f"The mean_absolute_error for SVM Regression is {mae4}")
print(f"The mean_absolute_error for Naives Bayes is {mae5}")
print(f"The mean_absolute_error for KNN is {mae6}")

The mean_absolute_error for Linear Regression is 7603.518020791035
The mean_absolute_error for Decision Tree Regression is 8096.694444444444
The mean_absolute_error for Random Forest Regression is 8934.63904761905
The mean_absolute_error for SVM Regression is 15574.0
The mean_absolute_error for Naives Bayes is 8980.5
The mean_absolute_error for KNN is 3220.5454545454536


In [109]:
#Mean Squared Error (MSE) for Linear Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_predict)

#Mean Squared Error (MSE) for Decision Tree Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse2 = mean_squared_error(y_test, y_predict2)

#Mean Squared Error (MSE) for Random Forest Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse3 = mean_squared_error(y_test, y_predict3)

#Mean Squared Error (MSE) for SVM
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse4 = mean_squared_error(y_test, y_predict4)

#Mean Squared Error (MSE) for Naives Bayes
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse5 = mean_squared_error(y_test, y_predict5)

#Mean Squared Error (MSE) for KNN
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mse6 = mean_squared_error(y_test, y_predict6)

print(f"The Mean Squared Error for Linear Regression is {mse}")
print(f"The Mean Squared Error for Decision Tree Regression is {mse2}")
print(f"The Mean Squared Error for Random Forest Regression is {mse3}")
print(f"The Mean Squared Error for SVM Regression is {mse4}")
print(f"The Mean Squared Error for Naives Bayes is {mse5}")
print(f"The Mean Squared Error for KNN is {mse6}")

The Mean Squared Error for Linear Regression is 79013523.22577047
The Mean Squared Error for Decision Tree Regression is 68312232.17129628
The Mean Squared Error for Random Forest Regression is 83751603.71048285
The Mean Squared Error for SVM Regression is 319788524.3333333
The Mean Squared Error for Naives Bayes is 93519086.5
The Mean Squared Error for KNN is 15317759.831955917


In [110]:
#Root Mean Squared Error (RMSE) for Linear Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse = mean_squared_error(y_test, y_predict, squared=False)

#Root Mean Squared Error (RMSE) for Decision Tree Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse2 = mean_squared_error(y_test, y_predict2, squared=False)

#Root Mean Squared Error (RMSE) for Random Forest Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse3 = mean_squared_error(y_test, y_predict3, squared=False)

#Root Mean Squared Error (RMSE)) for SVM
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse4 = mean_squared_error(y_test, y_predict4, squared=False)

#Root Mean Squared Error (RMSE) for Naives Bayes
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse5 = mean_squared_error(y_test, y_predict5, squared=False)

#Root Mean Squared Error (RMSE) for KNN
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rmse6 = mean_squared_error(y_test, y_predict6, squared=False)

print(f"The Root Mean Squared Error for Linear Regression is {rmse}")
print(f"The Root Mean Squared Error for Decision Tree Regression is {rmse2}")
print(f"The Root Mean Squared Error for Random Forest Regression is {rmse3}")
print(f"The Root Mean Squared Error for SVM Regression is {rmse4}")
print(f"The Root Mean Squared Error for Naives Bayes is {rmse5}")
print(f"The Root Mean Squared Error for KNN is {rmse6}")

The Root Mean Squared Error for Linear Regression is 8888.955125647248
The Root Mean Squared Error for Decision Tree Regression is 8265.12142507878
The Root Mean Squared Error for Random Forest Regression is 9151.5902285058
The Root Mean Squared Error for SVM Regression is 17882.63191852176
The Root Mean Squared Error for Naives Bayes is 9670.526691964611
The Root Mean Squared Error for KNN is 3913.790979594582


In [111]:
# R-squared (coefficient of determination)  for Linear Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2 = r2_score(y_test, y_predict)

# R-squared (coefficient of determination) for Decision Tree Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_2 = r2_score(y_test, y_predict2)

# R-squared (coefficient of determination) for Random Forest Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_3 = r2_score(y_test, y_predict3)

# R-squared (coefficient of determination) for SVM
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_4 = r2_score(y_test, y_predict4)

# R-squared (coefficient of determination) for Naives Bayes
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_5 = r2_score(y_test, y_predict5)

# R-squared (coefficient of determination) for KNN
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r2_6 = r2_score(y_test, y_predict6)

print(f"The R-squared (coefficient of determination) for Linear Regression is {r2}")
print(f"The R-squared (coefficient of determination) for Decision Tree Regression is {r2_2}")
print(f"The R-squared (coefficient of determination) for Random Forest Regression is {r2_3}")
print(f"The R-squared (coefficient of determination) for SVM Regression is {r2_4}")
print(f"The R-squared (coefficient of determination) for Naives Bayes is {r2_5}")
print(f"The R-squared (coefficient of determination) for KNN is {r2_6}")

The R-squared (coefficient of determination) for Linear Regression is 0.8250362123724115
The R-squared (coefficient of determination) for Decision Tree Regression is 0.8487326422866438
The R-squared (coefficient of determination) for Random Forest Regression is 0.8145444323093848
The R-squared (coefficient of determination) for SVM Regression is 0.29187550215519875
The R-squared (coefficient of determination) for Naives Bayes is 0.7929157830013658
The R-squared (coefficient of determination) for KNN is 0.9660810812028866


In [131]:
#KNN seems to be the best model
YearsExperience = float(input("Enter Years Of Experience "))

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 12)

from sklearn.neighbors import KNeighborsRegressor
model6 = KNeighborsRegressor(n_neighbors=11, weights='uniform', algorithm='auto',leaf_size=30,p=2,metric='minkowski')

model6.fit(x_train, y_train)

x_test2 = [[YearsExperience]]
y_predict6 = model6.predict(x_test2)
print(f"The estimated salary = {y_predict6}")

Enter Years Of Experience 7.1
The estimated salary = [92567.54545455]
