In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('job-data.csv')


In [3]:
df.head(3)

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons,Salary
0,12/10/2023,Junior Software Developer(Backend),Fresh graduate,Satisfied,On Site,No,I am happy with my work role,19000
1,03/01/2022,Data Scientist,More than 2 years,Satisfied,Remote,Yes,Better salary/benefits,50000
2,02/06/2022,Data Scientist,More than 2 years,Satisfied,On Site,Yes,"Career Development, Salary and Benefits",115000


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
X = df.drop('Salary', axis=1)
y = df.Salary

In [6]:
le_x = LabelEncoder()
X = X.apply(le_x.fit_transform)

In [7]:
X

Unnamed: 0,joining_date,job_title,required_experience,Overall_Satisfication,Working_model,Considering_Job_Switching_in_Future,Main_Reasons
0,149,19,0,2,1,0,4
1,36,5,6,2,2,1,0
2,30,5,6,2,1,1,1
3,13,31,6,4,1,0,4
4,40,33,6,2,1,1,6
...,...,...,...,...,...,...,...
538,172,30,15,4,1,1,2
539,187,27,14,1,0,1,0
540,122,41,6,4,1,1,2
541,160,21,0,2,0,0,4


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

In [9]:
from sklearn.svm import SVC

In [10]:
svm_model = SVC(kernel='rbf', C=1, gamma='auto')
svm_model.fit(X_train, y_train)

In [11]:
svm_model.score(X_test, y_test)

0.36764705882352944

### Fine Tuning and find best parameter for SVM, using GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(SVC(gamma='auto'), {
    'C' : [1, 10, 20, 30],
    'kernel' : ['rbf', 'linear', 'poly']
}, cv=5)

clf.fit(X, y)
clf.cv_results_



{'mean_fit_time': array([6.52830124e-02, 2.06131454e+00, 1.01443203e+03, 7.68259525e-02,
        1.25427284e+01, 1.11910266e+03, 5.81560612e-02, 1.85222070e+01,
        1.21472834e+03, 6.19801044e-02, 2.75802248e+01, 1.20213020e+03]),
 'std_fit_time': array([1.37962331e-02, 3.28700171e-01, 5.64527280e+02, 9.96096326e-03,
        3.18118116e+00, 6.57156381e+02, 6.58917819e-03, 4.47232706e+00,
        6.77247315e+02, 2.87915147e-03, 5.61894696e+00, 7.00947186e+02]),
 'mean_score_time': array([0.00897985, 0.01073437, 0.00862579, 0.01178994, 0.00994663,
        0.00862837, 0.01089697, 0.00799336, 0.00760307, 0.0079793 ,
        0.01080422, 0.00762644]),
 'std_score_time': array([0.00112121, 0.0033946 , 0.00206503, 0.00113633, 0.0037204 ,
        0.00282371, 0.00365626, 0.00567417, 0.00163839, 0.00447849,
        0.00408584, 0.00206441]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 20, 20, 20, 30, 30, 30],
              mask=[False, False, False, False, False, False, False, False,
 

In [13]:
new_df = pd.DataFrame(clf.cv_results_)
new_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.065283,0.013796,0.00898,0.001121,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.238532,0.183486,0.330275,0.342593,0.509259,0.320829,0.11109,4
1,2.061315,0.3287,0.010734,0.003395,1,linear,"{'C': 1, 'kernel': 'linear'}",0.12844,0.192661,0.266055,0.222222,0.287037,0.219283,0.056121,12
2,1014.432029,564.52728,0.008626,0.002065,1,poly,"{'C': 1, 'kernel': 'poly'}",0.229358,0.137615,0.321101,0.342593,0.490741,0.304281,0.118245,5
3,0.076826,0.009961,0.01179,0.001136,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.256881,0.211009,0.366972,0.351852,0.537037,0.34475,0.112362,1
4,12.542728,3.181181,0.009947,0.00372,10,linear,"{'C': 10, 'kernel': 'linear'}",0.155963,0.165138,0.266055,0.268519,0.287037,0.228542,0.056062,11
5,1119.102662,657.156381,0.008628,0.002824,10,poly,"{'C': 10, 'kernel': 'poly'}",0.229358,0.137615,0.321101,0.342593,0.490741,0.304281,0.118245,5
6,0.058156,0.006589,0.010897,0.003656,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.256881,0.211009,0.366972,0.351852,0.537037,0.34475,0.112362,1
7,18.522207,4.472327,0.007993,0.005674,20,linear,"{'C': 20, 'kernel': 'linear'}",0.183486,0.165138,0.275229,0.277778,0.296296,0.239585,0.054102,10
8,1214.728337,677.247315,0.007603,0.001638,20,poly,"{'C': 20, 'kernel': 'poly'}",0.229358,0.137615,0.321101,0.342593,0.490741,0.304281,0.118245,5
9,0.06198,0.002879,0.007979,0.004478,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.256881,0.211009,0.366972,0.351852,0.537037,0.34475,0.112362,1


In [14]:
clf.best_params_

{'C': 10, 'kernel': 'rbf'}

In [15]:
clf.best_score_

0.34475025484199795

In [16]:
svm_model_updated = SVC(kernel='rbf', C=10, gamma='auto')
svm_model_updated.fit(X_train, y_train)
svm_model_updated.score(X_test, y_test)

0.39705882352941174