In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [25]:
from warnings import filterwarnings 
# Suppress every warning for the rest of the session (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below -- consider narrowing the filter to specific categories.
filterwarnings('ignore')

# Model & Prediction

In [26]:
# Load the Hitters data set and discard every row that contains a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# Dependent variable (target): the player's salary.
y = df["Salary"]

# One-hot encode the three categorical columns: each category becomes its own
# indicator column (1 where the row has that category, 0 otherwise).
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Numeric predictors: everything except the target and the raw categorical
# columns, cast to float64.
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")

# Final feature matrix: the numeric predictors plus one indicator per categorical
# variable (the complementary indicator of each pair is left out).
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [27]:
df.head()
# Hitters: a data set of baseball players.
# Goal: predict each player's Salary from the other variables.

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [28]:
# Fit a support vector regressor with a linear kernel on the training split.
# A non-linear kernel (e.g. kernel="rbf") could be passed instead.
svr_model = SVR(kernel="linear").fit(X_train, y_train)

In [29]:
# Print the model's hyperparameters; "C" is the penalty (regularization) parameter.
print(svr_model.get_params())

{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [30]:
# Predicted salaries for the first five training observations.
svr_model.predict(X_train)[0:5]

array([219.32622627, 702.43039317, 623.20559641, 153.77538484,
       463.15191157])

In [31]:
# Predicted salaries for the first five test observations.
svr_model.predict(X_test)[0:5]

array([679.14754919, 633.72883529, 925.68639938, 270.28464317,
       530.26659421])

In [32]:
# With a linear kernel the fitted SVR is an explicit linear function,
# so its intercept (beta_0) can be read directly.
svr_model.intercept_

array([-80.15196063])

In [33]:
# Per-feature coefficients (beta_1 ... beta_p); together with the intercept (beta_0)
# they let us write the model in its explicit mathematical form.
svr_model.coef_

array([[ -1.2183904 ,   6.09602978,  -3.67574533,   0.14217072,
          0.51435925,   1.28388992,  12.55922527,  -0.08693754,
          0.46597185,   2.98259931,   0.52944513,  -0.79820793,
         -0.16015531,   0.30872795,   0.28842348,  -1.79560066,
          6.41868986, -10.74313785,   1.33374319]])

In [34]:
# Test error of the untuned ("primitive") SVR model.
y_pred = svr_model.predict(X_test)
# RMSE: square root of the mean of the squared errors.
np.sqrt(mean_squared_error(y_test,y_pred))

370.0408415795005

In [35]:
# The hyperparameter to optimise is "C" (the penalty parameter).
# Candidate C values to compare with GridSearchCV and k-fold cross-validation.
svr_params = {"C": [0.1,0.5,1,3]}

In [36]:
# Grid search over the candidate C values using 5-fold cross-validation on the training split.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score (R^2 for regressors), not by the RMSE reported elsewhere in this notebook.
svr_cv_model = GridSearchCV(svr_model,svr_params, cv=5).fit(X_train,y_train)

In [37]:
# Hyperparameter setting that achieved the best cross-validated score.
svr_cv_model.best_params_

{'C': 0.5}

In [38]:
# Fit the final model with the C chosen by the grid search rather than a
# hard-coded copy of it, so this cell stays correct if the data, the split or
# the candidate grid changes. For the search recorded in this notebook the
# selected value is C=0.5, so the resulting model is unchanged.
svr_tuned = SVR(kernel="linear", C=svr_cv_model.best_params_["C"]).fit(X_train, y_train)

In [39]:
# Test RMSE of the tuned model (same metric as used for the untuned model).
# Note: this overwrites the untuned model's y_pred.
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test,y_pred))

367.98747616655294

In [None]:
#The untuned ("primitive") model's test RMSE was about 370; tuning C lowers it only slightly, to about 368.