# Regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
tips_df = sns.load_dataset("tips")
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
X = tips_df.drop(['tip'], axis=1)
y = tips_df['tip']

X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [4]:
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

In [5]:
numerical = X.drop(['sex', 'smoker', 'day', 'time'], axis=1)

numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [6]:
categorical = X.filter(['sex', 'smoker', 'day', 'time'])
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [7]:
cat_numerical = pd.get_dummies(categorical, drop_first=True)
cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [8]:
X = pd.concat([numerical, cat_numerical], axis=1)
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
regressor = lin_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [11]:
from sklearn import metrics
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.7080218832979829
MSE:  0.8939195221609609
RMSE: 0.9454731736865731


## KNN Regression

In [12]:
from sklearn.neighbors import KNeighborsRegressor
KNN_reg = KNeighborsRegressor(n_neighbors=5)
regressor = KNN_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.7513877551020406
MSE:  0.9462902040816326
RMSE: 0.9727744877830794


## Random Forest Regression

In [13]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
regressor = rf_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.7185285714285715
MSE:  0.809937225918367
RMSE: 0.8999651248344944


## Support Vector Regression

In [14]:
from sklearn import svm
svm_reg = svm.SVR()

regressor = svm_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.7362521512772694
MSE:  0.9684825097223093
RMSE: 0.9841150896731079


## K-Fold Cross-Validation

In [15]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_absolute_error"))

[-0.66386205 -0.57007269 -0.63598762 -0.96960743 -0.87391702]


In [16]:
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [17]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)
regressor = rf_reg.fit(X_train, y_train)
single_record = sc.transform(X.values[100].reshape(1, -1))
predicted_tip = regressor.predict(single_record)
print(predicted_tip)

[2.26622]


### Predicting a price in the Diamonds dataset

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns 

diamonds_df = sns.load_dataset("diamonds")
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [20]:
X = diamonds_df.drop(['price'], axis=1)
y = diamonds_df['price']

In [21]:
numerical = X.drop(['cut', 'color', 'clarity'], axis=1)
categorical = X.filter(['cut', 'color', 'clarity'])

cat_num = pd.get_dummies(categorical, drop_first=True)

X = pd.concat([numerical, cat_num], axis=1)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [24]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
regressor = rf_reg.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 276.3791664555267
Mean Squared Error: 317460.9020968205
Root Mean Squared Error: 563.4366886322016
