# Regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [19]:
tips_df = sns.load_dataset("tips")
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Dividing Data into Features and Labels

In [3]:
X = tips_df.drop(['tip'],axis=1)
y = tips_df["tip"]

In [4]:
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [5]:
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

## Converting Categorical Data to Numbers

In [6]:
numerical = X.drop(["sex","smoker","day","time"],axis=1)
numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [7]:
categorical = X.filter(["sex","smoker","day","time"],axis=1)
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [8]:
cat_numerical = pd.get_dummies(categorical,drop_first=True)
cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [10]:
X = pd.concat([numerical,cat_numerical],axis=1)
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [12]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Linear Regression

### Pros
1. Linear regression is a simple to implement and easily interpretable algorithm.
2. Take less time to train, even for huge dataset. 
3. Linear regression coefficients are easy to interpret. 

### Cons
1. Performance is easily affected by outlier presence. 
2. Assumes a linear relationship btw dependent and independent variables, which can result in an increased error. 

In [13]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
regressor = lin_reg.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

### Error Functions

- Mean Absolute Error
- Mean Squared Error
- Root Mean Squared Error

In [15]:
from sklearn import metrics

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.6366762541802364
MSE:  0.7159134231087628
RMSE:  0.846116672279162


## KNN Regression

### Pros

1. KNN doesn't assume any relationship btw features. 
2. Useful for a dataset where data localization is important. 
3. Only have to tune the parameter K which is the number of nearest neighbors. 
4. No training is needed, as it is a lazy learning algorithm. 
5. Recommender systems and finding semantic similarity btw the docs are major applications of the KNN. 

### Cons
1. You have to find the optimal value for K which isn't easy. 
2. Not suitable for very high dim data. 

In [16]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

knn_reg = KNeighborsRegressor(n_neighbors=5)

regressor = knn_reg.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.7077959183673468
MSE:  0.8681808163265307
RMSE:  0.9317622101837628


## Random Forest

### Pros
1. You have a lot missing data or imbalanced dataset. 
2. You can avoid overfitting when use a lot tree models. 
3. It can be used when you have high dim data. 
4. Through cross-validation, the random forest can achieved a high accuracy. 
5. It can solve both classification problem and regression problem. 

### Cons
1. Using a large number of trees can slow down the algorithm. 
2. Random forest algorithm is a predictive algorithm, which can only predict the future but cannot explain what happended in the past and current. 

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

rfr_reg = RandomForestRegressor(random_state=42,n_estimators=500)

regressor = rfr_reg.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.68998530612245
MSE:  0.8162551050938808
RMSE:  0.9034683752594115


## Making Prediction with a Single Record

In [21]:
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [22]:
single_record = sc.transform(X.values[100].reshape(1,-1))
predicted_tips = regressor.predict(single_record)
predicted_tips



array([1.80906])

## Multiple Outputs

In [24]:
from sklearn.datasets import make_regression

X,y=make_regression(n_samples=2000,n_features=8,n_informative=4,n_targets=3,random_state=42,noise=0.3)
print(X.shape,y.shape)

(2000, 8) (2000, 3)


### By using Linear Regression

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [38]:
lin_reg = LinearRegression()
regressor = lin_reg.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.2440062200409584
MSE:  0.09288200051053673
RMSE:  0.3047654844475285


In [39]:
single_record = sc.transform(X_test[50].reshape(1,-1))
predicted_val = regressor.predict(single_record)
predicted_val

array([[ 52.14499321, 154.07153888,  29.65411176]])

### By using Random Forest

In [32]:
rfr_reg = RandomForestRegressor(random_state=42,n_estimators=500)

regressor = rfr_reg.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  17.57531467189482
MSE:  737.4958744399141
RMSE:  27.156875270176318


In [34]:
single_record = sc.transform(X_test[50].reshape(1,-1))
predicted_val = regressor.predict(single_record)
predicted_val

array([[ 15.29925902, 114.41624666,  12.90183432]])

### By using LinearSVR

In [42]:
from sklearn.svm import LinearSVR
from sklearn.multioutput  import MultiOutputRegressor

svr_reg = LinearSVR()
wrap_clf = MultiOutputRegressor(svr_reg)
regressor = wrap_clf.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.246230582141114
MSE:  0.09442034104816721
RMSE:  0.3072789303681058




In [43]:
single_record = sc.transform(X_test[50].reshape(1,-1))
predicted_val = regressor.predict(single_record)
predicted_val, y_test[50]

(array([[ 52.09758776, 154.03546895,  29.65585514]]),
 array([ 50.3331556 , 155.43458476,  26.52621361]))

### By using Regression Chain

In [44]:
from sklearn.svm import LinearSVR
from sklearn.multioutput  import RegressorChain

svr_reg = LinearSVR()
wrap_clf = RegressorChain(svr_reg,order=[0,1,2])
regressor = wrap_clf.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  0.4145163296456884
MSE:  0.2850170835623316
RMSE:  0.5338699125838912




In [45]:
single_record = sc.transform(X_test[50].reshape(1,-1))
predicted_val = regressor.predict(single_record)
predicted_val, y_test[50]

(array([[ 52.11710137, 153.74191829,  30.11411515]]),
 array([ 50.3331556 , 155.43458476,  26.52621361]))