## Encoding Categoricals
- Encode categorical data
- Fit the model on the training data
- Make predictions on the test data

In [86]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [110]:
# Read the prepared regression dataset from the working directory and preview it.
data = pd.read_csv(filepath_or_buffer='regression_data.csv')
data.head(n=5)

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,7.760467,17.343389,4.181353,Male,21.0
1,3.08,6.20859,16.230984,4.150313,Male,3.0
2,7.5,7.113956,18.047227,4.205057,Female,20.0
3,6.7,5.783825,11.73711,4.055333,Male,5.0
4,8.785714,6.64379,12.494862,4.088969,Female,10.0


In [111]:
# X-y split: separate the target column from the features.
# Consider at which point of the preprocessing this split is best performed.
X = data.drop(columns=['TARGET_D'])
y = data['TARGET_D']

In [112]:
# Partition the features by dtype so the categoricals can be encoded separately.
X_cat = X.select_dtypes(include='object')
X_num = X.select_dtypes(include='number')

In [113]:
# Inspect the categorical features (only the 'gender' column in this dataset).
X_cat

Unnamed: 0,gender
0,Male
1,Male
2,Female
3,Male
4,Female
...,...
4665,Male
4666,Male
4667,Female
4668,Male


In [114]:
# Frequency of each gender level; shows which categories the encoder has to handle.
X_cat['gender'].value_counts()

Female    2664
Male      1895
U          111
Name: gender, dtype: int64

In [115]:
# Normalizer rescales every row (sample) of the numeric features to unit norm.
transformer = Normalizer()
transformer.fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)

(4670, 4)


In [116]:
# Inspect the normalized numeric features (a NumPy array, one row per sample).
x_normalized

array([[0.62312259, 0.31198208, 0.69722951, 0.16809648],
       [0.16988261, 0.34244528, 0.89524739, 0.22891755],
       [0.35345786, 0.3352645 , 0.85052457, 0.19817472],
       ...,
       [0.54236194, 0.313505  , 0.75336132, 0.20001218],
       [0.6463473 , 0.31813189, 0.66255726, 0.20504914],
       [0.68267219, 0.25989219, 0.66342472, 0.16211839]])

In [117]:
# Quick one-hot encoding with pandas; the first level of each column is dropped.
# get_dummies keeps no fitted state, so it won't do if the encoding info must be saved.
cat_data = pd.get_dummies(data=X_cat, drop_first=True)
cat_data.head(20)

Unnamed: 0,gender_Male,gender_U
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0
5,1,0
6,0,0
7,1,0
8,0,0
9,0,0


In [118]:
# In case you need to use the encoder somewhere else besides your notebook:
# a fitted OneHotEncoder object can be kept and re-applied to other data.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [102]:
# X_cat_new = X_cat.copy()

In [104]:
# X_cat_new.loc[0,'gender'] = 'abc'

In [106]:
# X_cat_new.gender.value_counts()

Female    2664
Male      1894
U          111
abc          1
Name: gender, dtype: int64

In [107]:
# encoder = OneHotEncoder(handle_unknown='error', drop='first')
# encoder.fit(X_cat_new)

# encoded = encoder.transform(X_cat_new).toarray()
# encoded #.shape # 

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 0.],
       ...,
       [0., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [None]:
# 0,0,0
# 0,0,1
# 0,1,0
# 1,0,0

In [119]:
# Mapping learned after the .fit(): 'Female' is the dropped baseline (all zeros),
# the remaining two columns flag 'Male' and 'U' respectively.
dict(Male=[1, 0], Female=[0, 0], U=[0, 1])

{'Male': [1, 0], 'Female': [0, 0], 'U': [0, 1]}

In [120]:
# Apply the fitted encoder to the categoricals and convert the result to a dense array.
encoded = encoder.transform(X_cat).toarray()
encoded # display the dense one-hot array

array([[1., 0.],
       [1., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [1., 0.],
       [1., 0.]])

In [121]:
# One row per sample and one column per category kept after dropping the first.
encoded.shape

(4670, 2)

In [122]:
# Categories learned during fit; with drop='first' the first entry has no column of its own.
encoder.categories_

[array(['Female', 'Male', 'U'], dtype=object)]

In [123]:
# Wrap the encoded array in a DataFrame for readability, labelling the columns with
# the categories that were kept (the first category was dropped by drop='first').
# The raw NumPy array works for the model as well.
df = pd.DataFrame(encoded, columns=encoder.categories_[0][1:])
df

Unnamed: 0,Male,U
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
4665,1.0,0.0
4666,1.0,0.0
4667,0.0,0.0
4668,1.0,0.0


In [33]:
# Extra: also check out label encoding, which maps each category to an integer code
# (here Female -> 0, Male -> 1, U -> 2).
# LabelEncoder expects 1-D input, so pass the 'gender' Series rather than the
# one-column DataFrame; a 2-D column vector triggers a DataConversionWarning.
# Note: `le` holds the encoded integer array, not the encoder object itself.
le = LabelEncoder().fit_transform(X_cat['gender'])
le[:20]

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0])

In [27]:
# Counts per gender level, for comparison with the label-encoded integer codes.
X_cat['gender'].value_counts()

Female    2664
Male      1895
U          111
Name: gender, dtype: int64

In [32]:
# First 20 raw gender values, to compare against the first 20 label codes.
X_cat['gender'].head(20)

0       Male
1       Male
2     Female
3       Male
4     Female
5       Male
6     Female
7       Male
8     Female
9     Female
10    Female
11    Female
12      Male
13    Female
14    Female
15    Female
16    Female
17         U
18    Female
19    Female
Name: gender, dtype: object

In [138]:
# Sanity check: every candidate feature block must have the same number of rows.
# The 1-D label codes need an extra axis before they can be joined column-wise.
x_normalized.shape, le.shape, encoded.shape, le[:, np.newaxis].shape

((4670, 4), (4670,), (4670, 2), (4670, 1))

--------

### Comparing models

In [130]:
# Build the feature matrix by joining numeric and encoded categorical columns.
# Enable exactly one of the four variants below, then re-run the cells that follow.
# X = np.hstack([X_num, encoded])  # using non-normalized and one-hot encoded data
X = np.hstack([x_normalized, encoded])  # using normalized and one-hot encoded data
# X = np.hstack([X_num, np.expand_dims(le, axis=1)])  # using non-normalized and label encoded data
# X = np.hstack([x_normalized, np.expand_dims(le, axis=1)])  # using normalized and label encoded data

In [131]:
# Train-test split: hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

x,y
1,3
2,5
3,7
4,9
5,?-> 11

y = 2*x + 1

x1,x2,y
1,1,2
2,3,5
3,10,13
4,15,19
5,20,?-> 25

y = x1 + x2

In [132]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [133]:
# Predict on the held-out rows and confirm there is one prediction per test sample.
predictions = model.predict(X_test)
np.shape(predictions)

(1401,)

In [129]:
# using non-normalized and one-hot encoded data
# Displays (R2, RMSE, MSE) for whichever X variant was used to fit the model.
# RMSE is the explicit square root of the MSE: the `squared=False` flag is
# deprecated in scikit-learn 1.4 and removed in 1.6.
r2_score(y_test, predictions), np.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.2705944957872364, 11.414726728148338, 130.29598627830407)

In [134]:
# using normalized and one-hot encoded data
# Displays (R2, RMSE, MSE) for whichever X variant was used to fit the model.
# RMSE is the explicit square root of the MSE: the `squared=False` flag is
# deprecated in scikit-learn 1.4 and removed in 1.6.
r2_score(y_test, predictions), np.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.34480717480585454, 10.818463076649035, 117.03914334081851)

In [80]:
# using non-normalized and label encoded data
# Displays (R2, RMSE, MSE) for whichever X variant was used to fit the model.
# RMSE is the explicit square root of the MSE: the `squared=False` flag is
# deprecated in scikit-learn 1.4 and removed in 1.6.
r2_score(y_test, predictions), np.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.2705750152363272, 11.414879156173063, 130.29946615003425)

In [85]:
# using normalized and label encoded data
# Displays (R2, RMSE, MSE) for whichever X variant was used to fit the model.
# RMSE is the explicit square root of the MSE: the `squared=False` flag is
# deprecated in scikit-learn 1.4 and removed in 1.6.
r2_score(y_test, predictions), np.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.34473225081636627, 10.819081626800385, 117.05252724736967)

###### IMPORTANT: to make predictions on new data, we have to process its X features in exactly the same way as the training data (same normalization and the same fitted encoder).

------

In [135]:
# Mean squared error of the predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(mse)

117.03914334081851


In [63]:
# Root mean squared error: the square root of `mse`, expressed in the units of the target.
rmse = math.sqrt(mse)
print(rmse)

10.818463076649035


In [136]:
# Coefficient of determination (R2) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(r2)

0.34480717480585454


In [137]:
# Adjusted R2 penalizes R2 for the number of predictors:
#   adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
# where n is the number of test samples and p the number of feature columns.
n, p = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)

0.34198711960415806
