## Encoding Categoricals

In [62]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the regression dataset from a CSV file in the working directory and preview the first rows.
data = pd.read_csv('regression_data.csv')
data.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,7.760467,17.343389,4.181353,Male,21.0
1,3.08,6.20859,16.230984,4.150313,Male,3.0
2,7.5,7.113956,18.047227,4.205057,Female,20.0
3,6.7,5.783825,11.73711,4.055333,Male,5.0
4,8.785714,6.64379,12.494862,4.088969,Female,10.0


In [3]:
# X-y split: TARGET_D is the value to predict, every other column is a feature.
# Think about where in the pipeline this split belongs before fitting any transformer.
X = data.drop(columns=['TARGET_D'])
y = data['TARGET_D']

In [5]:
# Separate the features by dtype: numeric columns go to X_num, object (string) columns to X_cat.
X_cat = X.select_dtypes(include=['object'])
X_num = X.select_dtypes(include=[np.number])

Unnamed: 0,gender_Male,gender_U
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0
...,...,...
4665,1,0
4666,1,0
4667,0,0
4668,1,0


In [17]:
# Normalizer rescales each sample (row) to unit norm; it learns nothing from the data,
# so fitting it only validates the input.
# NOTE(review): this is row-wise scaling, not per-feature scaling - confirm that is intended.
transformer = Normalizer()
x_normalized = transformer.fit_transform(X_num)
print(x_normalized.shape)

(4670, 4)


In [None]:
# Quick one-hot encoding with pandas; drop_first=True leaves out the first category's column.
# get_dummies keeps no fitted state, so it cannot be reused to encode new data the same way.
cat_data = pd.get_dummies(X_cat, drop_first=True) # not suitable if the encoding must be saved and reapplied
cat_data

In [6]:
# A fitted OneHotEncoder object can be reused outside this notebook, unlike pd.get_dummies.
# drop='first' leaves out the first category; handle_unknown='error' raises on unseen categories.
encoder = OneHotEncoder(drop='first', handle_unknown='error').fit(X_cat)
encoder

OneHotEncoder(drop='first')

In [7]:
# Illustration only (hand-written, not produced by the encoder): the mapping applied after .fit().
# 'Female' is the first category and is dropped, so it is represented by all zeros.
{'Male': [1,0],
'Female': [0,0],
'U': [0,1]}

{'Male': [1, 0], 'Female': [0, 0], 'U': [0, 1]}

In [8]:
# Encode the categorical column; .toarray() converts the transform result into a dense NumPy array.
encoded = encoder.transform(X_cat).toarray()
encoded # inspect encoded.shape to confirm the number of dummy columns

array([[1., 0.],
       [1., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [1., 0.],
       [1., 0.]])

In [49]:
# (rows, dummy columns) of the one-hot encoded array.
encoded.shape

(4670, 2)

In [13]:
# Categories found during fit: a list with one array per encoded input column.
encoder.categories_

[array(['Female', 'Male', 'U'], dtype=object)]

In [14]:
# Optional: the NumPy array works for modelling as-is, but a DataFrame gives readable column names.
# categories_[0][1:] skips the first category, which drop='first' removed from the encoded output.
df = pd.DataFrame(encoded, columns=encoder.categories_[0][1:])
df

Unnamed: 0,Male,U
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
4665,1.0,0.0
4666,1.0,0.0
4667,0.0,0.0
4668,1.0,0.0


In [33]:
# Extra: label encoding maps each category to a single integer code.
# LabelEncoder expects a 1-D array of labels, so pass the 'gender' column itself rather than
# the one-column DataFrame; this avoids the implicit column-vector conversion and its warning.
# Note: `le` holds the encoded integer array, not the encoder object.
le = LabelEncoder().fit_transform(X_cat['gender'])
le[:20]

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0])

In [27]:
# Frequency of each gender category.
X_cat['gender'].value_counts()

Female    2664
Male      1895
U          111
Name: gender, dtype: int64

In [32]:
# First 20 raw values, to compare against the first 20 label codes.
X_cat['gender'].head(20)

0       Male
1       Male
2     Female
3       Male
4     Female
5       Male
6     Female
7       Male
8     Female
9     Female
10    Female
11    Female
12      Male
13    Female
14    Female
15    Female
16    Female
17         U
18    Female
19    Female
Name: gender, dtype: object

In [54]:
# Sanity check: all three arrays must share the same number of rows before they are combined.
x_normalized.shape, le.shape, encoded.shape

((4670, 4), (4670,), (4670, 2))

--------

In [55]:
# Build the model matrix: normalized numeric features plus the label-encoded gender as the last column.
# Note that X is rebound here from the original feature DataFrame to a NumPy array.
X = np.concatenate((x_normalized, le.reshape(-1, 1)), axis=1)

In [56]:
# train-test split: hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [57]:
# Fit a linear regression on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [58]:
# Predict the target for the held-out test rows and check that there is one prediction per row.
predictions  = model.predict(X_test)
predictions.shape

(1401,)

In [39]:
# without normalized data
# Reports (R^2, RMSE, MSE) for whatever `predictions` currently holds.
# NOTE(review): the label describes the run that produced the saved output; the code does not
# itself switch normalization off - confirm which pipeline was executed before trusting it.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of mean_squared_error
# is deprecated and removed in recent scikit-learn releases.
r2_score(y_test, predictions), math.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.2705944957872364, 11.414726728148338, 130.29598627830407)

In [45]:
# with normalized data
# Reports (R^2, RMSE, MSE) for whatever `predictions` currently holds.
# NOTE(review): the label describes the run that produced the saved output - confirm which
# feature matrix was used to fit the model before trusting it.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of mean_squared_error
# is deprecated and removed in recent scikit-learn releases.
r2_score(y_test, predictions), math.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.34480717480585454, 10.818463076649035, 117.03914334081851)

In [59]:
# with normalized data and label encoding
# Reports (R^2, RMSE, MSE) for the model fitted on the normalized numerics plus label-encoded gender.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of mean_squared_error
# is deprecated and removed in recent scikit-learn releases.
r2_score(y_test, predictions), math.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.34473225081636627, 10.819081626800385, 117.05252724736967)

###### IMPORTANT: to make predictions on new data, we have to process its X features in exactly the same way as the training data (same scaling and the same fitted encoder).

------

In [60]:
# Mean squared error on the test split: the average of the squared prediction errors.
mse = mean_squared_error(y_test, predictions)
print(mse)

117.05252724736967


In [63]:
# Root mean squared error: the square root of the MSE, expressed in the same units as the target.
rmse = math.sqrt(mse)
print(rmse)

10.819081626800385


In [64]:
# Coefficient of determination (R^2) on the test split.
r2 = r2_score(y_test, predictions)
print(r2)

0.34473225081636627


In [65]:
# Adjusted R^2 penalizes R^2 for the number of predictors:
#   adj_R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of test rows and p the number of feature columns.
n, p = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(adj_r2)

0.34238362089097685
