## One-Hot Encoding

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from seaborn import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load seaborn's example "flights" dataset into a DataFrame.
flights = load_dataset(name="flights")

## Data

In [6]:
# Preview the first five rows to see the available columns.
flights.head(n=5)

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [7]:
# Train/test split: 'year' and 'month' are the features (X), 'passengers' is the target (y).
# No test_size is given, so scikit-learn's default split proportion is used;
# random_state pins the shuffle so the split is reproducible.
feature_columns = ['year', 'month']
X_train, X_test, y_train, y_test = train_test_split(
    flights[feature_columns],
    flights['passengers'],
    random_state=42,
)

## One-Hot Encode

In [13]:
# Pick the categorical column(s) to encode, then instantiate the encoder and fit it
# on the training data only, so the categories are learned without using the test set.
columns_to_encode = ['month']
ohe = OneHotEncoder()
ohe.fit(X_train[columns_to_encode])

In [14]:
# Apply the fitted encoder to the training column(s); the result is a sparse matrix
# with one row per training sample and one column per category.
train_months = X_train[columns_to_encode]
encoded = ohe.transform(train_months)
encoded

<108x12 sparse matrix of type '<class 'numpy.float64'>'
	with 108 stored elements in Compressed Sparse Row format>

# Inflating the sparse matrix to a dense one

In [15]:
# Expand the sparse result into a dense numpy matrix so the 0/1 indicator
# values can be inspected directly (one row per training sample).
encoded.todense()

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Getting New Feature Names

In [17]:
# Names of the generated columns: '<input column>_<category>', in the same
# order as the columns of the encoded matrix.
ohe.get_feature_names_out(columns_to_encode)

array(['month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep'], dtype=object)

## Put into DataFrame



In [26]:
# Turn the encoded training data into a DataFrame:
#   - toarray() yields a plain ndarray (todense() returns the legacy numpy.matrix type),
#   - columns are named by the encoder,
#   - the index is copied from X_train so rows line up when the frames are concatenated.
new_train_df = pd.DataFrame(
    encoded.toarray(),
    columns=ohe.get_feature_names_out(),
    index=X_train.index,
)
new_train_df.head()

Unnamed: 0,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
111,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
127,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### All Together

In [30]:
# Join the one-hot columns onto X_train (rows align on the shared index), then drop
# the original categorical column(s). Reusing columns_to_encode instead of a
# hard-coded name keeps the dropped columns in sync with what was actually encoded.
df_train_concact = pd.concat([X_train, new_train_df], axis=1).drop(columns=columns_to_encode)
df_train_concact.head()

Unnamed: 0,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
111,1958,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,1952,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
118,1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
127,1959,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,1957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Fit a linear regression on the training features (year plus the one-hot month columns).
lr = LinearRegression()
lr.fit(X=df_train_concact, y=y_train)

In [32]:
# Score the fitted model on the same data it was trained on (R^2).
lr.score(X=df_train_concact, y=y_train)

0.9578728640256422

## Test set

In [33]:
# Encode the test data with the encoder that was fitted on the training data
# (transform only -- the encoder is not refitted on the test set).
test_months = X_test[columns_to_encode]
test_encoded = ohe.transform(test_months)



In [None]:
# Turn the encoded test data into a DataFrame:
#   - toarray() yields a plain ndarray (todense() returns the legacy numpy.matrix type),
#   - columns are named by the encoder,
#   - the index is copied from X_test so rows line up when the frames are concatenated.
new_test_df = pd.DataFrame(
    test_encoded.toarray(),
    columns=ohe.get_feature_names_out(),
    index=X_test.index,
)
new_test_df.head()

Unnamed: 0,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
# Join the one-hot columns onto X_test (rows align on the shared index), then drop
# the original categorical column(s). Reusing columns_to_encode instead of a
# hard-coded name keeps the dropped columns in sync with what was actually encoded.
df_test_concat = pd.concat([X_test, new_test_df], axis=1).drop(columns=columns_to_encode)
df_test_concat.head()

Unnamed: 0,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
117,1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19,1950,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,1955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97,1957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,1953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [36]:
# Score the fitted model on the held-out test set (R^2).
lr.score(X=df_test_concat, y=y_test)

0.9352318155740829