In [1]:
import pandas as pd 



In [2]:
# Load the home-price data; the trailing expression shows (rows, columns).
csv_path = "homeprices (1).csv"
df = pd.read_csv(csv_path)
df.shape

(13, 3)

In [3]:
# Inspect the dtype of every column; non-numeric columns show up as `object`
# and will need encoding before they can be fed to a regression model.
df.dtypes

town     object
area      int64
price     int64
dtype: object

In [4]:
# Count the missing values in each column (all zeros means nothing to impute).
null_counts = df.isnull().sum()
null_counts

town     0
area     0
price    0
dtype: int64

In [5]:
# Preview the first five rows to see what the data looks like.
df.head(n=5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


### pd.get_dummies(dataframe, dtype=int)

In [9]:
# One-hot encode the categorical 'town' column: one 0/1 indicator column
# is produced per distinct town name.
dummy = pd.get_dummies(df.town, dtype=int)
dummy

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [11]:
# Append the indicator columns to the original frame, side by side
# (rows are aligned on the shared index).
merged = pd.concat([df, dummy], axis="columns")
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [13]:
# Drop the raw 'town' text column and one indicator ('west windsor').
# With k categories only k-1 indicators are needed; keeping all three would
# make them perfectly collinear (the dummy-variable trap).
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [21]:
# Feature matrix as a NumPy array: area plus the two remaining town indicators.
feature_cols = ['area', 'monroe township', 'robinsville']
X = final[feature_cols].values
X

array([[2600,    1,    0],
       [3000,    1,    0],
       [3200,    1,    0],
       [3600,    1,    0],
       [4000,    1,    0],
       [2600,    0,    0],
       [2800,    0,    0],
       [3300,    0,    0],
       [3600,    0,    0],
       [2600,    0,    1],
       [2900,    0,    1],
       [3100,    0,    1],
       [3600,    0,    1]], dtype=int64)

In [23]:
# Target vector: the 'price' column, kept as a pandas Series.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [14]:
# import LinearRegression 
from sklearn.linear_model import LinearRegression

In [24]:
# Fit a linear regression model on the features X and the target y.
# fit() returns the estimator itself, so it can be chained onto the constructor;
# the trailing expression displays the fitted estimator.
lrg = LinearRegression().fit(X, y)
lrg

In [25]:
# Predict prices for the same rows the model was fitted on (X), which lets the
# predictions be compared against the known prices in y.
lrg.predict(X)

array([539709.7398409 , 590468.71640508, 615848.20468716, 666607.18125134,
       717366.15781551, 579723.71533005, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

In [19]:
# Predict the price of a home with area 2800 located in robinsville.
# The row must follow the column order used to build X:
# [area, monroe township, robinsville].
lrg.predict([[2800, 0, 1]])

array([590775.63964739])

In [26]:
# Goodness of fit of the model 'lrg': for a regressor, score() returns the
# coefficient of determination (R^2), not a classification accuracy.
# It is evaluated here on the same data the model was fitted on, so it
# describes the fit rather than performance on unseen data.
lrg.score(X, y)

0.9573929037221873

## LabelEncoder
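In this walkthrough the `town` column is label-encoded first and then one-hot encoded (older scikit-learn releases required integer input for `OneHotEncoder`; current releases also accept string categories directly).
**LabelEncoder** maps each distinct value to an integer code starting from 0, e.g. 0, 1, 2 for three distinct values.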

In [27]:
# import preprocessor called LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [29]:
# Map each town name to an integer code: create the encoder, then learn the
# codes and apply them in a single fit_transform call.
le = LabelEncoder()
ft = le.fit_transform(df['town'])
ft

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

In [30]:
# Work on an independent copy so the original `df` keeps its text town names.
df2 = df.copy(deep=True)
df2

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [32]:
# Swap the text values of 'town' for their integer label codes; assign()
# keeps the column in its existing position and returns the updated frame.
df2 = df2.assign(town=ft)
df2

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [52]:
# Select the model inputs, with 'town' first so it sits at column position 0.
x = df2.loc[:, ['town', 'area']]
x

Unnamed: 0,town,area
0,0,2600
1,0,3000
2,0,3200
3,0,3600
4,0,4000
5,2,2600
6,2,2800
7,2,3300
8,2,3600
9,1,2600


In [53]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# also import ColumnTransformer (part of scikit-learn), which applies OneHotEncoder, or any
# other preprocessing step, to specific columns only;
# each transformer placed inside the ColumnTransformer is given the list of columns it should handle
from sklearn.compose import ColumnTransformer

In [54]:
# One-hot encode column 0 ('town') and pass the remaining column ('area')
# through unchanged; the encoded columns come first in the output.
onehot_step = ("OneHotEncoder", OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[onehot_step], remainder="passthrough")
ct.fit_transform(x)

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [55]:
# Drop the first one-hot column (town code 0) to avoid multicollinearity:
# with k categories only k-1 indicator columns are needed.
# The transformer is fed from df2 rather than from `x` itself, so re-running
# this cell gives the same result instead of one-hot encoding the first column
# of an array that has already been encoded.
x = ct.fit_transform(df2[['town', 'area']])[:, 1:]
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [56]:
# Refit the existing `lrg` estimator on the one-hot encoded features; this
# replaces the coefficients learned from X earlier. The feature order is now
# [robinsville, west windsor, area].
lrg.fit(x,y)

In [60]:
# Predict for a home in robinsville with area 2800; the row follows the
# refit feature order [robinsville, west windsor, area].
lrg.predict([[1,0,2800]])

array([590775.63964739])