In [37]:
import pandas as pd
import numpy as np

In [38]:
sales =[1234,4321,5678,8765,9876]
city = ['Delhi','Mumbai','Chennai','Kolkata','Bangalore']
size = ['S','M','L','XL','XXL']
df = pd.DataFrame({'City':city,'Sales':sales,'Size':size})
print(df)

        City  Sales Size
0      Delhi   1234    S
1     Mumbai   4321    M
2    Chennai   5678    L
3    Kolkata   8765   XL
4  Bangalore   9876  XXL


In [39]:
print(df['City'].unique())
print(df['City'].value_counts())
print(df['City'].nunique())

['Delhi' 'Mumbai' 'Chennai' 'Kolkata' 'Bangalore']
Delhi        1
Mumbai       1
Chennai      1
Kolkata      1
Bangalore    1
Name: City, dtype: int64
5


###                         **One Hot Encorder**

In [40]:
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder(handle_unknown='ignore',sparse_output=False).set_output(transform = 'pandas')

In [41]:
ohctransform = ohc.fit_transform(df[['City']])
print(ohctransform)

   City_Bangalore  City_Chennai  City_Delhi  City_Kolkata  City_Mumbai
0             0.0           0.0         1.0           0.0          0.0
1             0.0           0.0         0.0           0.0          1.0
2             0.0           1.0         0.0           0.0          0.0
3             0.0           0.0         0.0           1.0          0.0
4             1.0           0.0         0.0           0.0          0.0


In [42]:
#Concatenate the one hot encoded data with the original data
df = pd.concat([df,ohctransform],axis=1).drop(['City'],axis=1)
df.head()

Unnamed: 0,Sales,Size,City_Bangalore,City_Chennai,City_Delhi,City_Kolkata,City_Mumbai
0,1234,S,0.0,0.0,1.0,0.0,0.0
1,4321,M,0.0,0.0,0.0,0.0,1.0
2,5678,L,0.0,1.0,0.0,0.0,0.0
3,8765,XL,0.0,0.0,0.0,1.0,0.0
4,9876,XXL,1.0,0.0,0.0,0.0,0.0


### **Ordinal Encorder**

In [43]:
df.Size.unique()

array(['S', 'M', 'L', 'XL', 'XXL'], dtype=object)

In [44]:
sizes = ['S','M','L','XL','XXL']


In [45]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[sizes])
df['Size'] = oe.fit_transform(df[['Size']])
df.head()

Unnamed: 0,Sales,Size,City_Bangalore,City_Chennai,City_Delhi,City_Kolkata,City_Mumbai
0,1234,0.0,0.0,0.0,1.0,0.0,0.0
1,4321,1.0,0.0,0.0,0.0,0.0,1.0
2,5678,2.0,0.0,1.0,0.0,0.0,0.0
3,8765,3.0,0.0,0.0,0.0,1.0,0.0
4,9876,4.0,1.0,0.0,0.0,0.0,0.0


## **Handling missing data using Simple Imputer**

In [46]:
person = ['Kim','Paul',np.nan,'John','Raj']
score = [100,200,np.nan,400,500]
player = pd.DataFrame({'Person':person,'Score':score})
print(player)

  Person  Score
0    Kim  100.0
1   Paul  200.0
2    NaN    NaN
3   John  400.0
4    Raj  500.0


In [47]:
from sklearn.impute import SimpleImputer
mean = SimpleImputer(strategy='mean',add_indicator=True)
player['Score'] = mean.fit_transform(player[['Score']])
player['Score'] = mean.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [48]:
#Using median
median = SimpleImputer(strategy='median')
player['Score'] = median.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [49]:
#Using mode
mode = SimpleImputer(strategy='most_frequent')
player['Score'] = mode.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [50]:
#Constant value
constant = SimpleImputer(strategy='constant',fill_value=300)
player['Score'] = constant.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [51]:
constant = SimpleImputer(strategy='constant',fill_value='Judy')
player['Person'] = constant.fit_transform(player[['Person']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,Judy,300.0
3,John,400.0
4,Raj,500.0


### Dealing with csv data

In [52]:
housing = pd.read_csv(r'C:\Users\hp\Desktop\5.1\sankyana\Datasets\HOUSE_PRICE\MELBOURNE_HOUSE_PRICES_LESS.csv')
housing.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [53]:
housing.isnull().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price            14590
Method               0
SellerG              0
Date                 0
Postcode             0
Regionname           0
Propertycount        0
Distance             0
CouncilArea          0
dtype: int64

In [54]:
from sklern.compose import ColumnTransformer


ModuleNotFoundError: No module named 'sklern'