In [1]:
import pandas as pd
import numpy as np

In [74]:
sales =[1234,4321,5678,8765,9876]
city = ['Delhi','Mumbai','Chennai','Kolkata','Bangalore']
size = ['S','M','L','XL','XXL']
df = pd.DataFrame({'City':city,'Sales':sales,'Size':size})
print(df)

        City  Sales Size
0      Delhi   1234    S
1     Mumbai   4321    M
2    Chennai   5678    L
3    Kolkata   8765   XL
4  Bangalore   9876  XXL


In [3]:
print(df['City'].unique())
print(df['City'].value_counts())
print(df['City'].nunique())

['Delhi' 'Mumbai' 'Chennai' 'Kolkata' 'Bangalore']
Delhi        1
Mumbai       1
Chennai      1
Kolkata      1
Bangalore    1
Name: City, dtype: int64
5


###                         **One Hot Encorder**

In [4]:
from sklearn.preprocessing import OneHotEncoder
ohc = OneHotEncoder(handle_unknown='ignore',sparse_output=False).set_output(transform = 'pandas')

In [5]:
ohctransform = ohc.fit_transform(df[['City']])
print(ohctransform)

   City_Bangalore  City_Chennai  City_Delhi  City_Kolkata  City_Mumbai
0             0.0           0.0         1.0           0.0          0.0
1             0.0           0.0         0.0           0.0          1.0
2             0.0           1.0         0.0           0.0          0.0
3             0.0           0.0         0.0           1.0          0.0
4             1.0           0.0         0.0           0.0          0.0


In [6]:
#Concatenate the one hot encoded data with the original data
df = pd.concat([df,ohctransform],axis=1).drop(['City'],axis=1)
df.head()

Unnamed: 0,Sales,Size,City_Bangalore,City_Chennai,City_Delhi,City_Kolkata,City_Mumbai
0,1234,S,0.0,0.0,1.0,0.0,0.0
1,4321,M,0.0,0.0,0.0,0.0,1.0
2,5678,L,0.0,1.0,0.0,0.0,0.0
3,8765,XL,0.0,0.0,0.0,1.0,0.0
4,9876,XXL,1.0,0.0,0.0,0.0,0.0


### **Ordinal Encorder**

In [7]:
df.Size.unique()

array(['S', 'M', 'L', 'XL', 'XXL'], dtype=object)

In [8]:
sizes = ['S','M','L','XL','XXL']


In [9]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[sizes])
df['Size'] = oe.fit_transform(df[['Size']])
df.head()

Unnamed: 0,Sales,Size,City_Bangalore,City_Chennai,City_Delhi,City_Kolkata,City_Mumbai
0,1234,0.0,0.0,0.0,1.0,0.0,0.0
1,4321,1.0,0.0,0.0,0.0,0.0,1.0
2,5678,2.0,0.0,1.0,0.0,0.0,0.0
3,8765,3.0,0.0,0.0,0.0,1.0,0.0
4,9876,4.0,1.0,0.0,0.0,0.0,0.0


## **Handling missing data using Simple Imputer**

In [10]:
person = ['Kim','Paul',np.nan,'John','Raj']
score = [100,200,np.nan,400,500]
player = pd.DataFrame({'Person':person,'Score':score})
print(player)

  Person  Score
0    Kim  100.0
1   Paul  200.0
2    NaN    NaN
3   John  400.0
4    Raj  500.0


In [11]:
from sklearn.impute import SimpleImputer
mean = SimpleImputer(strategy='mean',add_indicator=True)
player['Score'] = mean.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [12]:
#Using median
median = SimpleImputer(strategy='median')
player['Score'] = median.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [13]:
#Using mode
mode = SimpleImputer(strategy='most_frequent')
player['Score'] = mode.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [14]:
#Constant value
constant = SimpleImputer(strategy='constant',fill_value=300)
player['Score'] = constant.fit_transform(player[['Score']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,,300.0
3,John,400.0
4,Raj,500.0


In [15]:
constant = SimpleImputer(strategy='constant',fill_value='Judy')
player['Person'] = constant.fit_transform(player[['Person']])
player

Unnamed: 0,Person,Score
0,Kim,100.0
1,Paul,200.0
2,Judy,300.0
3,John,400.0
4,Raj,500.0


### Dealing with csv data

In [16]:
housing = pd.read_csv(r'C:\Users\hp\Desktop\5.1\sankyana\Datasets\HOUSE_PRICE\MELBOURNE_HOUSE_PRICES_LESS.csv')
housing.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [17]:
housing.isnull().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price            14590
Method               0
SellerG              0
Date                 0
Postcode             0
Regionname           0
Propertycount        0
Distance             0
CouncilArea          0
dtype: int64

In [18]:
from sklearn.compose import make_column_transformer


In [20]:
ct = make_column_transformer(
    (SimpleImputer(strategy='mean'),['Rooms','Price']),
    (SimpleImputer(strategy='most_frequent'),['Type','Method','Regionname']),
    (SimpleImputer(strategy='constant',fill_value='Unknown'),['SellerG']),
    remainder='drop'
)

In [21]:
ct.set_output(transform='pandas')

In [22]:
df3_pandas = ct.fit_transform(housing)
df3_pandas.head()

Unnamed: 0,simpleimputer-1__Rooms,simpleimputer-1__Price,simpleimputer-2__Type,simpleimputer-2__Method,simpleimputer-2__Regionname,simpleimputer-3__SellerG
0,3.0,1490000.0,h,S,Northern Metropolitan,Jellis
1,3.0,1220000.0,h,S,Northern Metropolitan,Marshall
2,3.0,1420000.0,h,S,Northern Metropolitan,Nelson
3,3.0,1515000.0,h,S,Western Metropolitan,Barry
4,2.0,670000.0,h,S,Western Metropolitan,Nelson


### Column transformers

In [23]:
sales =[1234,4321,5678,8765,9876]
city = ['Delhi','Mumbai','Chennai','Kolkata','Bangalore']
size = ['S','M','L','XL','XXL']
df1 = pd.DataFrame({'City':city,'Sales':sales,'Size':size})
print(df1)

        City  Sales Size
0      Delhi   1234    S
1     Mumbai   4321    M
2    Chennai   5678    L
3    Kolkata   8765   XL
4  Bangalore   9876  XXL


In [24]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [25]:
ohe = OneHotEncoder(sparse_output = False)
oe = OrdinalEncoder()

In [26]:
from sklearn.compose import make_column_transformer

In [27]:
ct = make_column_transformer(
    (ohe,['City']),
    (oe,['Size']),
    remainder='passthrough'
)

In [28]:
ct.set_output(transform='pandas')

In [29]:
df_pandas = ct.fit_transform(df1)
df_pandas

Unnamed: 0,onehotencoder__City_Bangalore,onehotencoder__City_Chennai,onehotencoder__City_Delhi,onehotencoder__City_Kolkata,onehotencoder__City_Mumbai,ordinalencoder__Size,remainder__Sales
0,0.0,0.0,1.0,0.0,0.0,2.0,1234
1,0.0,0.0,0.0,0.0,1.0,1.0,4321
2,0.0,1.0,0.0,0.0,0.0,0.0,5678
3,0.0,0.0,0.0,1.0,0.0,3.0,8765
4,1.0,0.0,0.0,0.0,0.0,4.0,9876


In [30]:
ct1 = make_column_transformer(
    (ohe,[0]),
    (oe,[1]),
    remainder='drop'
)

In [31]:
ct1.set_output(transform='pandas')

In [32]:
df1_pandas=ct1.fit_transform(df1)
df1_pandas

Unnamed: 0,onehotencoder__City_Bangalore,onehotencoder__City_Chennai,onehotencoder__City_Delhi,onehotencoder__City_Kolkata,onehotencoder__City_Mumbai,ordinalencoder__Sales
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,1.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,1.0,0.0,3.0
4,1.0,0.0,0.0,0.0,0.0,4.0


In [33]:
ct2 = make_column_transformer(
    (ohe,[0]),
    ('passthrough',['Size']),
    remainder='drop'
)

In [34]:
ct2.set_output(transform='pandas')

In [35]:
df2_pandas=ct2.fit_transform(df1)
df2_pandas

Unnamed: 0,onehotencoder__City_Bangalore,onehotencoder__City_Chennai,onehotencoder__City_Delhi,onehotencoder__City_Kolkata,onehotencoder__City_Mumbai,passthrough__Size
0,0.0,0.0,1.0,0.0,0.0,S
1,0.0,0.0,0.0,0.0,1.0,M
2,0.0,1.0,0.0,0.0,0.0,L
3,0.0,0.0,0.0,1.0,0.0,XL
4,1.0,0.0,0.0,0.0,0.0,XXL


### **Feature Scaling**
### Normalization vs standardization
**Scaling**- converts numerical values into the exact same scale, mean = 0, std deviation = 1<br>
**Normalization**- Allows you to have numbers 0, min-0, max=1

In [36]:
import pandas as pd
import numpy as np

In [37]:
admin = pd.read_csv(r"C:\Users\hp\Desktop\5.1\sankyana\Assignment\ML project 1\Admission_Predict_Ver1.1.csv")

In [38]:
admin.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [39]:
admin1 = admin.drop('Serial No.',axis = 1)

In [46]:
X1 = admin1.iloc[:,0:7]
X2 = admin1.iloc[:,0:7]
Y = admin1.iloc[:,7]
print(X1.head())

   GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  Research
0        337          118                  4  4.5   4.5  9.65         1
1        324          107                  4  4.0   4.5  8.87         1
2        316          104                  3  3.0   3.5  8.00         1
3        322          110                  3  3.5   2.5  8.67         1
4        314          103                  2  2.0   3.0  8.21         0


In [41]:
#Standardization
from sklearn.preprocessing import StandardScaler

In [42]:
scaler = StandardScaler()
X1 = scaler.fit_transform(X1)
X1 = pd.DataFrame(X1,columns=['GRE Score','TOEFL Score','University Rating','SOP','LOR','CGPA','Research'])
X1.describe().round(3)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0
std,1.001,1.001,1.001,1.001,1.001,1.001,1.001
min,-2.346,-2.5,-1.851,-2.398,-2.687,-2.94,-1.128
25%,-0.751,-0.69,-0.975,-0.883,-0.524,-0.743,-1.128
50%,0.047,-0.032,-0.1,0.127,0.017,-0.027,0.886
75%,0.756,0.791,0.776,0.632,0.558,0.767,0.886
max,2.085,2.108,1.651,1.642,1.64,2.224,0.886


In [43]:
#Normalization
from sklearn.preprocessing import MinMaxScaler

In [44]:
scaler = MinMaxScaler(feature_range=(0,1))
X2 = scaler.fit_transform(X2)
X2 = pd.DataFrame(X2)
X2.describe().round(3)

Unnamed: 0,0,1,2,3,4,5,6
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.529,0.543,0.528,0.594,0.621,0.569,0.56
std,0.226,0.217,0.286,0.248,0.231,0.194,0.497
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.36,0.393,0.25,0.375,0.5,0.425,0.0
50%,0.54,0.536,0.5,0.625,0.625,0.564,1.0
75%,0.7,0.714,0.75,0.75,0.75,0.718,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Making piplines

In [45]:
from sklearn.model_selection import train_test_split

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X1,Y,test_size=0.3,random_state=19)

In [63]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

In [64]:
si = SimpleImputer(strategy="mean")
lr = LinearRegression()

In [65]:
mp = make_pipeline(si,lr)

In [66]:
mp.fit(X_train,y_train)

In [68]:
mp.score(X_train,y_train)

0.8247573785768838

In [69]:
mp.score(X_test,y_test)

0.8120639633096072

In [71]:
mp.named_steps.simpleimputer.statistics_

array([316.60285714, 107.15142857,   3.10857143,   3.37571429,
         3.47571429,   8.5812    ,   0.55142857])

In [72]:
mp.named_steps.linearregression.coef_

array([0.00257968, 0.00227323, 0.00472257, 0.00500216, 0.01666373,
       0.10994521, 0.01660816])

In [None]:
#More advanced piplines

In [76]:
X= df.iloc[:,0:2]
y = df.iloc[:,2]

In [77]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=5)

In [78]:
df

Unnamed: 0,City,Sales,Size
0,Delhi,1234,S
1,Mumbai,4321,M
2,Chennai,5678,L
3,Kolkata,8765,XL
4,Bangalore,9876,XXL


In [79]:
num_cols = ['Sales']
cat_cols = ['City']


In [84]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [86]:
num_pipline = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy='mean')),
    ('scale',StandardScaler())
])

In [87]:
cat_pipline = Pipeline(steps = [
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False))

])

In [98]:
from sklearn.compose import ColumnTransformer

In [99]:
ct = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipline,num_cols),
    ('cat_pipline',cat_pipline,cat_cols)
],
remainder = 'drop',
n_jobs = -1
)

In [100]:
from sklearn.tree import DecisionTreeClassifier

In [101]:
dtc = DecisionTreeClassifier()

In [102]:
pipefinal = make_pipeline(ct,dtc)

In [103]:
pipefinal.fit(X_train,y_train)

In [104]:
pipefinal.score(X_test,y_test)

0.0

In [105]:
#How to save pipline
import joblib

In [None]:
joblib.dump(pipefinal,"pipe.joblib")

In [None]:
pipefinal2 =joblib.load("pipe.joblib")