In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [35]:
pdf = sns.load_dataset('penguins')
pdf.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [36]:
pdf.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [37]:
pdf.dtypes[pdf.dtypes=='object']

species    object
island     object
sex        object
dtype: object

In [38]:
strcols = pdf.dtypes[pdf.dtypes == 'object'].index.tolist()
strcols

['species', 'island', 'sex']

In [39]:
numcols = pdf.dtypes[pdf.dtypes != 'object'].index.tolist()
numcols

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

## find missing entries in each column

In [40]:
pdf.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [41]:
pdf.shape[0]

344

In [42]:
100*pdf.isna().sum()/pdf.shape[0]

species              0.000000
island               0.000000
bill_length_mm       0.581395
bill_depth_mm        0.581395
flipper_length_mm    0.581395
body_mass_g          0.581395
sex                  3.197674
dtype: float64

-- Fill all missing values

In [43]:
for col in numcols:
  pdf[col]= pdf[col].fillna(pdf[col].mean())

In [44]:
100*pdf.isna().sum()/pdf.shape[0]

species              0.000000
island               0.000000
bill_length_mm       0.000000
bill_depth_mm        0.000000
flipper_length_mm    0.000000
body_mass_g          0.000000
sex                  3.197674
dtype: float64

In [45]:
pdf2 = pdf[:]
pdf2.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [46]:
pdf2.sex.value_counts()

Male      168
Female    165
Name: sex, dtype: int64

In [47]:
 pdf2.sex.mode().values[0]

'Male'

In [48]:
pdf2.sex= pdf2.sex.fillna(pdf2.sex.mode().values[0])


In [49]:
100*pdf2.isna().sum()/pdf2.shape[0]

species              0.0
island               0.0
bill_length_mm       0.0
bill_depth_mm        0.0
flipper_length_mm    0.0
body_mass_g          0.0
sex                  0.0
dtype: float64

In [50]:
pdf2.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## convert Categorical varibales to numericals 


In [51]:
sps = pdf2.species.unique().tolist()
sps

['Adelie', 'Chinstrap', 'Gentoo']

In [52]:
isd = pdf2.island.unique().tolist()
isd

['Torgersen', 'Biscoe', 'Dream']

In [53]:
gnd = pdf2.sex.unique().tolist()
gnd

['Male', 'Female']

### Assign numerical values to each category

In [54]:
count =0 
spsnum={}
for name in sps:
  spsnum[name]=count
  count+=1

spsnum

{'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}

In [55]:
spsnum={}
for count,name in enumerate(sps):
  spsnum[name]=count

spsnum

{'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}

In [56]:
spsnum = {name:count for count, name in enumerate(sps)}
spsnum

{'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}

In [57]:
isdnum = {name:count for count, name in enumerate(isd)}
isdnum

{'Biscoe': 1, 'Dream': 2, 'Torgersen': 0}

In [58]:
gndnum = {name:count for count, name in enumerate(gnd)}
gndnum

{'Female': 1, 'Male': 0}

In [59]:
pdf2.species.map(spsnum).isna().sum()

0

In [60]:
catnums = { **spsnum ,**isdnum , **gndnum }
catnums

{'Adelie': 0,
 'Biscoe': 1,
 'Chinstrap': 1,
 'Dream': 2,
 'Female': 1,
 'Gentoo': 2,
 'Male': 0,
 'Torgersen': 0}

In [61]:
#pdf2.map(catnums)
pdf2.applymap(lambda x :catnums.get(x)).head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,0,,,,,0
1,0,0,,,,,1
2,0,0,,,,,1
3,0,0,,,,,0
4,0,0,,,,,1


In [62]:
catcols = pdf2.dtypes[pdf2.dtypes =='object'].index.tolist()
catcols

['species', 'island', 'sex']

In [63]:
#catnums[39.1]
catnums.get(39.1)

In [64]:
pdf2[catcols].applymap(lambda x :catnums.get(x)).head()

Unnamed: 0,species,island,sex
0,0,0,0
1,0,0,1
2,0,0,1
3,0,0,0
4,0,0,1


In [65]:
pdf2.applymap?

In [66]:
pdf2.loc[:,catcols] = pdf2[catcols].applymap(lambda x :catnums.get(x))
pdf2.head()


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,0,39.1,18.7,181.0,3750.0,0
1,0,0,39.5,17.4,186.0,3800.0,1
2,0,0,40.3,18.0,195.0,3250.0,1
3,0,0,43.92193,17.15117,200.915205,4201.754386,0
4,0,0,36.7,19.3,193.0,3450.0,1


In [67]:
pdf2.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

-- Extract Train and test dataset

In [68]:
X= pendf.iloc[:,:-1]
y= pendf.body_mass_g[:]

NameError: ignored

In [None]:


y[:5].values

In [None]:
y[:5].values.reshape(-1,1)

In [None]:

X[:5].values

In [None]:
X.shape,y.shape

In [None]:
X= pendf.iloc[:,:-1]
y= pendf.body_mass_g.values.reshape(-1,1)
X.shape,y.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.25, random_state=1234
)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

In [None]:
lr_model = LinearRegression()
lr_model.fit(X_train,y_train)

In [None]:
c = lr_model.intercept_

In [None]:
m = lr_model.coef_

In [None]:
c,m

In [None]:
X_train.columns

In [None]:
X_train.iloc[:1,:].values

In [None]:
y_train[:1]

In [None]:
m.shape,m.T,m.T.shape

In [None]:
X_train.iloc[:1,:].values*10

In [None]:
c+np.dot(X_train.iloc[:1,:].values,m.T)

In [None]:
y_learnt_mf = lr_model.predict(X_train)
y_learnt_mf[:5]

In [None]:
y_train[:5]

In [None]:
df22 = pd.DataFrame({'Actual': y_train[:,0],
                     'Learnt': y_learnt_mf[:,0]})
df22.head(10).plot(kind='bar')

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
mse_train_mf = mse(y_train , y_learnt_mf)
mse_train_mf

In [None]:
y_pred = lr_model.predict(X_test)
y_pred[:5]

In [None]:
mse_test_mf = mse(y_test,y_pred)
mse_test_mf

# R2 score

In [None]:
mse_base = 802.2336451400918**2
mse_base

In [None]:
1-(mse_test_mf/mse_base)

In [None]:
1-(rmse_test_sf/rmse_test_base)

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score_mf = r2_score(y_test,y_pred)
r2_score_mf