In [1]:
import pandas as pd

In [23]:
# Read the Titanic passenger data and preview the first five rows.
# NOTE: this is an absolute path on a local D: drive, so it only resolves on
# the machine where the file lives; adjust it before re-running elsewhere.
titanic_csv = r"D:\python\scikitlearn-ML\CSV file\titanic.csv"
df = pd.read_csv(titanic_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(891, 12)

In [25]:
# Column labels of the loaded frame.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Missing data
Missing data occurs when no information is provided for one or more items, or for a whole unit. It is a very common problem in real-life scenarios. In pandas, missing data is also referred to as NA (Not Available) values. Many datasets simply arrive with missing data, either because the value exists but was not collected or because it never existed.

pandas functions for working with missing data:

isnull()
notnull()
dropna()
fillna()
replace()
interpolate()

In [26]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Drop rows with a null Embarked value and keep 4 columns; the data shape becomes 889 by 4

In [28]:
# Keep only the rows where Embarked is present, restricted to the target
# column (Survived) and the three feature columns used below.
selected_columns = ['Survived', 'Sex', 'Embarked', 'Pclass']
has_embarked = df['Embarked'].notna()
df = df.loc[has_embarked, selected_columns]

In [29]:
# Dimensions of the current frame as (rows, columns).
df.shape

(889, 4)

In [30]:
# Confirm that no remaining column contains missing values.
df.isna().sum()

Survived    0
Sex         0
Embarked    0
Pclass      0
dtype: int64

In [31]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,Survived,Sex,Embarked,Pclass
0,0,male,S,3
1,1,female,C,1
2,1,female,S,3
3,1,female,S,1
4,0,male,S,3


In [32]:
#use only 1 feature
X = df.loc[: , ['Pclass']]
y = df.Survived

In [34]:
# Show the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep='\n')

(889, 1)
(889,)


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [36]:
# Baseline model: logistic regression scored with 5-fold cross-validation;
# the cell displays the mean accuracy over the folds.
logreg = LogisticRegression(solver='lbfgs')
fold_scores = cross_val_score(logreg, X, y, cv=5, scoring='accuracy')
fold_scores.mean()

0.6783406335301212

In [40]:
# Number of rows in each target class.
y.value_counts()

0    549
1    340
Name: Survived, dtype: int64

In [39]:
# Share of each target class. The largest share is the accuracy obtained by
# always predicting the majority class, a useful floor for the model scores.
y.value_counts(normalize=True)

0    0.617548
1    0.382452
Name: Survived, dtype: float64

# Encode Categorical Features :

In [41]:
from sklearn.preprocessing import OneHotEncoder
# Dense (non-sparse) one-hot encoder so the encoded array prints readably.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    one = OneHotEncoder(sparse_output=False)
except TypeError:
    one = OneHotEncoder(sparse=False)

In [43]:
# Fit the encoder on the Sex column and return its one-hot encoding.
# Double brackets pass a two-dimensional DataFrame rather than a Series.
one.fit_transform(df[['Sex']])

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

The first column is female and the second column is male.

In [44]:
# Categories learned by the most recent fit, in output-column order.
one.categories_

[array(['female', 'male'], dtype=object)]

For Embarked, the columns are C, Q, S.

In [45]:
# Refit the same encoder on Embarked; the first three rows encode S, C, S.
one.fit_transform(df[['Embarked']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [46]:
# Categories learned by the most recent fit, in output-column order.
one.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [47]:
# Feature matrix: every column except the target.
X = df.drop(columns='Survived')

In [48]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Sex,Embarked,Pclass
0,male,S,3
1,female,C,1
2,female,S,3
3,female,S,1
4,male,S,3


# Column Transformation

In [49]:
from sklearn.compose import make_column_transformer

In [50]:
# One-hot encode the two string columns; all remaining columns are passed
# through unchanged.
categorical_columns = ['Sex', 'Embarked']
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
)

In [51]:
# Output columns: female and male (from Sex), then C, Q, S (from Embarked),
# and finally Pclass.
# Pclass is already numeric, so it is passed through without one-hot encoding.
column_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

# Pipeline combining column transformation & Logistic Regression

In [52]:
from sklearn.pipeline import make_pipeline

In [53]:
# Chain the column transformer and the classifier into a single estimator,
# so preprocessing and model fitting happen together.
pipe = make_pipeline(column_trans , logreg)

In [54]:
# Score the whole pipeline with 5-fold cross-validation and display the mean
# accuracy over the folds.
pipeline_scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
pipeline_scores.mean()

0.7727924839713071

In [55]:
# Draw five rows with a fixed seed to use as example input for prediction.
X_new = X.sample(n=5, random_state=99)
X_new

Unnamed: 0,Sex,Embarked,Pclass
599,male,C,1
512,male,S,1
273,male,C,1
215,female,C,1
790,male,Q,3


In [56]:
# Fit the full pipeline (encoding + logistic regression) on all rows.
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Sex', 'Embarked'])])),
                ('logisticregression', LogisticRegression())])

In [57]:
# Works like model.predict, except the pipeline applies the preprocessing
# to the input before the classifier sees it.
pipe.predict(X_new)

array([1, 0, 1, 1, 0], dtype=int64)