# Difference between `fit()`, `transform()`, `fit_transform()`, and `predict()`

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [29]:
# Load only the four columns this walkthrough uses: three features plus the
# "Survived" target.
data = pd.read_csv(
    "titanic.csv",
    usecols=["Pclass", "Age", "Fare", "Survived"],
)
# Preview the first five rows.
data.head(5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [30]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

Survived      0
Pclass        0
Age         177
Fare          0
dtype: int64

In [31]:
# Replace missing ages with the median age of the column.
# NOTE(review): the median is computed over every row, before the train/test
# split performed later in this notebook, so held-out rows influence the
# imputed value. Consider imputing after the split using training rows only.
data["Age"] = data["Age"].fillna(value=data["Age"].median())

In [32]:
# Re-check the per-column missing-value counts now that "Age" has been filled.
data.isna().sum()

Survived    0
Pclass      0
Age         0
Fare        0
dtype: int64

In [33]:
# Feature matrix: the three predictor columns, in this order.
X = data.loc[:, ["Age", "Fare", "Pclass"]]
# Target vector: the label the model will learn to predict.
y = data.loc[:, "Survived"]

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [35]:
# Sanity check: training features and labels must have the same number of rows.
(X_train.shape, y_train.shape)

((712, 3), (712,))

In [36]:
# Sanity check: test features and labels must have the same number of rows.
(X_test.shape, y_test.shape)

((179, 3), (179,))

In [37]:
# fit(): learn the scaling statistics from the training features only.
# Nothing is transformed yet; fit() returns the fitted estimator itself.
scaler = StandardScaler().fit(X_train)
scaler

StandardScaler()

In [38]:
# transform(): apply the statistics learned by fit() to the data.
# The result is only displayed here, not stored.
scaler.transform(X=X_train)

array([[-0.94467808, -0.48147452,  0.8282276 ],
       [-0.10249532, -0.17081974,  0.8282276 ],
       [ 0.35687709, -0.38430515, -0.36791449],
       ...,
       [-0.33218153, -0.38430515, -0.36791449],
       [-0.79155394, -0.48283011,  0.8282276 ],
       [ 0.05062882, -0.38430515, -0.36791449]])

In [39]:
# fit_transform(): learn the scaling statistics and apply them in one call.
# It yields the same array as the separate fit() + transform() cells above.
# NOTE(review): X_train is rebound to the scaled array, so re-running this
# cell would refit the scaler on already-scaled data. A separate name such as
# X_train_scaled would make the cell safe to re-run.
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)

In [40]:
X_train

array([[-0.94467808, -0.48147452,  0.8282276 ],
       [-0.10249532, -0.17081974,  0.8282276 ],
       [ 0.35687709, -0.38430515, -0.36791449],
       ...,
       [-0.33218153, -0.38430515, -0.36791449],
       [-0.79155394, -0.48283011,  0.8282276 ],
       [ 0.05062882, -0.38430515, -0.36791449]])

In [41]:
# Only transform() the test set: it must be scaled with the statistics learned
# from the training data, never refitted on test rows.
# NOTE(review): X_test is overwritten in place, so running this cell twice
# scales the test data twice. A separate name such as X_test_scaled would
# avoid that.
X_test = scaler.transform(X=X_test)

In [42]:
# fit() on a model: learn the classifier's parameters from the scaled training
# features and their labels. fit() returns the fitted estimator itself.
lr_model = LogisticRegression().fit(X_train, y_train)
lr_model

LogisticRegression()

In [43]:
# predict(): use the fitted model to produce a class label (0 or 1) for each
# row of the scaled test features.
preds = lr_model.predict(X=X_test)
preds

array([0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0], dtype=int64)

In [44]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds)

0.7821229050279329