# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [2]:
# Load the Titanic training data (a comma is already read_csv's default separator)
# and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count missing values per column to see which features need imputation.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Mean of the Age column (pandas skips missing values by default).
age_mean = df['Age'].mean()
age_mean

29.69911764705882

In [5]:
# Median of the Age column, ignoring missing values; the imputation step
# below fills the gaps in Age with this value (28.0).
age_median = df['Age'].median()
age_median

28.0

In [6]:
# NOTE(review): abandoned experiment, left commented out. It would fill missing
# ages from a Sex-derived lookup, but the keys are upper-case ('MALE'/'FEMALE')
# while the data uses 'male'/'female', and 123/456 look like placeholders.
# Consider deleting this cell.
#df['Age'].fillna(df['Sex'].replace({'MALE':123, 'FEMALE':456}))

In [7]:
# Impute missing ages with the median computed above rather than a hard-coded
# 28, so the fill value stays in sync with the data if the file changes.
df['Age'] = df['Age'].fillna(age_median)

In [8]:
# Re-check missing values: Age should now be fully populated.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Encode Sex as integers (male -> 0, female -> 1). The previous version wrote
# the *strings* '0' and '1', leaving an object-dtype feature that only worked
# because scikit-learn silently coerces numeric-looking strings to floats.
# replace() leaves already-encoded values untouched, so re-running this cell
# is harmless; astype(int) guarantees a numeric dtype and fails loudly if an
# unexpected category is present.
df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1}).astype(int)

### Select columns for y and X

In [10]:
# Feature matrix: passenger class, encoded sex and imputed age.
feature_columns = ['Pclass', 'Sex', 'Age']
X = df[feature_columns]
# Target vector: the Survived column.
y = df['Survived']

### 3) Train-test-split


In [11]:
# Hold out 41% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.41,
    random_state=42,
)

In [12]:
# Sanity-check the split sizes: (rows, features) for the train and test sets.
X_train.shape, X_test.shape

((525, 3), (366, 3))

In [13]:
# Fit a logistic regression with scikit-learn's default settings
# (fit() returns the estimator itself), then report accuracy on the
# training split.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_train, y_train)

0.8057142857142857

### 8) Testing your model on the test data

In [14]:
# Accuracy on the held-out test split (compare with the training score above).
model.score(X_test, y_test)

0.7868852459016393

### 9) Predict

In [15]:
# Hypothetical passengers, one row each: [Pclass, Sex (0 = male, 1 = female), Age].
# They are built as a DataFrame carrying the training column names: the model
# was fitted on a DataFrame, so passing a bare NumPy array drops the feature
# names (scikit-learn warns about this) and hides the expected column order.
new_passengers = pd.DataFrame(
    [[3, 0, 5], [1, 1, 50], [2, 1, 40], [3, 1, 60]],
    columns=X_train.columns,
)
# Predicted class per passenger (0 = died, 1 = survived).
model.predict(new_passengers)

array([0, 1, 1, 0])

In [16]:
# Class probabilities for each hypothetical passenger, rounded to three
# decimals; the first column is class 0 (died), the second class 1 (survived).
p = np.round(model.predict_proba(new_passengers), 3)
pd.DataFrame(
    p,
    index=['John', 'Jane', 'Joe', 'Joanne'],
    columns=['Died', 'Survived'],
)

Unnamed: 0,Died,Survived
John,0.82,0.18
Jane,0.158,0.842
Joe,0.275,0.725
Joanne,0.66,0.34


### 11) Inspect the model parameters

In [17]:
# Learned weights, one per feature in training-column order (Pclass, Sex, Age).
model.coef_ 

array([[-1.01451478,  2.55283845, -0.03085446]])

In [18]:
# Bias term that is added to the weighted feature sum before the sigmoid.
model.intercept_ 

array([1.68031239])

In [19]:
# Worked example of the logistic (sigmoid) function with two inputs.
# NOTE(review): these weights are hard-coded and do not match the
# model.coef_ / model.intercept_ values printed above -- presumably an
# illustrative example taken from elsewhere; confirm before relying on it.
# Also note that this rebinds `p`, which previously held the predict_proba table.
x1, x2 = 40, 13                                     # feature values
w1, w2, w0 = 1.00987271, -1.77671059, -14.2462174   # two weights and the intercept

# Linear score first, then squash it into a probability in (0, 1).
x = w1*x1 + w2*x2 + w0
p = 1 / (1 + np.exp(-x))
p

0.9548452294655053

In [21]:
# 5-fold cross-validation on the training split, recording accuracy on both
# the held-out fold (test_score) and the folds used for fitting (train_score).
# A separately named, unfitted estimator is used so that the fitted `model`
# from the cells above is not replaced by an unfitted one (cross_validate
# fits clones of the estimator it is given).
cv_model = LogisticRegression()
cv_results = cross_validate(
    cv_model, X_train, y_train,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)
# Convert the dictionary of per-fold arrays into a DataFrame; ending the cell
# with the frame renders it as a rich table instead of print()'s plain text.
cv = pd.DataFrame(cv_results)
cv

   fit_time  score_time  test_score  train_score
0  0.064560    0.002280    0.904762     0.778571
1  0.045504    0.006457    0.780952     0.811905
2  0.014327    0.001778    0.723810     0.821429
3  0.030288    0.002377    0.742857     0.816667
4  0.012345    0.002195    0.847619     0.797619
