In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
# Load the Titanic passenger table and derive a boolean 'male' feature from 'Sex'.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'].eq('male')

# Feature matrix (six columns) and target vector as NumPy arrays.
feature_columns = ['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']
X = df[feature_columns].values
y = df['Survived'].values

# Fit on the whole dataset and predict on those same rows; fit() returns the
# estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(X, y)
y_pred = model.predict(X)

In [2]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Report each metric for the predictions in y_pred against the labels in y.
for label, metric in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(label, metric(y, y_pred))

Accuracy 0.8049605411499436
Precision 0.7734627831715211
Recall 0.6988304093567251
F1 0.7342549923195083


In [4]:
from sklearn.metrics import confusion_matrix

In [5]:
# Confusion matrix of the true labels y against the model's predictions y_pred.
print(confusion_matrix(y,y_pred))

[[475  70]
 [103 239]]


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split features and labels into train/test subsets. No random_state is
# passed here, so the split can differ each time this cell is run.
X_train,X_test,y_train,y_test=train_test_split(X,y)

In [8]:
# Show the feature-matrix and label-vector shapes before and after the split.
for caption, features, labels in (
    ("whole dataset:", X, y),
    ("training set:", X_train, y_train),
    ("test set:", X_test, y_test),
):
    print(caption, features.shape, labels.shape)

whole dataset: (887, 6) (887,)
training set: (665, 6) (665,)
test set: (222, 6) (222,)


In [9]:
# Fit the model on the training split only; the value returned by fit()
# is what the cell displays.
model.fit(X_train,y_train)

LogisticRegression()

In [10]:
# Score the fitted model on the held-out test split (the printed value
# matches accuracy_score on the same split).
print(model.score(X_test,y_test))

0.8063063063063063


In [11]:
# Predict on the held-out test rows and report the four metrics against y_test.
y_pred = model.predict(X_test)
for label, metric in (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1 score:", f1_score),
):
    print(label, metric(y_test, y_pred))

accuracy: 0.8063063063063063
precision: 0.8314606741573034
recall: 0.7254901960784313
f1 score: 0.7748691099476441


### We use a random_state parameter to ensure that we get the same random split every time the same code is run.

In [12]:
# X = [[1, 1], [2, 2], [3, 3], [4, 4]]
# y = [0, 0, 1, 1]

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27)
# print('X_train', X_train)
# print('X_test', X_test)

In [13]:
from sklearn.metrics import precision_recall_fscore_support
# y_pred holds predictions for X_test only, so it must be compared with
# y_test; passing the full-length y raised a ValueError because the two
# inputs had different numbers of samples (887 vs. 222).
print(precision_recall_fscore_support(y_test, y_pred))

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_recall_fscore_support

# Sensitivity is computed with sklearn's recall_score, exposed here under an alias.
sensitivity_score = recall_score
def specificity_score(y_true, y_pred):
    """Return the recall reported for the first class.

    Calls precision_recall_fscore_support and takes the second element of
    its result (the recall values), returning the entry at index 0.
    NOTE(review): for 0/1 labels this is presumably the recall of the
    negative class, i.e. the specificity -- confirm against sklearn's
    label ordering.
    """
    recall_values = precision_recall_fscore_support(y_true, y_pred)[1]
    return recall_values[0]

# Load the passenger data and derive the boolean 'male' feature.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'].eq('male')
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# A fixed random_state keeps the train/test partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

# Train on the training split, then predict the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

for label, scorer in (
    ("sensitivity:", sensitivity_score),
    ("specificity:", specificity_score),
):
    print(label, scorer(y_test, y_pred))

In [None]:
# (model.predict_proba(X_test))


In [None]:
# Second column of predict_proba for every test row -- presumably the
# probability of label 1 (assumes 0/1 labels; confirm via model.classes_).
model.predict_proba(X_test)[:,1]

In [None]:
# Build predictions from a custom 0.75 cut-off on the second-column
# probability instead of calling predict(); the comparison yields booleans.
y_pred=model.predict_proba(X_test)[:,1]>0.75

In [None]:
# Precision and recall of the thresholded predictions on the test split.
for label, metric in (("precision:", precision_score), ("recall:", recall_score)):
    print(label, metric(y_test, y_pred))

In [None]:
# import pandas as pd
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import precision_score, recall_score
# from sklearn.model_selection import train_test_split

# df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
# df['male'] = df['Sex'] == 'male'
# X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
# y = df['Survived'].values

# X_train, X_test, y_train, y_test = train_test_split(X, y)

# model = LogisticRegression()
# model.fit(X_train, y_train)

# print("predict proba:")
# print(model.predict_proba(X_test))

# y_pred = model.predict_proba(X_test)[:, 1] > 0.75

# print("precision:", precision_score(y_test, y_pred))
# print("recall:", recall_score(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score
import matplotlib.pyplot as plt
# Fit on the training split and get class probabilities for the test rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)

# ROC points computed from the second-column probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])

# Draw the ROC curve plus a dashed diagonal reference line on a unit square.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('1 - specificity')
ax.set_ylabel('sensitivity')
plt.show()

In [None]:
# ROC AUC on the test split, using the second-column probabilities.
roc_auc_score(y_test, y_pred_proba[:,1]) 

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load the passenger data and derive the boolean 'male' feature.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'].eq('male')
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# Both models are trained and scored on the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Model 1: all six feature columns.
model1 = LogisticRegression().fit(X_train, y_train)
y_pred_proba1 = model1.predict_proba(X_test)
print('model 1 AUC score : ', roc_auc_score(y_test, y_pred_proba1[:, 1]))

# Model 2: only the first two feature columns (Pclass and male).
model2 = LogisticRegression().fit(X_train[:, :2], y_train)
y_pred_proba2 = model2.predict_proba(X_test[:, :2])
print('model 2 AUC score : ', roc_auc_score(y_test, y_pred_proba2[:, 1]))






In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the passenger data and derive the boolean 'male' feature.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'].eq('male')
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

# building the model
model = LogisticRegression().fit(X_train, y_train)

# evaluating the model: each template is padded so the values line up
y_pred = model.predict(X_test)
for template, metric in (
    (" accuracy: {0:.5f}", accuracy_score),
    ("precision: {0:.5f}", precision_score),
    ("   recall: {0:.5f}", recall_score),
    (" f1 score: {0:.5f}", f1_score),
):
    print(template.format(metric(y_test, y_pred)))

In [None]:
# Keep only the first six rows (two feature columns) as a small example.
X = df[['Age', 'Fare']].iloc[:6].values
y = df['Survived'].iloc[:6].values

In [None]:
from sklearn.model_selection import KFold
# Three-fold splitter with shuffling enabled; no random_state is given, so
# fold membership can change between runs.
kf = KFold(n_splits=3, shuffle=True)

In [None]:
# Collect everything kf.split(X) yields into a list so the cell displays
# each (train, test) pair.
list(kf.split(X))

In [None]:
from sklearn.model_selection import KFold
import pandas as pd

# Load the data and keep the first six rows of two features as a small example.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
X = df[['Age', 'Fare']].iloc[:6].values
y = df['Survived'].iloc[:6].values

# Print the train and test index arrays of each of the three shuffled folds.
kf = KFold(n_splits=3, shuffle=True)
for train_idx, test_idx in kf.split(X):
    print(train_idx, test_idx)