In [15]:
# Logistic Regression:
# Logistic regression aims to solve classification problems. It does this by predicting categorical
#  outcomes, unlike linear regression that predicts a continuous outcome.

# In the simplest case there are two outcomes, which is called binomial. An example is
# predicting if a tumor is malignant or benign. Other cases have more than two outcomes to classify; this is called multinomial. A common example for multinomial logistic regression would be predicting the class of an iris flower between 3 different species.

# Here we will be using basic logistic regression to predict a binomial variable. This means it has
# only two possible outcomes.

In [16]:
# How does it work?
# In Python we have modules that will do the work for us. Start by importing the NumPy module.

import numpy

# Store the independent variables in X.

# Store the dependent variable in y.

# Below is a sample dataset:

# X holds the tumor sizes, measured in centimeters.
# NOTE: the sizes are reshaped from a row into a single column, because the
# LogisticRegression class needs that layout to work.
X = numpy.reshape(
    [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88],
    (-1, 1),
)

# y holds one label per tumor in X: 0 means "No" (not cancerous), 1 means "Yes" (cancerous).
y = numpy.array([
    0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1,
])

# We will use a method from the sklearn module, so we will have to import that module as well:

from sklearn import linear_model

# From the sklearn module we will use the LogisticRegression() class to create a logistic regression object.

# This object has a method called fit() that takes the independent and dependent values as parameters and
# fills the regression object with data that describes the relationship:

# Create the regression object and train it in one step; fit() returns the fitted object.
logr = linear_model.LogisticRegression().fit(X, y)

# Now we have a logistic regression object that is ready to predict whether a tumor is
# cancerous based on the tumor size.

# Predict if a tumor is cancerous where the size is 3.46cm. The size is passed as a
# one-row, one-column array, the same column layout that X has:
predicted = logr.predict(numpy.array([[3.46]]))

In [17]:
# See the whole example in action:

import numpy
from sklearn import linear_model

# Tumor sizes in centimeters, reshaped into a single column for the logistic regression.
X = numpy.reshape(
    [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88],
    (-1, 1),
)
# One label per tumor in X: 0 = not cancerous, 1 = cancerous.
y = numpy.array([
    0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1,
])

# Train the model; fit() returns the fitted object itself.
logr = linear_model.LogisticRegression().fit(X, y)

# Predict if a tumor is cancerous where the size is 3.46cm:
predicted = logr.predict(numpy.array([[3.46]]))
print(predicted)

[0]


In [18]:
# We have predicted that a tumor with a size of 3.46cm will not be cancerous.

In [19]:
# Coefficient:
# In logistic regression the coefficient is the expected change in log-odds of having the outcome
# per unit change in X.

# This is not very intuitive, so let's use it to create something that
# makes more sense: odds.

# See the whole example in action:

import numpy
from sklearn import linear_model

# Tumor sizes in centimeters, reshaped into a single column for the logistic regression.
X = numpy.reshape(
    [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88],
    (-1, 1),
)
# One label per tumor in X: 0 = not cancerous, 1 = cancerous.
y = numpy.array([
    0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1,
])

# Train the model; fit() returns the fitted object itself.
logr = linear_model.LogisticRegression().fit(X, y)

# The fitted coefficient is expressed in log-odds per unit of X ...
log_odds = logr.coef_
# ... so exponentiating it turns it into odds.
odds = numpy.exp(log_odds)

print(odds)

[[4.03557295]]


In [20]:
# This tells us that as the size of a tumor increases by 1cm the odds of it being a
# cancerous tumor increase by about 4x.

In [21]:
# Probability:
# The coefficient and intercept values can be used to find the probability that each tumor is cancerous.

# Create a function that uses the model's coefficient and intercept values to return a new value.
# This new value represents the probability that the given observation is a cancerous tumor:

def logit2prob(logr, x):
    """Return the probability of the positive outcome for the given observation(s).

    Parameters:
        logr: fitted model object exposing ``coef_`` and ``intercept_``.
        x: value, or NumPy array of values, of the independent variable.

    Returns:
        The probability for each value in ``x``. The result has the shape
        produced by broadcasting ``logr.coef_ * x + logr.intercept_``.
    """
    # Same kind of formula as in linear regression, but the result is a log-odds.
    log_odds = logr.coef_ * x + logr.intercept_
    # probability = odds / (1 + odds), where odds = exp(log_odds).
    # It is computed here as exp(-log(1 + exp(-log_odds))), which is the same value,
    # because the direct form turns into inf / inf = nan once exp(log_odds)
    # overflows for very large log-odds.
    probability = numpy.exp(-numpy.logaddexp(0, -log_odds))
    return probability

In [None]:
# Function Explained:
# The statements below repeat the body of logit2prob() one step at a time.
# x is bound to the sample tumor sizes so that this cell runs on its own
# instead of stopping with a NameError for an undefined x.
x = X

# To find the log-odds for each observation, we must first create a formula that looks similar
# to the one from linear regression, extracting the coefficient and the intercept.

log_odds = logr.coef_ * x + logr.intercept_

# To then convert the log-odds to odds we must exponentiate the log-odds.

odds = numpy.exp(log_odds)

# Now that we have the odds, we can convert it to probability by dividing it by 1 plus the odds.

probability = odds / (1 + odds)

In [23]:
# Let us now use the function with what we have learned to find out the probability that 
# each tumor is cancerous.

# See the whole example in action:

import numpy
from sklearn import linear_model

# Tumor sizes in centimeters, reshaped into a single column for the logistic regression.
X = numpy.reshape(
    [3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88],
    (-1, 1),
)
# One label per tumor in X: 0 = not cancerous, 1 = cancerous.
y = numpy.array([
    0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1,
])

# Train the model; fit() returns the fitted object itself.
logr = linear_model.LogisticRegression().fit(X, y)

def logit2prob(logr, X):
    """Return the probability of the positive outcome for the given observation(s).

    Parameters:
        logr: fitted model object exposing ``coef_`` and ``intercept_``.
        X: value, or NumPy array of values, of the independent variable.

    Returns:
        The probability for each value in ``X``. The result has the shape
        produced by broadcasting ``logr.coef_ * X + logr.intercept_``.
    """
    # Same kind of formula as in linear regression, but the result is a log-odds.
    log_odds = logr.coef_ * X + logr.intercept_
    # probability = odds / (1 + odds), where odds = exp(log_odds).
    # It is computed here as exp(-log(1 + exp(-log_odds))), which is the same value,
    # because the direct form turns into inf / inf = nan once exp(log_odds)
    # overflows for very large log-odds.
    probability = numpy.exp(-numpy.logaddexp(0, -log_odds))
    return probability

# Print the probability of being cancerous for every tumor size in X (one row per tumor).
print(logit2prob(logr, X))

[[0.60749168]
 [0.19267555]
 [0.12774788]
 [0.00955056]
 [0.08037781]
 [0.0734485 ]
 [0.88362857]
 [0.77901203]
 [0.88924534]
 [0.81293431]
 [0.57718238]
 [0.96664398]]


In [None]:
# Results Explained:
# 3.78, 0.61: The probability that a tumor with the size 3.78cm is cancerous is 61%.

# 2.44, 0.19: The probability that a tumor with the size 2.44cm is cancerous is 19%.

# 2.09, 0.13: The probability that a tumor with the size 2.09cm is cancerous is 13%.