# Week08 Example 1 - Logistic Regression

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

## Load and Encode Data

Load data from csv file on server

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/timcsmith/MIS536-Public/master/Data/logit.csv")
df

Unnamed: 0,X,Y
0,0.5,Blue
1,1.1,Blue
2,1.5,Blue
3,2.0,Blue
4,3.3,Blue
5,4.7,Blue
6,5.3,Blue
7,7.0,Blue
8,6.5,Blue
9,7.5,Blue


Note that we need to encode our Y variable. 

In [4]:
encoder = LabelEncoder()
encoder.fit(df.Y)
df['Y']= 1-encoder.transform(df['Y'])

# OR, another way (and you get to control which color maps to 0 and 1)
#df.Y = df.Y.replace('Blue', 1, regex=True) 
#df.Y = df.Y.replace('Green', 0, regex=True) 


df

Unnamed: 0,X,Y
0,0.5,1
1,1.1,1
2,1.5,1
3,2.0,1
4,3.3,1
5,4.7,1
6,5.3,1
7,7.0,1
8,6.5,1
9,7.5,1


## Train Test Split

In [5]:
X_train,X_test,y_train,y_test=train_test_split(df.X,df.Y,test_size=.3,random_state=1)

In [6]:
# because there is only one feature in X, we need to convert it to a dataframe (or a array of arrays, but dataframe is easier here)
X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test)

In [7]:
logClassifier=LogisticRegression(random_state=1) # Create the model
_ = logClassifier.fit(X_train,y_train) # fit the model to training data. NOTE: underscore is a dummy variable that is useed to suppress output

## Measure performance of model on validation data

NOTE: This is a demonstration. There is no preference to any specific scoring metric.

In [8]:
pd.DataFrame({"predicted":logClassifier.predict(X_test),"actual":y_test})

Unnamed: 0,predicted,actual
20,0,0
17,0,0
3,1,1
13,0,1
19,0,0
16,0,0
10,1,0


In [9]:
y_pred = logClassifier.predict(X_test)
print("***********************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred):.3f}")
print("***********************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred):.3f}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred):.3f}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred):.3f}")
print("***********************")

***********************
Recall Score:     0.500
***********************
Accuracy Score:   0.714
Precision Score:  0.500
F1 Score:         0.500
***********************


## Deploy our model by using on new data

Create a dataframe containing new input data.

In [10]:
df_new = pd.DataFrame({"X":[1,2,3,4,5,6,7,8,9,10]}) # Create a new data frame with the values we want to predict
df_new # sdisplay new input data

Unnamed: 0,X
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


Predict y from new X data

In [11]:
df_new['Y_pred'] = logClassifier.predict(df_new[['X']]) # predict the values for the new data frame
df_new


Unnamed: 0,X,Y_pred
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,1
8,9,1
9,10,0


In [12]:
df_new.Y_pred = df_new.Y_pred.replace(1, 'Green') 
df_new.Y_pred = df_new.Y_pred.replace(0, 'Blue') 
df_new

Unnamed: 0,X,Y_pred
0,1,Green
1,2,Green
2,3,Green
3,4,Green
4,5,Green
5,6,Green
6,7,Green
7,8,Green
8,9,Green
9,10,Blue


Predict the probability of Green and Probabily of Blue for each value of X

In [13]:
logClassifier.predict_proba(df_new[['X']]).round(3)

array([[0.   , 1.   ],
       [0.001, 0.999],
       [0.002, 0.998],
       [0.005, 0.995],
       [0.014, 0.986],
       [0.039, 0.961],
       [0.103, 0.897],
       [0.244, 0.756],
       [0.477, 0.523],
       [0.721, 0.279]])