In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

## Logistic Regression

### We used teacher data from Illinois to model the relationship between certain teacher characteristics and the likelihood of that teacher leaving their school from one year to the next. The final goal of this model will be to deploy it to a webpage where principals can input teacher characteristics and use the predictive model to see whether that particular teacher is likely to leave. 

### Once we've built the model we will extract the equation to build into our website. The website will take the user input, plug it into the model, and then predict the likelihood of a teacher with those characteristics leaving. 

### Load and clean data

In [3]:
# File to Load
CPS_path = "IL_TCHRS2.csv"

# Read the modified GoodReads csv and store into Pandas DataFrame
CPS_df = pd.read_csv(CPS_path, encoding="utf-8")
CPS_df

Unnamed: 0,Tchr_ID,Year,Position_Code,Position,Last_Name,First_name,Female,In State Experience,Out of State Experience,Total_exp,...,White,Latinx,First Year in Position,Salary,Sick Days,Vacation Days,Bonus,Annuities,Retirement Benefits,Other Benefits
0,105018,2017,999,Leave of Absence,Iazzetto,Sarah,1,9,,9,...,1,0,0,,,,,,,
1,117654,2017,999,Leave of Absence,Kindelin,Colleen,1,5,1.0,6,...,1,0,0,,,,,,,
2,129166,2017,999,Leave of Absence,ENGELHART,HEATHER,1,14,,14,...,1,0,0,,,,,,,
3,165364,2017,999,Leave of Absence,SEIDITA,LAURA,1,9,,9,...,1,0,0,,,,,,,
4,170969,2017,999,Leave of Absence,Henning,Sarah,1,6,,6,...,1,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141384,517730,2017,200,Teacher,Murray,Ranai,1,2,,2,...,1,0,1,,,,,,,
141385,517789,2017,200,Teacher,Smith,Elena,1,1,,1,...,0,1,1,,,,,,,
141386,517810,2017,200,Teacher,Foley,Seamus,0,1,,1,...,1,0,1,,,,,,,
141387,517819,2017,200,Teacher,Bergschneider,Kirstin,1,1,,1,...,1,0,1,,,,,,,


In [4]:
CPS_df["Out of State Experience"] = CPS_df["Out of State Experience"].fillna(0)
CPS_df

Unnamed: 0,Tchr_ID,Year,Position_Code,Position,Last_Name,First_name,Female,In State Experience,Out of State Experience,Total_exp,...,White,Latinx,First Year in Position,Salary,Sick Days,Vacation Days,Bonus,Annuities,Retirement Benefits,Other Benefits
0,105018,2017,999,Leave of Absence,Iazzetto,Sarah,1,9,0.0,9,...,1,0,0,,,,,,,
1,117654,2017,999,Leave of Absence,Kindelin,Colleen,1,5,1.0,6,...,1,0,0,,,,,,,
2,129166,2017,999,Leave of Absence,ENGELHART,HEATHER,1,14,0.0,14,...,1,0,0,,,,,,,
3,165364,2017,999,Leave of Absence,SEIDITA,LAURA,1,9,0.0,9,...,1,0,0,,,,,,,
4,170969,2017,999,Leave of Absence,Henning,Sarah,1,6,0.0,6,...,1,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141384,517730,2017,200,Teacher,Murray,Ranai,1,2,0.0,2,...,1,0,1,,,,,,,
141385,517789,2017,200,Teacher,Smith,Elena,1,1,0.0,1,...,0,1,1,,,,,,,
141386,517810,2017,200,Teacher,Foley,Seamus,0,1,0.0,1,...,1,0,1,,,,,,,
141387,517819,2017,200,Teacher,Bergschneider,Kirstin,1,1,0.0,1,...,1,0,1,,,,,,,


In [5]:
CPS_df.dtypes

Tchr_ID                      int64
Year                         int64
Position_Code                int64
Position                    object
Last_Name                   object
First_name                  object
Female                       int64
In State Experience          int64
Out of State Experience    float64
Total_exp                    int64
Dist_17                     object
Dist_Name_SY17              object
Dist_18                     object
Dist_Name_SY18              object
Left_Dist                    int64
PWL_ID_17                   object
PWL_name_17                 object
PWL_ID_18                   object
PWL_name_18                 object
Left_Schl                    int64
Race/Ethnicity              object
Black                        int64
White                        int64
Latinx                       int64
First Year in Position       int64
Salary                     float64
Sick Days                  float64
Vacation Days              float64
Bonus               

In [6]:
#Drop any rows with NA
CPS_df = CPS_df.dropna(axis='rows', how='any')
CPS_df

Unnamed: 0,Tchr_ID,Year,Position_Code,Position,Last_Name,First_name,Female,In State Experience,Out of State Experience,Total_exp,...,White,Latinx,First Year in Position,Salary,Sick Days,Vacation Days,Bonus,Annuities,Retirement Benefits,Other Benefits
43,312213,2017,200,Teacher,Galarza,Sarah,1,5,0.0,5,...,1,0,1,1674.20,0.0,0.0,0.0,0.0,0.00,23.77
45,371145,2017,200,Teacher,Duncan,Melissa,1,15,0.0,15,...,1,0,1,1752.00,14.0,0.0,0.0,0.0,0.00,0.00
64,485987,2017,208,Career and Technical Educator (CTE),NOLLER,KARLA,1,0,0.0,0,...,1,0,1,1926.44,12.0,3.0,0.0,0.0,254.07,27.90
65,417381,2017,200,Teacher,Bean,Shawn,0,18,0.0,18,...,1,0,1,1942.00,14.0,0.0,0.0,0.0,0.00,0.00
66,224362,2017,200,Teacher,Ipema,Justin,0,11,0.0,11,...,1,0,1,1954.54,15.0,0.0,0.0,0.0,193.30,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135261,90637,2017,125,Head of Gen Ed (Depart chair admin endorsement...,ONUSCHECK,MARK,0,16,0.0,16,...,1,0,0,171563.00,16.0,20.0,0.0,0.0,0.00,50223.91
135262,378712,2017,152,Special Education Director,DWYER,ROSANNE,1,5,0.0,5,...,1,0,0,173126.58,15.0,20.0,0.0,0.0,16153.20,31716.60
135263,311856,2017,200,Teacher,Reece,Janet,1,29,0.0,29,...,1,0,1,173637.84,15.0,0.0,0.0,0.0,0.00,14643.60
135269,343363,2017,200,Teacher,De Tineo,Julie,1,13,0.0,13,...,1,0,1,189554.11,10.0,15.0,0.0,0.0,25640.23,28060.99


In [7]:
# Subset larger dataset to what we need
CPS_df = CPS_df[["Total_exp", "Salary", "White","Left_Schl"]]
CPS_df

Unnamed: 0,Total_exp,Salary,White,Left_Schl
43,5,1674.20,1,0
45,15,1752.00,1,0
64,0,1926.44,1,1
65,18,1942.00,1,0
66,11,1954.54,1,0
...,...,...,...,...
135261,16,171563.00,1,0
135262,5,173126.58,1,0
135263,29,173637.84,1,0
135269,13,189554.11,1,0


In [8]:
max(CPS_df["Salary"])

213611.16

## Build logistic model

In [9]:
# Assign X (data) and y (target)
X = CPS_df.drop("Left_Schl", axis=1)
y = CPS_df["Left_Schl"]
print(X.shape, y.shape)

(118048, 3) (118048,)


### Train model with training data

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Fit the model to the training data and calculate the scores for the training and testing data
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Test the model

In [13]:
#Validate the model using the test data

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9148820818650041
Testing Data Score: 0.9128490105719708


In [14]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 1, 1, 0, 1, 0, 0, 0, 0, 0]


In [15]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,1
3,0,0
4,0,1
...,...,...
29507,0,0
29508,0,1
29509,0,0
29510,0,0


In [16]:
from sklearn.linear_model import LinearRegression

score = classifier.score(X_test, y_test)
print(f"R2 Score: {score}")

R2 Score: 0.9128490105719708


### Very high pearsons correlation coefficient means that the model is working well

In [17]:
coeff = pd.DataFrame({"Feature":X_train.columns.tolist(),"Coefficients":classifier.coef_[0]})
coeff

Unnamed: 0,Feature,Coefficients
0,Total_exp,-9.646139e-09
1,Salary,-4.106465e-05
2,White,-6.975822e-10


### Get the intercept and coefficients to build equation to put into Java

In [18]:
print('Weight coefficients: ', classifier.coef_)
print('y-axis intercept: ', classifier.intercept_)

Weight coefficients:  [[-9.64613933e-09 -4.10646526e-05 -6.97582151e-10]]
y-axis intercept:  [-7.34807076e-10]


## Our linear model now looks like this: 

y = -.000000000735 - .00000000964(Total_Experience) - .0000411(Salary) - .000000000698(White)

In [19]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 1, 1, 0, 1, 0, 0, 0, 0, 0]


In [20]:
X_test[:10]

Unnamed: 0,Total_exp,Salary,White
132236,24,118590.76,1
64161,11,57749.0,0
16791,4,39725.0,1
102368,11,78414.39,1
63874,1,57659.8,1
27153,7,43932.59,1
120998,18,91872.87,1
82687,14,66136.07,1
31898,5,45717.0,1
1275,10,13762.87,1


In [40]:
import math
def calc(Total_exp, Salary, White):
    cof = -.000000000735 - (.00000000964 * Total_exp) - (.0000411 * Salary) - (.000000000698 * White)
    per = 1 / (1 + math.exp(1 + cof))
    return f"The probabilty of this teacher leaving is {per*100:.2f}%"

In [41]:
calc(24, 118590.76, 1)

'The probabilty of this teacher leaving is 97.96%'