In [None]:
# Sample Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# Load the iris data set that ships with scikit-learn
# http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
dataset = datasets.load_iris()
# Construct a decision tree classifier with default settings.
# Nothing is fitted here; fit() is called in a later cell.
model = DecisionTreeClassifier()
print(model)

In [None]:
# Train on the full iris data, then score the model on that same data
features, expected = dataset.data, dataset.target
model.fit(features, expected)
# Predictions for the very rows the model was trained on
predicted = model.predict(features)
# Summarize the fit: per-class report first, confusion matrix second
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(expected, predicted))

In [None]:
# Display the loaded object; the arrays used above are dataset.data and dataset.target
dataset #.data .target

<h2>Regression</h2>

Data:
Crime related data for small cities in the United States<br>
<li>X1 = total overall reported crime rate per 1 million residents
<li>X2 = reported violent crime rate per 100,000 residents
<li>X3 = annual police funding in $ per resident
<li>X4 = % of people 25 years+ with 4 yrs. of high school
<li>X5 = % of 16 to 19 year-olds not in high school and not high school graduates.
<li>X6 = % of 18 to 24 year-olds in college
<li>X7 = % of people 25 years+ with at least 4 years of college
<br>
(Reference: Life In America's Small Cities, By G.S. Thomas)


<h4>We are interested in identifying the drivers of violent crime in a city</h4>

<h4>Read the data</h4>

In [None]:
import pandas as pd
# Read the crime table from the working directory; dtype=float parses every column as float
crime_data = pd.read_csv('Class 11 - Crime.csv', dtype=float)

In [None]:
# Show the raw table as loaded (before the scaling step in the next cell)
crime_data

In [None]:
from sklearn import preprocessing
# Apply preprocessing.scale to each column in turn.
# Note that this rebinds crime_data to the scaled frame.
crime_data = crime_data.apply(preprocessing.scale)
crime_data

<h4>Let's see if there is a relationship between total crime and violent crime</h4>


In [None]:
# X1 = total overall reported crime rate per 1 million residents
# X2 = reported violent crime rate per 100,000 residents

import matplotlib.pyplot as plt
%matplotlib inline

# Same scatter, written against explicit figure/axes objects
fig, ax = plt.subplots()
ax.set_title('Crime plot')
ax.set_xlabel('violent crime')
ax.set_ylabel('total crime')
# X2 (violent crime) on the x axis, X1 (total crime) on the y axis, black dots
ax.plot(crime_data['X2'], crime_data['X1'], 'k.')
# Fixed window [xmin, xmax, ymin, ymax]
ax.axis([-3, 3, -3, 3])
ax.grid(True)
plt.show()

In [None]:
# Functionalize it (minus the axis limit)
def draw_scatter(x,y,title=None,x_label=None,y_label=None):
    """Draw a black-dot scatter plot of ``y`` against ``x`` on a new figure.

    Parameters
    ----------
    x, y : sequences accepted by ``plt.plot``
        Values for the horizontal and vertical axis respectively.
    title : str, optional
        Figure title.
    x_label : str, optional
        Label for the horizontal axis (the axis ``x`` is plotted on).
    y_label : str, optional
        Label for the vertical axis (the axis ``y`` is plotted on).

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Local import keeps the helper usable even if the plotting cell above
    # has not been run. The %matplotlib magic is a notebook-level setting and
    # is issued once at cell level, not on every call.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.title(title)
    # Each label goes on the axis its data is drawn on (these were swapped before)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.plot(x, y, 'k.')
    plt.grid(True)
    plt.show()

In [None]:
# Violent crime (X2) on x, total crime (X1) on y
violent_crime, total_crime = crime_data['X2'], crime_data['X1']
draw_scatter(violent_crime, total_crime,
             title='Crime Chart', x_label='violent', y_label='total')

<h4>Let's run a regression with violent crime as the dependent variable and total crime as the independent variable</h4>

<h4>First create a training and testing sample</h4>
We will use scikit-learn's train_test_split function for this

In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# sample means and R-squared values below are the same on every
# Restart & Run All; change the seed to see how much they vary.
train, test = train_test_split(crime_data, test_size=0.3, random_state=42)
print(train)
print(test)

In [None]:
# Mean violent-crime value in each sample, train first and then test
for sample in (train, test):
    print(sample['X2'].mean())

<h4>Now we can run the regression on the train sample</h4>
First, we need to reshape the two Series into column matrices. Scikit-learn expects 2-D arrays of shape (n_samples, n_features), not one-dimensional pandas Series.

In [None]:
print(train['X1'])
print("----------------")
# Series.reshape was deprecated and then removed from pandas, so take the
# underlying NumPy array and reshape that. reshape(-1, 1) yields one column
# with one row per observation, the 2-D layout scikit-learn expects.
x_train = train['X1'].values.reshape(-1, 1)
y_train = train['X2'].values.reshape(-1, 1)
x_test = test['X1'].values.reshape(-1, 1)
y_test = test['X2'].values.reshape(-1, 1)
print(x_train)

In [None]:
from sklearn.linear_model import LinearRegression

# Create and fit the model
model = LinearRegression()
model.fit(x_train,y_train)

# predict() takes a 2-D array of shape (n_samples, n_features), so the single
# observation X1 = 1 is passed as [[1]] rather than the bare scalar 1
print(model.predict([[1]]))
# Mean squared error on the training sample
print(np.mean((model.predict(x_train) - y_train) ** 2))


In [None]:
# Predicted X2 values for the held-out test rows
model.predict(x_test)

<h4>Using the test sample</h4>


In [None]:
# Score the fitted model on each sample, train first and then test
samples = (
    ('Train R-Square:', x_train, y_train),
    ('Test R-Square:', x_test, y_test),
)
for caption, features, target in samples:
    print(caption, model.score(features, target))

<h4>Let's see if police funding matters</h4>

In [None]:
# X1 = total overall reported crime rate per 1 million residents
# X2 = reported violent crime rate per 100,000 residents
# X3 = annual police funding in $ per resident

# Violent crime (X2) on x, police funding (X3) on y
violent_crime, police_funding = crime_data['X2'], crime_data['X3']
draw_scatter(violent_crime, police_funding,
             title='Crime Chart', x_label='violent', y_label='funding')

<h4>Adding it to the regression</h4>

In [None]:
# Two predictors (X1, X3) and the response (X2), selected identically from both samples
predictor_columns, response_column = ['X1','X3'], 'X2'
x_train, y_train = train[predictor_columns], train[response_column]
x_test, y_test = test[predictor_columns], test[response_column]

In [None]:
# Display the test-sample predictor columns selected in the previous cell
x_test

In [None]:
# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(x_train, y_train)
for caption, features, target in (('Train r-square', x_train, y_train),
                                  ('Test r-square', x_test, y_test)):
    print(caption, model.score(features, target))
# Predictions for the held-out rows, used by the cells below
predictions = model.predict(x_test)

In [None]:
# Display the array of test-sample predictions computed in the previous cell
predictions

In [None]:
# Walk the predictions and the test rows in lockstep and print
# (predicted X2, actual X2) for each held-out row
for prediction, (row_label, row_values) in zip(predictions, test.iterrows()):
    print(prediction, row_values['X2'])
    

<h3>Polynomials in the regression</h3>
<h4>Perhaps a polynomial will fit the data better</h4>

In [None]:
# Revisit the violent (X2) versus total (X1) scatter before trying a polynomial fit
chart_labels = dict(title='Crime Chart', x_label='violent', y_label='total')
draw_scatter(crime_data['X2'], crime_data['X1'], **chart_labels)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Column vectors of shape (n_samples, 1) for scikit-learn. Series.reshape no
# longer exists in pandas, so reshape the underlying NumPy array instead.
x_train = train['X1'].values.reshape(-1, 1)
y_train = train['X2'].values.reshape(-1, 1)
x_test = test['X1'].values.reshape(-1, 1)
y_test = test['X2'].values.reshape(-1, 1)

# Baseline: straight-line fit of violent crime (X2) on total crime (X1)
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(-3, 3, 10000)  # evenly spaced grid on which to draw the fitted curves
yy = regressor.predict(xx.reshape(xx.shape[0], 1))
fig = plt.figure(figsize=(12,12))
plt.plot(xx, yy)

# Degree-2 expansion: each x becomes the columns [1, x, x**2]. The featurizer
# is fitted on the training data and then reused for the test data and the grid.
quadratic_featurizer = PolynomialFeatures(degree=2)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)
x_test_quadratic = quadratic_featurizer.transform(x_test)

# A linear regression on the expanded columns is a quadratic fit in x
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))

# Quadratic fit as a red dashed line over the straight-line fit
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r', linestyle='--')
plt.title('violent crime as a function of total crime')
plt.xlabel('total crime')
plt.ylabel('violent crime')
plt.grid(True)
plt.scatter(x_train, y_train)
plt.show()

print(x_train)
print(x_train_quadratic)
print(x_test)
print(x_test_quadratic)
# Compare both fits on the held-out sample
print('Simple linear regression r-squared', regressor.score(x_test, y_test))
print('Quadratic regression r-squared', regressor_quadratic.score(x_test_quadratic, y_test))

In [None]:
# X2 = reported violent crime rate per 100,000 residents
# X7 = % of people 25 years+ with at least 4 years of college 

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Column vectors of shape (n_samples, 1) for scikit-learn. Series.reshape no
# longer exists in pandas, so reshape the underlying NumPy array instead.
x_train = train['X7'].values.reshape(-1, 1)
y_train = train['X2'].values.reshape(-1, 1)
x_test = test['X7'].values.reshape(-1, 1)
y_test = test['X2'].values.reshape(-1, 1)

# Baseline: straight-line fit of violent crime (X2) on college education (X7)
regressor = LinearRegression()
regressor.fit(x_train, y_train)
xx = np.linspace(-3, 3, 1000)  # 1000 evenly spaced numbers between -3 and 3
yy = regressor.predict(xx.reshape(xx.shape[0], 1))
fig = plt.figure(figsize=(12,12))
plt.plot(xx, yy)

# NOTE: the *_quadratic names are kept from the previous cell, but this cell
# fits a degree-3 polynomial: each x becomes the columns [1, x, x**2, x**3].
polynomial_degree = 3
quadratic_featurizer = PolynomialFeatures(degree=polynomial_degree)
x_train_quadratic = quadratic_featurizer.fit_transform(x_train)

x_test_quadratic = quadratic_featurizer.transform(x_test)

# A linear regression on the expanded columns is a polynomial fit in x
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(x_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))

# Polynomial fit as a red dashed line over the straight-line fit
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r', linestyle='--')
# The x axis is X7 (college education), not total crime
plt.title('violent crime as a function of college education')
plt.xlabel('% of people 25+ with at least 4 years of college (standardized)')
plt.ylabel('violent crime')
plt.grid(True)
plt.scatter(x_train, y_train)
plt.show()

print(x_train)
print(x_train_quadratic)
print(x_test)
print(x_test_quadratic)
# Compare both fits on the held-out sample
print('Simple linear regression r-squared', regressor.score(x_test, y_test))
print('Polynomial (degree %d) regression r-squared' % polynomial_degree,
      regressor_quadratic.score(x_test_quadratic, y_test))


<h2>Decision Trees</h2>
Mobile phone usage data

In [None]:
# Read the phone usage table; index_col=0 makes the first CSV column the row index
phone_data=pd.read_csv('Class 11 - Phone usage.csv',index_col=0)
phone_data

In [None]:
from sklearn import preprocessing

# Replace the values in every column with integer codes, one encoder fit per column
phone_data_enc = phone_data.apply(
    lambda column: preprocessing.LabelEncoder().fit_transform(column)
)
phone_data_enc
# Careful: the integer codes may not be the ones you would expect!
#   Usage:     Low = 1, medium = 2, High = 0
#   Education: College = 0, University = 2, High School = 1
#   Marital:   Married = 0, single = 1
#   etc.

<h4>Create train/test samples</h4>

In [None]:
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 30% of the encoded rows for testing. random_state pins the shuffle
# so the scores and exported trees below are the same on every run.
train, test = train_test_split(phone_data_enc, test_size=0.3, random_state=42)
feature_columns = ['income','age','education','marital']
x_train = train[feature_columns]
# Targets stay one-column DataFrames: later cells index them as y_test['usage']
y_train = train[['usage']]
x_test = test[feature_columns]
y_test = test[['usage']]
train

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# A one-level tree (a single split) that picks its split by entropy
tree_settings = dict(max_depth=1, criterion="entropy")
clf = DecisionTreeClassifier(**tree_settings)
# Train on the training sample; the fitted estimator is the cell's output
clf.fit(x_train, y_train)

<h4>The score function returns the mean accuracy of the classifier on the given data (the share of rows predicted correctly)</h4>

In [None]:
# Classifier score on the training sample, then on the test sample
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(clf.score(features, labels))

<h4>Check predictions</h4>

In [None]:
predictions = clf.predict(x_test)
# Print (predicted, actual) usage code for every test row, in row order
for prediction, (row_label, row_values) in zip(predictions, y_test.iterrows()):
    print(prediction, row_values['usage'])

# The same information as two flat lists, then the overall accuracy
actual_usage = list(y_test['usage'])
print(actual_usage)
print(list(predictions))
metrics.accuracy_score(actual_usage, predictions)

<h4>Export to a graphviz file to visualize the tree</h4>

In [None]:
# Write the fitted tree to a Graphviz .dot file. export_graphviz expects the
# fitted estimator itself, not its low-level clf.tree_ structure. Because
# out_file is a path, the function writes the file and dotfile ends up None.
dotfile = tree.export_graphviz(clf, out_file='Class 11 - test.dot',
                               feature_names=['income','age','education','marital'])

In [None]:
#!pip install GraphViz 
#!pip install pydotplus 

In [None]:
import pydotplus 

# With out_file=None the DOT source is returned as a string. export_graphviz
# expects the fitted estimator itself, not its low-level clf.tree_ structure.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=['income','age','education','marital'])
# Render the DOT source to a PDF in the working directory via pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("Class 11 - test.pdf")

<h4>Expanding the tree</h4>

In [None]:
# Same classifier as before, now allowed two levels of splits
tree_settings = dict(max_depth=2, criterion="entropy")
clf = DecisionTreeClassifier(**tree_settings)
# Train on the training sample; the fitted estimator is the cell's output
clf.fit(x_train, y_train)

In [None]:
# Write the deeper tree to a Graphviz .dot file. export_graphviz expects the
# fitted estimator itself, not its low-level clf.tree_ structure. Because
# out_file is a path, the function writes the file and dotfile ends up None.
dotfile = tree.export_graphviz(clf,
                               out_file='Class 11 - test2.dot',
                               feature_names=['income','age','education','marital'])

In [None]:
import pydotplus 
# With out_file=None the DOT source is returned as a string. export_graphviz
# expects the fitted estimator itself, not its low-level clf.tree_ structure.
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=['income','age','education','marital'])
# Render the DOT source to a PDF in the working directory via pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("Class 11 - test2.pdf")

In [None]:
# Score of the two-level tree on the training sample, then on the test sample
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(clf.score(features, labels))

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validated scores over the whole encoded table (default fold settings)
phone_features = phone_data_enc[['income','age','education','marital']]
phone_usage = phone_data_enc['usage']
cross_val_score(clf, phone_features, phone_usage)

In [None]:
from sklearn.model_selection import cross_val_predict
# Cross-validated prediction for every row, then accuracy against the true labels
phone_features = phone_data_enc[['income','age','education','marital']]
phone_usage = phone_data_enc['usage']
predicted = cross_val_predict(clf, phone_features, phone_usage)
metrics.accuracy_score(phone_usage, predicted)