In [137]:
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
sns.set('notebook')

def transform_milk(data):
    return data[['animal_id','date','milk_weight']]

def transform_genetics(data):
    data = data.drop(axis=1, labels=['name', 'index', 'level_0', '%p', '%f' ])
    data = data.drop_duplicates(subset=['animal_id'], keep='last')
    return data.fillna(data.mean())[['animal_id', 'milk', 'ctpi']]
    
def transform_classification(data):
    data = data.drop(axis=1, labels=['index', 'age', 'date_calved', 'date', 'category'])
    data = data.drop_duplicates(subset=['animal_id'], keep='last')
    return data.fillna(data.mean())[['animal_id', 'final_score', 'dairy_form','udder_score_aggregate']]

def append_cow_age_to_milk_dataframe(milk, birthdates):
    """ Adds a cows age in month for each day milked"""
    milk = milk.reset_index()
    milk = pd.merge(left=milk, right=birthdates, left_on='animal_id', right_on='animal_id')
    milk['months_aged'] = (milk_with_ages['date'] - milk_with_ages['birthdate']).astype('timedelta64[M]')
    return milk

def retrieve_data():
    engine =  create_engine('sqlite:///../data/database/happycows.db')
    with engine.connect() as con:
        milk = pd.read_sql_table('milk_volume', con)
        birthdates = pd.read_sql_table('birthdates', con)
        genetics = pd.read_sql_table('genetics', con)
        classification = pd.read_sql_table('classification', con)
        
    return transform_milk(milk), birthdates, transform_genetics(genetics), transform_classification(classification)

def number_of_days_in_period(start, end):
    end = datetime.strptime(end, '%m-%d-%Y')
    start = datetime.strptime(start, '%m-%d-%Y')
    return (start - end).days

def build_performance_profiles_for_period(milk, birthdates, start='1-1-2016', end='12-31-2017'):
    period_milk = milk.set_index('date')[start:end]
    period_milk = pd.merge(left=period_milk, right=birthdates, left_on='animal_id', right_on='animal_id')
    
    days_in_period = number_of_days_in_period(start, end)
    
    profiles = []
    for animal_id in period_milk['animal_id'].unique():
        animal_milk_for_period = period_milk[period_milk['animal_id'] ==  animal_id]
        birthdates
        
        profile = {}
        profile['Animal ID'] = animal_id 
        profile['Days Milked'] = animal_milk_for_period.count()['animal_id']
        profile['Total Milk-Pounds'] = animal_milk_for_period['milk_weight'].sum()
        profile['Per Day Average'] = round(animal_milk_for_period['milk_weight'].mean(), 2)
        
        profile['Birthdate'] = animal_milk_for_period['birthdate'].values[0]
        
        profiles.append(profile)
        
    data = pd.DataFrame(profiles).set_index('Animal ID')
    return data.sort_values('Per Day Average', ascending=False)

def standardize_performance_profiles(profile):


In [138]:
milk, birthdates, genetics, classifications = retrieve_data()

In [139]:
performance_profiles = build_performance_profiles_for_period(milk, birthdates).sort_values('Per Day Average')
performance_profiles = performance_profiles[performance_profiles['Days Milked'] > 400].reset_index()
performance_profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 5 columns):
Animal ID            127 non-null int64
Birthdate            127 non-null datetime64[ns]
Days Milked          127 non-null int64
Per Day Average      127 non-null float64
Total Milk-Pounds    127 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 5.0 KB


In [140]:
genetics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177 entries, 0 to 176
Data columns (total 3 columns):
animal_id    177 non-null int64
milk         177 non-null int64
ctpi         177 non-null int64
dtypes: int64(3)
memory usage: 5.5 KB


In [141]:
classifications.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288 entries, 0 to 427
Data columns (total 4 columns):
animal_id                288 non-null float64
final_score              288 non-null float64
dairy_form               288 non-null float64
udder_score_aggregate    288 non-null float64
dtypes: float64(4)
memory usage: 11.2 KB


In [142]:
data = pd.merge(left=performance_profiles, right=genetics, left_on='Animal ID', right_on='animal_id').drop(axis=1, labels=['animal_id'])
data = pd.merge(left=data, right=classifications, left_on='Animal ID', right_on='animal_id').drop(axis=1, labels=['animal_id'])
data = data.sort_values('Total Milk-Pounds').reset_index().drop(axis=1, labels=['index', 'Birthdate'])

In [143]:
bot = data.head(13)
bot

Unnamed: 0,Animal ID,Days Milked,Per Day Average,Total Milk-Pounds,milk,ctpi,final_score,dairy_form,udder_score_aggregate
0,2009,432,57.43,24809.6,186,1901,83.0,30.0,83.0
1,2158,422,61.69,26032.6,32,1949,81.0,21.0,82.0
2,1946,402,65.49,26325.7,-492,1778,81.0,13.0,82.0
3,2157,438,62.62,27428.6,-203,1963,83.0,24.0,85.0
4,2049,412,71.96,29647.4,2,1635,80.0,42.0,80.0
5,2166,405,73.43,29740.4,772,2191,81.0,24.0,79.0
6,1832,454,66.73,30294.3,8,1532,83.0,33.0,83.0
7,1961,491,61.91,30398.7,-108,1417,82.0,24.0,82.0
8,2111,458,66.97,30670.8,297,2061,79.0,24.0,77.0
9,2152,426,73.09,31134.3,1402,2046,80.0,30.0,81.0


In [144]:
top = data.tail(13)
top

Unnamed: 0,Animal ID,Days Milked,Per Day Average,Total Milk-Pounds,milk,ctpi,final_score,dairy_form,udder_score_aggregate
103,2030,592,87.6,51856.6,1140,1981,83.0,35.0,83.0
104,2075,638,81.68,52113.2,498,1711,85.0,30.0,90.0
105,1985,611,85.78,52409.8,966,1941,78.0,26.0,80.0
106,2076,615,85.26,52433.0,906,1948,76.0,32.0,75.0
107,2043,614,85.77,52661.8,1132,2150,79.0,18.0,82.0
108,1918,585,91.3,53412.1,806,1785,84.0,30.0,86.0
109,1994,636,84.85,53963.3,885,1744,82.0,26.0,85.0
110,1815,628,85.96,53980.3,419,1629,85.0,30.0,85.0
111,1729,567,98.09,55615.7,-262,1696,85.0,25.0,85.0
112,2045,635,89.86,57059.9,539,1772,83.0,35.0,83.0


In [145]:
display_table = data.sample(20).set_index('Animal ID').rename(columns={"milk":"Genetics: Milk Score", 
                                                               "ctpi":"Genetics: CTPI",
                                                               "final_score":"Classification: Final Score",
                                                               "dairy_form":"Classfiication: Dairy Form",
                                                               "udder_score_aggregate":"Classification: Udder Score"})
display_table

Unnamed: 0_level_0,Days Milked,Per Day Average,Total Milk-Pounds,Genetics: Milk Score,Genetics: CTPI,Classification: Final Score,Classfiication: Dairy Form,Classification: Udder Score
Animal ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2095,548,64.48,35334.9,297,1821,81.069343,28.161172,81.270073
1933,544,70.12,38143.1,-530,1382,81.0,25.0,78.0
2063,467,83.16,38834.8,663,1800,79.0,28.0,77.0
2152,426,73.09,31134.3,1402,2046,80.0,30.0,81.0
2091,651,103.68,67492.6,1350,1744,79.0,39.0,79.0
2096,567,61.79,35037.6,-585,1756,85.0,26.0,86.0
2087,556,84.68,47081.3,903,1841,82.0,42.0,82.0
1965,437,73.65,32185.2,852,1689,83.0,26.0,83.0
2109,585,73.32,42894.6,430,2118,76.0,22.0,77.0
2117,433,85.37,36965.4,398,2158,83.0,36.0,86.0


In [None]:
### Tune Hyperparameters
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

k_values = [x for x in range(2,11)]
standardized_data = StandardScaler().fit_transform(data)

silhouette_averages = []
for k in k_values:
    clusterer = KMeans(n_clusters=k, random_state=45)
    cluster_labels = clusterer.fit_predict(standardized_data)
    silhouette_averages.append(silhouette_score(standardized_data, cluster_labels))

ax = plt.plot(k_values, silhouette_averages)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Average')
plt.title('Sum-of-Squares for k-clusters')
plt.savefig('figures/silhouette_score.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
animal_ids = data['Animal ID']

#your turn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
random_state = 25

standardized_data = StandardScaler().fit_transform(data)
pca_components = PCA(n_components=2, random_state=random_state).fit_transform(standardized_data)
cluster_labels = KMeans(n_clusters=5, random_state=random_state).fit_predict(data)
x = pca_components[:,0]
y = pca_components[:,1]

results = pd.DataFrame({'cluster_id':cluster_labels, 'x':x, 'y':y, 'animal_id':animal_ids})
results['Is in top 10%'] = results['animal_id'].isin(top['Animal ID'])
results['Is in bottom 10%'] = results['animal_id'].isin(bot['Animal ID'])
results = results.drop_duplicates(subset=['animal_id'], keep='first')

sns.lmplot('x', 'y', data=results, hue='cluster_id', fit_reg=False)
_ = plt.title('Animal Clusters: k-means (k=5)')
_ = plt.xlabel("")
_ = plt.ylabel("")
plt.savefig('figures/kmeans_clusters.png', dpi=300, bbox_inches='tight')
plt.show()

sns.lmplot('x', 'y', data=results, hue='Is in bottom 10%', fit_reg=False)
_ = plt.title('Location of Bottom 10% of Herd in PCA Analysis')
_ = plt.xlabel("")
_ = plt.ylabel("")
plt.savefig('figures/bottom_ten_pca.png', dpi=300, bbox_inches='tight')
plt.show()

In [151]:
cluster_profiles = pd.DataFrame()
cluster_profiles['Cluster Number'] = results.groupby('cluster_id').count().reset_index()['cluster_id']
cluster_profiles['Animal Count'] = results.groupby('cluster_id').count()['animal_id']
cluster_profiles['Count Top Performer'] = results.groupby('cluster_id').sum()['Is in top 10%']
cluster_profiles['Count Bottom Performer'] = results.groupby('cluster_id').sum()['Is in bottom 10%']
cluster_profiles['% of Cluster in Bottom'] = round(cluster_profiles['Count Bottom Performer'] / \
                                                  cluster_profiles['Animal Count'], 2)
cluster_profiles

Unnamed: 0,Cluster Number,Animal Count,Count Top Performer,Count Bottom Performer,% of Cluster in Bottom
0,0,16,0.0,13.0,0.81
1,1,40,0.0,0.0,0.0
2,2,43,0.0,0.0,0.0
3,3,3,3.0,0.0,0.0
4,4,14,10.0,0.0,0.0


In [154]:
r = results[['animal_id', 'cluster_id']]
d = pd.merge(left=r, right=data, left_on='animal_id', right_on='Animal ID')
d = d[d['cluster_id'].isin([0])]
#clustered_animals = results[results['cluster_id'].isin([3, 4])]['animal_id']
#data
d

Unnamed: 0,animal_id,cluster_id,Animal ID,Days Milked,Per Day Average,Total Milk-Pounds,milk,ctpi,final_score,dairy_form,udder_score_aggregate
0,2009,0,2009,432,57.43,24809.6,186,1901,83.0,30.0,83.0
1,2158,0,2158,422,61.69,26032.6,32,1949,81.0,21.0,82.0
2,1946,0,1946,402,65.49,26325.7,-492,1778,81.0,13.0,82.0
3,2157,0,2157,438,62.62,27428.6,-203,1963,83.0,24.0,85.0
4,2049,0,2049,412,71.96,29647.4,2,1635,80.0,42.0,80.0
5,2166,0,2166,405,73.43,29740.4,772,2191,81.0,24.0,79.0
6,1832,0,1832,454,66.73,30294.3,8,1532,83.0,33.0,83.0
7,1961,0,1961,491,61.91,30398.7,-108,1417,82.0,24.0,82.0
8,2111,0,2111,458,66.97,30670.8,297,2061,79.0,24.0,77.0
9,2152,0,2152,426,73.09,31134.3,1402,2046,80.0,30.0,81.0


In [None]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
print(__doc__)

# Code source: Tyler Lanigan <tylerlanigan@gmail.com>
#              Sebastian Raschka <mail@sebastianraschka.com>

# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)

data

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

# Extract PCA from pipeline
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show first principal componenets
print('\nPC 1 without scaling:\n', pca.components_[0])
print('\nPC 1 with scaling:\n', pca_std.components_[0])

# Scale and use PCA on X_train data for visualization.
scaler = std_clf.named_steps['standardscaler']
X_train_std = pca_std.transform(scaler.transform(X_train))

# visualize standardized vs. untouched dataset with PCA performed
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)


for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax1.scatter(X_train[y_train == l, 0], X_train[y_train == l, 1],
                color=c,
                label='class %s' % l,
                alpha=0.5,
                marker=m
                )

for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1],
                color=c,
                label='class %s' % l,
                alpha=0.5,
                marker=m
                )

ax1.set_title('Training dataset after PCA')
ax2.set_title('Standardized training dataset after PCA')

for ax in (ax1, ax2):
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()

plt.show()
Total running time of the script: ( 0 minutes 0.203 seconds)

Download Python source code: plot_scaling_importance.py
 
Download Jupyter notebook: plot_scaling_importance.ipynb
Generated by Sphinx-Gallery

© 2007 - 2017, scikit-learn developers (BSD License). Show this page source
Previous
Next