In [None]:
#!/usr/bin/env python3
"""sklearn_logistic_regression.ipynb
James Gardner 2019
with significant contributions from Matthew Alger

performs logistic regression on feature vectors
against positional matching labels using sklearn

must be run in directory containing patch_catalogue.csv
as output by feature_vectors.ipynb as well as manual_labels.csv

will save the following:
sklearn_lr.pdf, sklearn_rf.pdf
"""

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

import sklearn.linear_model
import sklearn.ensemble

In [None]:
# Load the patch catalogue, keyed by the (TGSS, NVSS) name pair of each row.
catalogue = pd.read_csv('patch_catalogue.csv').set_index(['name_TGSS', 'name_NVSS'])

# Keep the score aside before it is dropped from the feature table;
# the labels are derived from it later.
scores = catalogue['score']

# remove positions and label, we don't ask regression to recover separation
catalogue = catalogue.drop(
    columns=['ra_TGSS', 'dec_TGSS', 'ra_NVSS', 'dec_NVSS', 'score'])

# uncomment to see other importances when separation can't dominate
# del catalogue['separation']

# Append log10 versions of the flux columns (and of the TGSS
# peak-to-integrated ratio) as additional features.
peak_tgss = catalogue['peak_TGSS']
integrated_tgss = catalogue['integrated_TGSS']
catalogue = catalogue.assign(
    log_flux_TGSS=np.log10(peak_tgss),
    log_integrated_TGSS=np.log10(integrated_tgss),
    log_ratio_flux_TGSS=np.log10(peak_tgss / integrated_tgss),
    log_flux_NVSS=np.log10(catalogue['peak_NVSS']),
)

In [None]:
# Full arrays first: a row is labelled positive when its score exceeds 0.1,
# and the feature matrix is every remaining catalogue column.
labels = scores.values > 0.1
features = catalogue.values

# Training subset "A" is every second row, starting from the first;
# the classifier is trained on A and tested on the full arrays.
_train_rows = slice(None, None, 2)
labels_A = labels[_train_rows]
features_A = features[_train_rows]

In [None]:
# Fit logistic regression on training half A, then measure its accuracy.
# here sklearn is way faster and more accurate compared to pytorch!
lr_model = sklearn.linear_model.LogisticRegression()
lr_model.fit(features_A, labels_A)
predictions = lr_model.predict(features)

# NOTE: every second row of `features` was used for fitting, so this figure
# is an optimistic estimate of how well the model generalises.
accuracy = np.mean(predictions == labels)
print('over whole catalogue: accuracy =', accuracy)

# The odd-position rows were never seen during fitting; report them
# separately so the score is comparable with a held-out evaluation.
accuracy_held_out = np.mean(predictions[1::2] == labels[1::2])
print('over held-out half: accuracy =', accuracy_held_out)

In [None]:
# careful, these are the magnitude of the weights:
# take |coefficient| and rescale so the bars sum to one. Dividing the signed
# coefficients by their signed sum (the previous behaviour) flips every bar
# when the sum is negative and blows up when positive and negative
# coefficients nearly cancel.
# NOTE(review): no feature scaling is applied in this notebook before fitting,
# so coefficient size also depends on each feature's units — confirm before
# reading these as importances.
weights = np.abs(lr_model.coef_)
total_magnitude = weights.sum()
if total_magnitude > 0:
    # leave an all-zero coefficient vector untouched rather than produce NaNs
    weights /= total_magnitude

# create plot of all the 'weights'
plt.figure(figsize=(14,7))
plt.rcParams.update({'font.size': 18})
plt.bar(range(len(weights[0])),weights[0])
plt.xlabel('weights')
plt.xticks(range(len(weights[0])),catalogue.columns,rotation='vertical')
plt.ylabel('co-eff')
plt.title('sklearn logistic regression - importances')
plt.savefig('sklearn_lr.pdf',bbox_inches='tight')

In [None]:
# Score the fitted logistic regression against the manually assigned labels.
manual_labels = pd.read_csv('manual_labels.csv').set_index(['name_TGSS', 'name_NVSS'])

# Pull the feature rows for exactly the manually labelled pairs.
man_cat = catalogue.loc[manual_labels.index.values]
features_man = man_cat.values
label_man = manual_labels['manual_label'].values
pred_man = lr_model.predict(features_man)

# Masks for "model says match" and "manual label says match".
predicted_match = (pred_man == True)
labelled_match = (label_man == True)

accuracy = (pred_man == label_man).mean()
# precision: of the pairs predicted as matches, the fraction manually labelled as matches
precision = (label_man[predicted_match] == True).mean()
# recall: of the pairs manually labelled as matches, the fraction predicted as matches
recall = (pred_man[labelled_match] == True).mean()

summary = 'on manual labels:\n accuracy = {0:.3f}, precision = {1:.3f}, recall = {2:.3f}'
print(summary.format(accuracy, precision, recall))

In [None]:
# similar, but now using a random forest model
# A fixed random_state makes the fitted forest, and therefore the accuracy
# and importances below, reproducible across kernel restarts.
rf_model = sklearn.ensemble.RandomForestClassifier(random_state=0)
rf_model.fit(features_A, labels_A)

# Evaluate only on the odd-position rows, which were not used for fitting.
predictions = rf_model.predict(features[1::2])
accuracy = np.mean(predictions == labels[1::2])
print('over held-out half of catalogue: accuracy =', accuracy)

# Rescale the importances to sum to one; skip the division when they are all
# zero so the result stays finite instead of becoming NaN.
rf_weights = rf_model.feature_importances_
rf_total = rf_weights.sum()
if rf_total > 0:
    rf_weights /= rf_total

In [None]:
# Bar chart of the random forest weights, one bar per catalogue column,
# saved to sklearn_rf.pdf.
plt.rcParams.update({'font.size': 18})
fig, ax = plt.subplots(figsize=(14, 7))

bar_positions = range(len(rf_weights))
ax.bar(bar_positions, rf_weights)

# label each bar with its feature name, written vertically so they fit
ax.set_xticks(bar_positions)
ax.set_xticklabels(catalogue.columns, rotation='vertical')

ax.set_xlabel('weights')
ax.set_ylabel('co-eff')
ax.set_title('sklearn random forest - importances')
fig.savefig('sklearn_rf.pdf', bbox_inches='tight')