In [1]:
import matplotlib

matplotlib.use('TkAgg')
from test_pipeline import test_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors.kde import KernelDensity
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
import pandas
from feature_selection_pipeline import kruskal_wallis, select_k_best, ROC
from prediction_pipeline import kfold_cross_val_predictions, train_test_predictions
import copy

def categorize_data(data):
    """Encode the wind-direction and location columns as 1-based integer codes.

    The three wind-direction columns ('WindGustDir', 'WindDir9am',
    'WindDir3pm') share one label -> code mapping, so a given direction label
    receives the same integer in every column. 'Location' gets its own
    mapping. Codes follow the sorted order of the distinct non-null labels and
    start at 1. Missing values stay missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns named above. It is modified in
        place (the four columns are overwritten with their codes).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    wind_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

    # Build the vocabulary from all three columns, not just 'WindGustDir':
    # a label that only occurs in 'WindDir9am' or 'WindDir3pm' would otherwise
    # be left as a raw string in an otherwise numeric column. When
    # 'WindGustDir' already contains every label the codes are unchanged.
    wind_labels = (
        pandas.concat([data[c] for c in wind_cols], ignore_index=True)
        .astype('category')
        .cat.categories.tolist()
    )
    wind_codes = {label: code for code, label in enumerate(wind_labels, start=1)}
    for c in wind_cols:
        # Per-column map instead of whole-frame replace(): touches only the
        # target column and yields a numeric dtype directly.
        data[c] = data[c].map(wind_codes)

    location_labels = data['Location'].astype('category').cat.categories.tolist()
    location_codes = {label: code for code, label in enumerate(location_labels, start=1)}
    data['Location'] = data['Location'].map(location_codes)

    return data

def get_preprocessed_data(filename='weatherAUS.csv'):
    """Load the weather CSV and turn it into a numeric feature frame plus labels.

    Steps, in order:
      1. drop columns that are missing in more than 20% of the rows,
      2. drop rows that still contain any missing value,
      3. drop the 'RISK_MM' column,
      4. map 'RainTomorrow' / 'RainToday' from 'Yes'/'No' to 1/0,
      5. integer-encode the wind-direction and location columns via
         ``categorize_data``,
      6. split off 'RainTomorrow' as the label and drop 'Date' and 'Location'
         from the features.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file to read. Defaults to 'weatherAUS.csv'.

    Returns
    -------
    tuple
        ``({'x': features, 'y': labels}, states, n_features)`` where
        ``features`` is a DataFrame, ``labels`` is a 1-D array of 0/1 values,
        ``states`` is the list ``["All", <each distinct Location>...]`` taken
        after the missing-value filtering, and ``n_features`` is the number of
        columns in ``features``.
    """
    # Load data set
    data_raw = pandas.read_csv(filename)

    # Remove features that have more than 20% of missing values.
    # `axis` is passed by keyword: pandas >= 2.0 no longer accepts it
    # positionally in dropna().
    data_less_raw = data_raw.dropna(axis=1, thresh=len(data_raw.index) * 0.8)

    # Remove examples that have any missing values
    data_less_raw = data_less_raw.dropna(axis=0, how='any')

    # Remove RISK_MM
    data_less_raw = data_less_raw.drop(['RISK_MM'], axis=1)
    states = ["All"]
    states.extend(data_less_raw['Location'].unique())

    # Work on a copy so the encoding below (which mutates its argument)
    # does not touch the filtered raw frame.
    data = data_less_raw.copy()
    data['RainTomorrow'] = data['RainTomorrow'].map({'Yes': 1, 'No': 0})
    data['RainToday'] = data['RainToday'].map({'Yes': 1, 'No': 0})

    data = categorize_data(data)

    # to_numpy() replaces the deprecated Series.ravel(); both give a 1-D array.
    data_y = data['RainTomorrow'].to_numpy()
    data = data.drop(['Date', 'Location', 'RainTomorrow'], axis=1)

    return {'x': data, 'y': data_y}, states, len(data.columns)

# Two independent Gaussian-kernel density estimators: kde1 is later fitted on
# the class-0 training rows and kde2 on the class-1 training rows.
kde1, kde2 = (KernelDensity(kernel='gaussian') for _ in range(2))

In [2]:
# Load and preprocess the weather data set. Unpacks into:
#   data        - dict with the feature frame under 'x' and the label array under 'y'
#   states      - list starting with "All" followed by the distinct Location values
#   num_columns - number of feature columns in data['x']
data, states, num_columns = get_preprocessed_data()

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn.metrics import roc_auc_score
from statistics import mean


# Hold out 25% of the samples for testing; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(data['x'], data['y'], test_size=0.25, random_state=1)
# Boolean masks over the training rows, one per RainTomorrow class
# (0 = 'No', 1 = 'Yes', per the mapping applied in get_preprocessed_data).
no_indices = y_train == 0
yes_indices = y_train == 1
# Cell output: the training rows whose label is 0.
X_train[no_indices]

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday
89277,22.1,29.7,0.4,6,48.0,1,6,7.0,35.0,62.0,60.0,1013.9,1008.2,27.0,27.9,0
48396,-1.5,15.9,0.0,8,31.0,8,4,6.0,17.0,72.0,45.0,1022.0,1016.8,7.2,15.5,0
1196,10.0,14.2,0.8,15,35.0,14,14,17.0,15.0,84.0,79.0,1014.2,1013.5,10.3,11.2,0
58153,15.1,35.5,0.0,14,35.0,10,15,9.0,15.0,59.0,15.0,1010.8,1006.9,21.5,34.1,0
109671,10.7,32.3,0.0,9,35.0,2,10,13.0,17.0,40.0,28.0,1020.5,1017.2,23.2,30.8,0
77892,11.3,28.7,0.0,9,31.0,11,13,11.0,15.0,65.0,46.0,1024.7,1020.9,16.4,25.4,0
69327,10.2,20.4,0.0,13,46.0,14,13,15.0,26.0,79.0,40.0,1012.6,1012.4,14.5,18.8,0
2439,5.7,21.8,0.0,16,37.0,11,16,7.0,17.0,43.0,28.0,1017.7,1015.6,12.8,20.4,0
83316,20.8,28.1,0.4,2,19.0,15,2,2.0,9.0,79.0,73.0,1020.9,1017.3,24.7,26.4,0
57312,4.6,12.8,0.0,16,30.0,9,16,17.0,17.0,68.0,59.0,1025.9,1022.5,8.9,12.0,0


In [4]:
# Fit one density model per class: kde1 on the training rows labelled 0,
# kde2 on the training rows labelled 1.
# NOTE(review): the label argument passed to fit() is presumably ignored by
# KernelDensity (it is an unsupervised estimator) — confirm against the
# installed scikit-learn version.
kde1.fit(X_train[no_indices], y_train[no_indices])
kde2.fit(X_train[yes_indices], y_train[yes_indices])
# Score every test row under each class model; the two score arrays are
# compared element-wise in the following cell to pick a class.
score_no = kde1.score_samples(X_test)
score_yes = kde2.score_samples(X_test)

In [5]:
# Predict class 0 only where the class-0 model scored the sample strictly
# higher than the class-1 model; every other case (including ties) is class 1.
classif = [
    1 if not (no_score > yes_score) else 0
    for no_score, yes_score in zip(score_no, score_yes)
]


In [6]:
from sklearn.metrics import (roc_auc_score, accuracy_score, classification_report, confusion_matrix)
from visualizations import plot_confusion_matrix
# Per-class precision / recall / F1 for the KDE-based predictions, followed by
# the overall accuracy.
# NOTE(review): in the recorded run the accuracy is ~0.60, while 21858 of the
# 28232 test rows are class 0, so always predicting 0 would score ~0.77. The
# decision rule compares the two class scores directly without weighting by
# class frequency, and the features are not rescaled before the density fit —
# both are worth revisiting.
report = classification_report(y_test, classif)
print(report)
print(accuracy_score(y_test, classif))

              precision    recall  f1-score   support

           0       0.83      0.60      0.70     21858
           1       0.30      0.58      0.39      6374

   micro avg       0.60      0.60      0.60     28232
   macro avg       0.56      0.59      0.55     28232
weighted avg       0.71      0.60      0.63     28232

0.5969467271181638


In [7]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, LinearSVC

# Rank the features by recursive feature elimination: repeatedly fit a linear
# SVM and drop the weakest feature (step=1) until a single one remains, which
# yields a full ranking in rfe.ranking_.
#
# A kernel SVC(kernel="linear") was used here originally, but its training
# cost grows at least quadratically with the number of samples and the cell
# had to be interrupted on this data set. LinearSVC solves the same kind of
# linear max-margin problem with a solver that scales to this size and still
# exposes coef_, which RFE needs. dual=False is the recommended setting when
# there are more samples than features.
# NOTE(review): LinearSVC uses squared hinge loss by default, so the ranking
# may differ slightly from what the kernel SVC would have produced.
# NOTE(review): the ranking is fitted on the full data set, test rows
# included; consider fitting on X_train / y_train only if the selected
# features feed a model that is evaluated on X_test.
svc = LinearSVC(C=1, dual=False)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(data['x'], data['y'])


KeyboardInterrupt: 