In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import mysql.connector
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from pandas.core import datetools
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans

sns.set_style('darkgrid')
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = None

  from pandas.core import datetools


In [2]:
def create_connection():
    cnx2 = mysql.connector.connect(host='localhost',
                                   user='root', password='MyNewPass',
                                   database='Weather_Data')
    return cnx2

In [3]:
def select_data():
    conn = create_connection()
    cursor = conn.cursor()
    print('Extracting Data')
    query = "Select * from weather where Observation_date <= '2018-06-01 00:00:00';"
    cursor.execute(query, )
    result = cursor.fetchall()
    print('Extracted Data')
        
    cursor.close()
    conn.close()
    
    return result
    

In [4]:
def null_values(data):
    print(data.isnull().sum())

In [5]:
def change_structure(data):
    cols = ['date', 'rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir','height','latitude', 'longitude', 'station','county']
    data = pd.DataFrame(data, columns=cols)
    
    data.fillna(0, inplace=True)
    return data

In [6]:
def split_time(data):
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year.astype(np.uint32)
    data['month'] = data['date'].dt.month.astype(np.uint8)
    data['day'] = data['date'].dt.day.astype(np.uint8)
    data['hour'] = data['date'].dt.hour.astype(np.uint8)
    
    return data

In [7]:
def generate_season(data):
    data['season'] = pd.cut(data['month'], bins=[0,1,4,7,10,14], labels=['Winter','Spring','Summer','Autumn','Winter2']).str.replace('Winter2','Winter')
    data['season'] = data['season'].astype('category')
    return data

In [8]:
def convert_wdsp(data):
    data['wdsp'] = data['wdsp'] * 1.852
    
    return data

In [9]:
def update_rhum_values(data):
    print('Updating relative humidity values <= 0')
    rhum_mean = data['rhum'].mean()
    data.loc[data['rhum'] <= 0, 'rhum'] = rhum_mean
    
    return data

In [10]:
def update_wetb_values(data):
    print('Updating wet bulb air temperature values <= 0')
    wetb_mean = data['wetb'].mean()
    data.loc[data['wetb'] <= -40, 'wetb'] = wetb_mean
    
    return data

In [11]:
def update_dewpt_values(data):
    print('Updating dew point air temperature values <= 0')
    dewpt_mean = data['dewpt'].mean()
    data.loc[data['dewpt'] <= -20,'dewpt'] = dewpt_mean
    
    return data

In [12]:
def update_msl_values(data):
    print('Updating mean sea level pressure values <= 0')
    msl_mean = data['msl'].mean() 
    data.loc[data['msl'] < 940, 'msl'] = msl_mean
    
    return data

In [13]:
def update_vappr_values(data):
    print('Updating vapour pressure values <= 0')
    vappr_mean = data['vappr'].mean()
    data.loc[data['vappr'] <= 0, 'vappr'] = vappr_mean
    
    return data

In [14]:
def binarize_categories(data):
    print('Binarizing categorical data')
    
    categorical_columns = []

    for col in data.columns:
        if isinstance(data[col][0], str):
            print('Changing ', col, ' to categorical')
            categorical_columns.append(col)
            encoder = LabelBinarizer()
            data[col] = encoder.fit_transform(data[col])
    
    data = delete_original_categories(categorical_columns, data)
        
    return data

In [15]:
def delete_original_categories(categorical_columns, data):
    print('Deleting original categories')
    for col in categorical_columns:
        if col in data.columns:
            del(data[col])
            gc.collect()
    return data

In [16]:
def basic_numeric_analysis(data):
    cols = ['rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir']
    raw_data = data[cols]
    print(pd.DataFrame(raw_data.describe()))

In [17]:
def get_categorical_data(data):
    categorical_data = data[['station','county','season']]
    return categorical_data

In [18]:
def normalise_data(data):
    print('Normalising data')
    data = data.reset_index()
    data.drop(columns=['date','station','county','season'], inplace=True)
    column_names = data.columns
    scaler = StandardScaler()
    normalised_data = scaler.fit_transform(data)
    normalised_data = pd.DataFrame(normalised_data, columns=column_names)
    
    return [normalised_data,scaler,column_names]

In [19]:
def generate_clusters(data):
    clusters = KMeans(n_clusters=4, random_state=0).fit(data)
    return [clusters.cluster_centers_, clusters.labels_]

In [20]:
def recombine(normalised_data, cluster_labels, categorical_data, scaler, column_names):
    print(normalised_data.shape)
    data = scaler.inverse_transform(normalised_data)
    inverse_data = pd.DataFrame(data=data,
          index=np.array(range(0, 4039565)),
          columns=np.array(range(0, 17)))
    
    inverse_data.columns = column_names
    main_data = pd.merge(inverse_data, categorical_data, left_index=True, right_index=True)
    main_data['cluster'] = cluster_labels
    
    return main_data
    

In [30]:
def cluster_statistics(data):
    print(data['cluster'].value_counts(),'\n')
    
    cluster_0 = data[data['cluster']==0]
    cluster_1 = data[data['cluster']==1]
    cluster_2 = data[data['cluster']==2]
    cluster_3 = data[data['cluster']==3]

    print('****************** Cluster 0 Statistics *************************\n')
    print('Cluster 0 Seasons \n')
    print(cluster_0['season'].value_counts(), '\n')
    print('Cluster 0 Months \n')
    print(cluster_0['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_0)
    print('\n')
    
    print('****************** Cluster 1 Statistics *************************\n')
    print('Cluster 1 Seasons \n')
    print(cluster_1['season'].value_counts(), '\n')
    print('Cluster 1 Months \n')
    print(cluster_1['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_1)
    print('\n')
    
    print('****************** Cluster 2 Statistics *************************\n')
    print('Cluster 2 Seasons \n')
    print(cluster_2['season'].value_counts(), '\n')
    print('Cluster 2 Months \n')
    print(cluster_2['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_2)
    print('\n')
    
    print('****************** Cluster 3 Statistics *************************\n')
    print('Cluster 3 Seasons \n')
    print(cluster_3['season'].value_counts(), '\n')
    print('Cluster 3 Months \n')
    print(cluster_3['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_3)
    print('\n')

In [22]:
data = select_data()
data = change_structure(data)
data = split_time(data)
data = generate_season(data)
data = convert_wdsp(data)
#data = binarize_categories(data)

data = update_rhum_values(data)
data = update_wetb_values(data)
data = update_dewpt_values(data)
data = update_msl_values(data)
data = update_vappr_values(data)


Extracting Data
Extracted Data
Updating relative humidity values <= 0
Updating wet bulb air temperature values <= 0
Updating dew point air temperature values <= 0
Updating mean sea level pressure values <= 0
Updating vapour pressure values <= 0


In [23]:
categorical_data = get_categorical_data(data)
normalised_data = normalise_data(data)
%time cluster_data = generate_clusters(normalised_data[0])

Normalising data
CPU times: user 8min 50s, sys: 47 s, total: 9min 37s
Wall time: 3min 7s


In [24]:
data = recombine(normalised_data[0], cluster_data[1], categorical_data, normalised_data[1],normalised_data[2])

(4039565, 17)


In [31]:
cluster_statistics(data)

3    1224704
2    1120163
0     887594
1     807104
Name: cluster, dtype: int64 

****************** Cluster 0 Statistics *************************

Cluster 0 Seasons 

Autumn    324214
Summer    313045
Winter    139218
Spring    111117
Name: season, dtype: int64 

Cluster 0 Months 

8.00     115958
7.00     115188
9.00     109219
6.00     107645
10.00     99037
5.00      90212
11.00     63530
4.00      51754
12.00     49108
3.00      35919
1.00      26580
2.00      23444
Name: month, dtype: int64 

            rain       temp       wetb      dewpt      vappr       rhum  \
count 887,594.00 887,594.00 887,594.00 887,594.00 887,594.00 887,594.00   
mean        0.12      12.97      11.62      10.36      12.72      84.84   
std         0.51       3.02       2.54       2.72       2.36      11.38   
min        -0.00       0.00      -0.00       0.00       6.10      24.00   
25%        -0.00      10.70       9.70       8.30      10.90      79.00   
50%        -0.00      12.80      11.40      1