In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import mysql.connector
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from pandas.core import datetools
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans

sns.set_style('darkgrid')
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = None

  from pandas.core import datetools


In [2]:
def create_connection():
    cnx2 = mysql.connector.connect(host='localhost',
                                   user='root', password='MyNewPass',
                                   database='Weather_Data')
    return cnx2

In [3]:
def select_data():
    conn = create_connection()
    cursor = conn.cursor()
    print('Extracting Data')
    query = "Select * from weather where Observation_date <= '2018-06-01 00:00:00';"
    cursor.execute(query, )
    result = cursor.fetchall()
    print('Extracted Data')
        
    cursor.close()
    conn.close()
    
    return result
    

In [4]:
def null_values(data):
    print(data.isnull().sum())

In [5]:
def change_structure(data):
    cols = ['date', 'rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir','height','latitude', 'longitude', 'station','county']
    data = pd.DataFrame(data, columns=cols)
    
    data.fillna(0, inplace=True)
    return data

In [6]:
def split_time(data):
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year.astype(np.uint32)
    data['month'] = data['date'].dt.month.astype(np.uint8)
    data['day'] = data['date'].dt.day.astype(np.uint8)
    data['hour'] = data['date'].dt.hour.astype(np.uint8)
    
    return data

In [7]:
def generate_season(data):
    data['season'] = pd.cut(data['month'], bins=[0,1,4,7,10,14], labels=['Winter','Spring','Summer','Autumn','Winter2']).str.replace('Winter2','Winter')
    data['season'] = data['season'].astype('category')
    return data

In [8]:
def convert_wdsp(data):
    data['wdsp'] = data['wdsp'] * 1.852
    
    return data

In [9]:
def update_rhum_values(data):
    print('Updating relative humidity values <= 0')
    rhum_mean = data['rhum'].mean()
    data.loc[data['rhum'] <= 0, 'rhum'] = rhum_mean
    
    return data

In [10]:
def update_wetb_values(data):
    print('Updating wet bulb air temperature values <= 0')
    wetb_mean = data['wetb'].mean()
    data.loc[data['wetb'] <= -40, 'wetb'] = wetb_mean
    
    return data

In [11]:
def update_dewpt_values(data):
    print('Updating dew point air temperature values <= 0')
    dewpt_mean = data['dewpt'].mean()
    data.loc[data['dewpt'] <= -20,'dewpt'] = dewpt_mean
    
    return data

In [12]:
def update_msl_values(data):
    print('Updating mean sea level pressure values <= 0')
    msl_mean = data['msl'].mean() 
    data.loc[data['msl'] < 940, 'msl'] = msl_mean
    
    return data

In [13]:
def update_vappr_values(data):
    print('Updating vapour pressure values <= 0')
    vappr_mean = data['vappr'].mean()
    data.loc[data['vappr'] <= 0, 'vappr'] = vappr_mean
    
    return data

In [14]:
def binarize_categories(data):
    print('Binarizing categorical data')
    
    categorical_columns = []

    for col in data.columns:
        if isinstance(data[col][0], str):
            print('Changing ', col, ' to categorical')
            categorical_columns.append(col)
            encoder = LabelBinarizer()
            data[col] = encoder.fit_transform(data[col])
    
    data = delete_original_categories(categorical_columns, data)
        
    return data

In [15]:
def delete_original_categories(categorical_columns, data):
    print('Deleting original categories')
    for col in categorical_columns:
        if col in data.columns:
            del(data[col])
            gc.collect()
    return data

In [16]:
def get_seasons(data):   
    return data['season']

In [17]:
def basic_numeric_analysis(data):
    cols = ['rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir']
    raw_data = data[cols]
    print(pd.DataFrame(raw_data.describe()))

In [18]:
def get_cluster_centers(data, seasons):
    print('Generating cluster centers')
    data['season'] = seasons
    
    centers = []    
    weather_seasons = ['Winter','Summer','Spring','Autumn']
    
    for season in weather_seasons:
        centers.append(data[data['season']== season].mean(axis=0, numeric_only=True))
        
    return np.array(centers)
    

In [19]:
def get_categorical_data(data):
    print('Getting categorical data')
    categorical_data = data[['station','county','season','month']]
    return categorical_data

In [20]:
def normalise_data(data):
    print('Normalising data')
    data = data.reset_index()
    data.drop(columns=['date','station','county','season','index','height','longitude', 'latitude','year','month','day','hour'], inplace=True)
    column_names = data.columns
    scaler = StandardScaler()
    normalised_data = scaler.fit_transform(data)
    normalised_data = pd.DataFrame(normalised_data, columns=column_names)
    
    print(column_names)
    
    return [normalised_data,scaler,column_names]

In [21]:
def generate_clusters(data):
    data.drop(columns=['season'], inplace=True)
    clusters = KMeans(n_clusters=4, random_state=0).fit(data)
    return [clusters.cluster_centers_, clusters.labels_]

In [22]:
def generate_centered_clusters(data, centers):
    #data.drop(columns=['season'], inplace=True)
    clusters = KMeans(n_clusters=4, random_state=0, init=centers).fit(data)
    return [clusters.cluster_centers_, clusters.labels_]

In [23]:
def recombine(normalised_data, cluster_labels, categorical_data, scaler, column_names, centers):
    print(normalised_data.shape)
    data = scaler.inverse_transform(normalised_data)
    inverse_data = pd.DataFrame(data=data,
          index=np.array(range(0, 4039565)),
          columns=np.array(range(0, 9)))
    
    centers = scaler.inverse_transform(centers)
    inverse_centers = pd.DataFrame(data=centers,
          index=np.array(range(0, 4)),
          columns=np.array(range(0, 9)))
    
    inverse_centers.columns = column_names
    inverse_data.columns = column_names
    main_data = pd.merge(inverse_data, categorical_data, left_index=True, right_index=True)
    main_data['cluster'] = cluster_labels
    
 
    
    return [main_data,inverse_centers]
    

In [24]:
def cluster_statistics(data):
    centers = data[1]
    
    print('Cluster centers \n')
    print(centers, '\n')
    
    data = data[0]
    print(data['cluster'].value_counts(),'\n')
    
    cluster_0 = data[data['cluster']==0]
    cluster_1 = data[data['cluster']==1]
    cluster_2 = data[data['cluster']==2]
    cluster_3 = data[data['cluster']==3]

    print('****************** Cluster 0 Statistics *************************\n')
    print('Cluster 0 Seasons \n')
    print(cluster_0['season'].value_counts(), '\n')
    print('Cluster 0 Months \n')
    print(cluster_0['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_0)
    print('\n')
    
    print('****************** Cluster 1 Statistics *************************\n')
    print('Cluster 1 Seasons \n')
    print(cluster_1['season'].value_counts(), '\n')
    print('Cluster 1 Months \n')
    print(cluster_1['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_1)
    print('\n')
    
    print('****************** Cluster 2 Statistics *************************\n')
    print('Cluster 2 Seasons \n')
    print(cluster_2['season'].value_counts(), '\n')
    print('Cluster 2 Months \n')
    print(cluster_2['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_2)
    print('\n')
    
    print('****************** Cluster 3 Statistics *************************\n')
    print('Cluster 3 Seasons \n')
    print(cluster_3['season'].value_counts(), '\n')
    print('Cluster 3 Months \n')
    print(cluster_3['month'].value_counts(), '\n')
    basic_numeric_analysis(cluster_3)
    print('\n')

In [25]:
data = select_data()
data = change_structure(data)
data = split_time(data)
data = generate_season(data)
data = convert_wdsp(data)
#data = binarize_categories(data)

data = update_rhum_values(data)
data = update_wetb_values(data)
data = update_dewpt_values(data)
data = update_msl_values(data)
data = update_vappr_values(data)


Extracting Data
Extracted Data
Updating relative humidity values <= 0
Updating wet bulb air temperature values <= 0
Updating dew point air temperature values <= 0
Updating mean sea level pressure values <= 0
Updating vapour pressure values <= 0


In [26]:
categorical_data = get_categorical_data(data)
seasons = get_seasons(data)
normalised_data = normalise_data(data)
centers = get_cluster_centers(normalised_data[0], categorical_data['season'])

Getting categorical data
Normalising data
Index(['rain', 'temp', 'wetb', 'dewpt', 'vappr', 'rhum', 'msl', 'wdsp',
       'wddir'],
      dtype='object')
Generating cluster centers


In [27]:
%time cluster_data = generate_clusters(normalised_data[0])
%time cluster_data_centered = generate_centered_clusters(normalised_data[0], centers)

CPU times: user 1min 46s, sys: 26.1 s, total: 2min 12s
Wall time: 1min 54s


  return_n_iter=True)


CPU times: user 11.6 s, sys: 3.95 s, total: 15.6 s
Wall time: 15.6 s


In [28]:
data = recombine(normalised_data[0], cluster_data[1], categorical_data, normalised_data[1],normalised_data[2], cluster_data[0])

centered_data = recombine(normalised_data[0], cluster_data_centered[1], categorical_data, normalised_data[1],normalised_data[2], cluster_data_centered[0])

(4039565, 9)
(4039565, 9)


In [29]:
cluster_statistics(data)

Cluster centers 

   rain  temp  wetb  dewpt  vappr  rhum      msl  wdsp  wddir
0  0.04  4.70  3.88   2.63   7.56 86.83 1,016.81 11.69 176.26
1  0.07  9.44  7.72   5.64   9.25 77.95 1,008.02 27.02 232.63
2  2.33 10.14  9.71   9.26  11.93 94.10 1,000.44 25.76 183.55
3  0.06 13.83 12.47  11.28  13.45 85.28 1,015.27 15.66 193.98 

3    1670484
1    1127006
0    1123585
2     118490
Name: cluster, dtype: int64 

****************** Cluster 0 Statistics *************************

Cluster 0 Seasons 

Winter    485688
Spring    473410
Autumn     86877
Summer     77610
Name: season, dtype: int64 

Cluster 0 Months 

1     182468
12    178235
2     177193
3     172989
11    124985
4     123228
10     60610
5      57782
9      20539
6      15598
8       5728
7       4230
Name: month, dtype: int64 

              rain         temp         wetb        dewpt        vappr  \
count 1,123,585.00 1,123,585.00 1,123,585.00 1,123,585.00 1,123,585.00   
mean          0.04         4.70         3.88         

In [30]:
cluster_statistics(centered_data)

Cluster centers 

   rain  temp  wetb  dewpt  vappr  rhum      msl  wdsp  wddir
0  0.55  8.95  7.94   6.72  10.01 86.24   998.14 32.54 211.83
1  0.02 10.42  8.87   7.15  10.19 81.07 1,018.78 15.36 207.61
2  0.03  3.98  3.02   1.52   6.94 84.62 1,014.49 14.15 188.89
3  0.09 14.91 13.50  12.35  14.38 85.36 1,013.94 16.53 192.63 

1    1312963
3    1156733
2     955590
0     614279
Name: cluster, dtype: int64 

****************** Cluster 0 Statistics *************************

Cluster 0 Seasons 

Winter    256636
Spring    193775
Autumn     92866
Summer     71002
Name: season, dtype: int64 

Cluster 0 Months 

1     94197
12    82310
11    80129
2     70891
3     68244
10    60470
4     54640
5     42007
9     23472
6     19032
7      9963
8      8924
Name: month, dtype: int64 

            rain       temp       wetb      dewpt      vappr       rhum  \
count 614,279.00 614,279.00 614,279.00 614,279.00 614,279.00 614,279.00   
mean        0.55       8.96       7.94       6.73      10.01   