### Parameters

**wetb**: Wet bulb Air Temperature(°C) is used to measure the extent of cooling as moisture dries on a surface.

**dewpt**: Dew point Air Temperature(°C) is used to determine the temperature at which the air can no longer retain moisture. This figure should always be less than or equal to the value of the actual temperature. If air temperature cools to dew point, or the dew point rises to the current temperature then fog or clouds appear. Sourced: http://www.weatherquestions.com/What_is_dewpoint_temperature.htm

**vappr**: Vapour Pressure(hpa) 

**rhum**: Relative Humidity	describes how close the air is to saturation with moisture. Therefore a high relative humidity (100%) indicates theres more moisture in the air.


**msl**: Mean Sea Level Pressure is the atmospheric pressure recorded at sea.

#### Recurrent Neural Networks.
One of the best neural network architectures for predicting "the future", in this case future weather forecasts. Work quite well with time series data which is what I have here.

Suffer from vanishing/exploding gradients. LSTM and GRU cells used to combat this.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import mysql.connector
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from pandas.core import datetools
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

sns.set_style('darkgrid')
pd.options.display.float_format = '{:,.2f}'.format

  # Remove the CWD from sys.path while we load stuff.


In [2]:
def create_connection():
    cnx2 = mysql.connector.connect(host='localhost',
                                   user='root', password='MyNewPass',
                                   database='Weather_Data')
    return cnx2

In [3]:
def select_data():
    conn = create_connection()
    cursor = conn.cursor()
    print('Extracting Data')
    query = "Select * from weather;"
    cursor.execute(query, )
    result = cursor.fetchall()
    print('Extracted Data')
        
    cursor.close()
    conn.close()
    
    return result
    

In [4]:
def change_structure(data):
    cols = ['date', 'rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir','height','latitude', 'longitude', 'station','county']
    data = pd.DataFrame(data, columns=cols)
    
    data.fillna(0, inplace=True)
    return data

In [5]:
def split_time(data):
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month.astype(np.uint8)
    data['day'] = data['date'].dt.day.astype(np.uint8)
    data['hour'] = data['date'].dt.hour.astype(np.uint8)
    
    return data

In [6]:
def generate_season(data):
    data['season'] = pd.cut(data['month'], bins=[2,5,8,11], labels=['Spring','Summer','Autumn','Winter'])
    return data

In [7]:
def mean_features(data):
    # mean rain, temp, windsp, windirection by year, by season
    group = ['year', 'season','month','station']
    features = ['rain', 'temp', 'wdsp', 'wddir']
    
    for grouping in group:
        for predictor in features:
            print('Grouping by ',grouping, ' getting average ', predictor )
            
            new_var = predictor + '_by_' + grouping
            temp = data.groupby(grouping)[predictor].mean().reset_index() 
            temp.columns = [grouping, new_var]
            data = data.merge(temp, how='left', on=grouping)
    
    return data

In [8]:
def plot_hour_vs_rain(data):    
    sns.lmplot(x='hour', y='rain', data = data, fit_reg=True, scatter_kws={"alpha":0.25, 's':.2}, line_kws={"color":"r"})
    plt.title('Hour vs. Mean Rainfall(mm) Including Regression Line')

In [9]:
def plot_hour_vs_temp(data):    
    sns.lmplot(x='hour', y='temp', data = data, fit_reg=True,scatter_kws={"alpha":0.25, 's':.2}, line_kws={"color":"r"})
    plt.title('Hour vs. Mean Temperature(°C) Including Regression Line')

In [10]:
def plot_hour_vs_wdsp(data):    
    sns.lmplot(x='hour', y='wdsp', data = data, fit_reg=True,scatter_kws={"alpha":0.25, 's':.2}, line_kws={"color":"r"})
    plt.title('Hour vs. Mean Wind Speed(KT) Including Regression Line')

In [11]:
def plot_by_year(data):
    data = data.sort_values('year')
    
    plt.figure(figsize=(15,15))
    plt.suptitle('Yearly Averages')
    
    plt.subplot(221)
    plt.plot('year', 'rain_by_year', data=data, color="orange")
    plt.xticks(np.arange(data['year'].min(), data['year'].max()) )
    plt.xlabel('Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Rainfall by Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(222)
    plt.plot('year', 'temp_by_year', data=data, color="green")
    plt.xticks(np.arange(data['year'].min(), data['year'].max()) )
    plt.xlabel('Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Temperature by Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(223)
    plt.plot('year', 'wdsp_by_year', data=data, color="blue")
    plt.xticks(np.arange(data['year'].min(), data['year'].max()) )
    plt.xlabel('Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Windspeed by Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(224)
    plt.plot('year', 'wddir_by_year', data=data, color="red")
    plt.xticks(np.arange(data['year'].min(), data['year'].max()) )
    plt.xlabel('Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Wind Direction by Year', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.savefig('Yearly Averages.png')

    
    plt.show()

In [12]:
def plot_by_season(data):
    data['season'].astype('category')
    data = data.sort_values('season')
    
    plt.figure(figsize=(15,15))
    plt.suptitle('Season Averages')
    
    plt.subplot(221)
    plt.bar('season','rain_by_season', data=data ,color="orange")
    plt.xlabel('Season', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Rainfall by Season', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(222)
    plt.bar('season', 'temp_by_season', data=data, color="green")
    plt.xlabel('Season', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Temperature by Season', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(223)
    plt.bar('season', 'wdsp_by_season', data=data, color="blue")
    plt.xlabel('Season', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Windspeed by Season', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(224)
    plt.bar('season', 'wddir_by_season', data=data, color="red")
    plt.xlabel('Season', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Wind Direction by Season', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.savefig('Seasonal Averages.png')
    
    plt.show()

In [13]:
def plot_by_station(data):
    data['station'].astype('category')
    data = data.sort_values('station')
    
    plt.figure(figsize=(18,18))
    plt.suptitle('Station Averages')
    
    plt.subplot(221)
    plt.plot('station', 'rain_by_station', data=data, color="orange")
    plt.xlabel('Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Rainfall by Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(222)
    plt.plot('station', 'temp_by_station', data=data, color="green")
    plt.xlabel('Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Temperature by Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(223)
    plt.plot('station', 'wdsp_by_station', data=data, color="blue")
    plt.xlabel('Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Windspeed by Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.subplot(224)
    plt.plot('station', 'wddir_by_station', data=data, color="red")
    plt.xlabel('Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Wind Direction by Station', fontweight='bold', color = 'orange', fontsize='14')
    plt.xticks(rotation=75)
    
    plt.savefig('Station Averages.png')
    
    plt.show()

In [14]:
def plot_by_month(data):
    data['month'].astype('category')
    data = data.sort_values('month')
    
    plt.figure(figsize=(15,15))
    plt.suptitle('Station Averages')
    
    plt.subplot(221)
    plt.plot('month', 'rain_by_month', data=data, color="orange")
    plt.xlabel('Month', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Rainfall by Month', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(222)
    plt.plot('month', 'temp_by_month', data=data, color="green")
    plt.xlabel('Month', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Temperature by Month', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(223)
    plt.plot('month', 'wdsp_by_month', data=data, color="blue")
    plt.xlabel('Month', fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Windspeed by Month', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.subplot(224)
    plt.plot('month', 'wddir_by_month', data=data, color="red")
    plt.xlabel('Month',fontweight='bold', color = 'orange', fontsize='14')
    plt.ylabel('Average Wind Direction by Month', fontweight='bold', color = 'orange', fontsize='14')
    
    plt.savefig('Monthly Averages.png')
    
    plt.show()

In [15]:
def create_sets(data):
    max_index = len(data)
    
    start_train = 0
    end_train = round(max_index//2)

    start_validation = end_train + 1
    end_validation = round(start_validation + (max_index - end_train) // 2)

    start_test = end_validation + 1
    end_test = max_index

    train = data[start_train:end_train]
    validation = data[start_validation:end_validation]
    test = data[start_test:end_test]
    
    return train, validation, test

In [16]:
def create_linear_models(data):
    data = data.sort_values('date')
    targets = ['wdsp','wddir','rain','temp']
    
    del(data['station'])
    del(data['county'])
    del(data['date'])
    
    for target in targets:
        temp = data
        
        #Scale data - MinMax scaler scales between 0 and 1. Means now have lowers standard deviations so effects of outliers are minimised
        #StandardScalers scales data so it has mean 0 and standard deviation of 1
        
        scaler = StandardScaler()
        temp = scaler.fit_transform(temp)

        # Split into train ,validation and test sets
        train, validation, test = create_sets(temp)
        temp = pd.DataFrame(temp)
       
        train_target, validation_target, test_target = create_sets(temp[32])

        del(temp[32])
        
        print('Performing linear regression on ',target)

        reg = LinearRegression().fit(train,train_target)
        target_predictions = reg.predict(validation)
        
        
        
        #print("Mean squared error: ", mean_squared_error(validation_target, target_predictions))
        print('Variance score: ', r2_score(validation_target, target_predictions))
        
        print('Performing Random forest regression on ',target)

        reg = RandomForestRegressor().fit(train,train_target)
        target_predictions = reg.predict(validation)
        
        
        
        print('Variance score: ', r2_score(validation_target, target_predictions))
        
        #print('Performing MLP regression on ',target)

        #reg = MLPRegressor().fit(train,train_target)
        #target_predictions = reg.predict(validation)
        
        
        
        #print('Variance score: ', r2_score(validation_target, target_predictions))
    
    

In [17]:
def get_vif(data):
    #https://etav.github.io/python/vif_factor_python.html
    
    data = data._get_numeric_data() 
    print(data.columns)
    targets = ['wdsp','temp','wddir','rain']
    for target in targets:
        features = '+'.join(data.loc[:, data.columns != target].columns)
        print('Performing Regression on: ', target)
        y, X = dmatrices(target + ' ~ ' + features, data, return_type='dataframe')
        print('Finished Regression on: ', target)
        
        #X represents the coefficients, y represents the output 
        
        
        #Cant calculate variance inflation factor as R2 is 1. Vif formula is 1/1-R2 which means its trying to divide by 0
        vif = pd.DataFrame()
        vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vif["features"] = X.columns
        print(vif)
    

In [18]:
data = select_data()
data = change_structure(data)

Extracting Data
Extracted Data


In [19]:
data = split_time(data)

In [20]:
data = generate_season(data)

ValueError: Bin labels must be one fewer than the number of bin edges

In [None]:
data = mean_features(data)

In [None]:
%time plot_hour_vs_rain(data)

In [None]:
%time plot_hour_vs_temp(data)

In [None]:
%time plot_hour_vs_wdsp(data)

In [None]:
%time plot_by_year(data)

In [None]:
#%time plot_by_season(data)

In [None]:
%time plot_by_month(data)

In [None]:
%time plot_by_station(data)

In [None]:
#create_linear_models(data)

In [None]:
#get_vif(data)

In [None]:
plt.figure(figsize=(30,30))
plt.title('Feature Correlation', fontsize=20)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(data.corr(),square=True, cmap=cmap, linewidths=.5, cbar_kws={"shrink": .5})
plt.savefig('Correlation.png')
plt.show()