# Basic Exploratory Data Analysis On Entire Dataset

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import mysql.connector
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from pandas.core import datetools
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans

# Notebook-wide display defaults: seaborn's dark grid theme for plots,
# two-decimal thousands-separated floats, and no cap on displayed columns.
sns.set_style('darkgrid')
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', None)

  from pandas.core import datetools


In [2]:
def create_connection(host=None, user=None, password=None, database=None):
    """Open a MySQL connection to the weather database.

    Every argument is optional, so existing ``create_connection()`` calls
    keep working. Values are resolved in this order:

    * host:     argument, else ``$WEATHER_DB_HOST``, else ``'localhost'``
    * user:     argument, else ``$WEATHER_DB_USER``, else ``'root'``
    * database: argument, else ``$WEATHER_DB_NAME``, else ``'Weather_Data'``
    * password: argument, else ``$WEATHER_DB_PASSWORD``, else an interactive
      prompt. The password is deliberately never stored in the notebook.

    Returns:
        The connection object produced by ``mysql.connector.connect``.
        The caller is responsible for closing it.
    """
    # Local imports keep this cell self-contained without touching the
    # notebook's import cell.
    import getpass
    import os

    if password is None:
        password = os.environ.get('WEATHER_DB_PASSWORD')
    if password is None:
        # Last resort: ask the user instead of falling back to a secret
        # committed alongside the code.
        password = getpass.getpass('MySQL password: ')

    return mysql.connector.connect(
        host=host or os.environ.get('WEATHER_DB_HOST', 'localhost'),
        user=user or os.environ.get('WEATHER_DB_USER', 'root'),
        password=password,
        database=database or os.environ.get('WEATHER_DB_NAME', 'Weather_Data'),
    )

In [3]:
def select_data(cutoff='2018-06-01 00:00:00'):
    """Fetch every row of the ``weather`` table observed up to ``cutoff``.

    Args:
        cutoff: Inclusive upper bound compared against ``Observation_date``.
            Defaults to the value the notebook originally hard-coded.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query (all matching
        rows, with every column of the table).

    The cursor and the connection are closed even when the query or the
    fetch raises, so a failed run does not leave a connection open.
    """
    # The cutoff is bound as a parameter rather than spliced into the SQL.
    query = "Select * from weather where Observation_date <= %s;"

    conn = create_connection()
    try:
        cursor = conn.cursor()
        try:
            print('Extracting Data')
            cursor.execute(query, (cutoff,))
            result = cursor.fetchall()
            print('Extracted Data')
        finally:
            cursor.close()
    finally:
        conn.close()

    return result
    

In [4]:
def null_values(data):
    """Print, for each column of ``data``, how many entries are missing."""
    missing_per_column = data.isnull().sum()
    print(missing_per_column)

In [5]:
def change_structure(data):
    """Wrap raw query rows in a labelled DataFrame and zero-fill the gaps.

    Args:
        data: Row records accepted by ``pd.DataFrame``; each row must supply
            one value per name in ``column_names`` below, in that order.

    Returns:
        A DataFrame with those column names. Per-column missing-value counts
        are printed first, then every missing entry, in every column, is
        replaced by 0.
    """
    column_names = [
        'date', 'rain', 'temp', 'wetb', 'dewpt',
        'vappr', 'rhum', 'msl', 'wdsp', 'wddir',
        'height', 'latitude', 'longitude', 'station', 'county',
    ]
    frame = pd.DataFrame(data, columns=column_names)

    # Report the gaps before they are overwritten.
    null_values(frame)
    frame.fillna(0, inplace=True)
    return frame

In [6]:
def split_time(data):
    """Parse ``data['date']`` and add year/month/day/hour columns.

    The frame is modified in place and the same object is returned. ``date``
    is converted with ``pd.to_datetime``; ``year`` is stored as uint16 and
    ``month``, ``day`` and ``hour`` as uint8.
    """
    stamps = pd.to_datetime(data['date'])
    data['date'] = stamps

    # (datetime field that is also the new column name, storage dtype)
    calendar_parts = (
        ('year', np.uint16),
        ('month', np.uint8),
        ('day', np.uint8),
        ('hour', np.uint8),
    )
    for field, dtype in calendar_parts:
        data[field] = getattr(stamps.dt, field).astype(dtype)

    return data

In [7]:
def aggregate_data(data):
    """Average the numeric columns of ``data`` into one row per clock hour.

    Args:
        data: DataFrame with a datetime ``date`` column.

    Returns:
        A DataFrame indexed by the hourly ``date`` bins, holding the mean of
        each numeric column over all rows in that hour. Non-numeric columns
        are left out, and any hour with a missing mean (for example an hour
        with no rows at all) is dropped.
    """
    print('Aggregating Data')
    hourly = (
        data.set_index('date')
        # Lowercase 'h' is the hourly alias that current pandas accepts
        # without a deprecation warning.
        .groupby(pd.Grouper(freq='h'))
        # numeric_only=True: text columns cannot be averaged, and pandas 2.x
        # raises TypeError on them instead of silently dropping them.
        .mean(numeric_only=True)
        .dropna()
    )

    print(len(hourly),' observations available for training')
    return hourly

In [8]:
# Full load pipeline, innermost call first:
# query the rows -> label and zero-fill -> add calendar columns -> hourly means.
data = aggregate_data(split_time(change_structure(select_data())))

Extracting Data
Extracted Data
date         0
rain         0
temp         0
wetb         0
dewpt        0
vappr        0
rhum         0
msl          0
wdsp         0
wddir        0
height       0
latitude     0
longitude    0
station      0
county       0
dtype: int64
Aggregating Data
266617  observations available for training


In [9]:
# Alias, not a copy: `temp` and `data` now refer to the same DataFrame.
temp = data

In [10]:
grouping = ['year','month','day']
cols = ['rain']

# Per calendar day, the smallest and largest hourly value of each column in
# `cols`. This reads from `data` rather than from `temp`, so the cell can be
# re-run safely: `temp` is overwritten with the daily summary, which no
# longer has the year/month/day columns needed for the groupby.
temp = data.groupby(grouping).agg({col: ['min', 'max'] for col in cols})

# Keep only the days whose largest hourly rain value reached 0.1 or more.
t = temp[temp[('rain','max')] >= .1]
t

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rain,rain
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max
year,month,day,Unnamed: 3_level_2,Unnamed: 4_level_2
1988,1,1,0.00,0.73
1988,1,2,0.09,0.82
1988,1,3,0.00,0.84
1988,1,4,0.00,0.67
1988,1,5,0.00,1.36
1988,1,6,0.00,0.96
1988,1,7,0.00,0.72
1988,1,8,0.00,0.84
1988,1,9,0.00,0.96
1988,1,10,0.00,0.42


In [16]:
# Rows of `temp` whose ('rain', 'max') value is 2 or more.
temp.loc[temp[('rain', 'max')].ge(2)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rain,rain
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max
year,month,day,Unnamed: 3_level_2,Unnamed: 4_level_2
1988,1,18,0.00,2.86
1988,7,12,0.00,2.40
1988,7,28,0.00,3.20
1988,8,8,0.00,2.68
1988,8,13,0.00,2.02
1988,8,31,0.00,2.22
1988,10,11,0.00,3.26
1988,10,21,0.16,2.03
1989,6,9,0.24,2.18
1989,8,14,0.00,2.12


In [17]:
# Point `temp` back at the hourly frame (an alias, not a copy), then show
# the rows for 24 January 2018.
temp = data
temp.loc[
    (temp['year'] == 2018)
    & (temp['month'] == 1)
    & (temp['day'] == 24)
]

Unnamed: 0_level_0,rain,temp,wetb,dewpt,vappr,rhum,msl,wdsp,wddir,height,latitude,longitude,year,month,day,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-01-24 00:00:00,1.03,10.37,9.54,8.68,11.25,89.0,992.89,23.61,190.0,59.87,53.21,-8.2,2018,1,24,0
2018-01-24 01:00:00,1.85,10.34,9.48,8.61,11.21,88.7,990.91,24.09,199.57,59.87,53.21,-8.2,2018,1,24,1
2018-01-24 02:00:00,2.27,9.46,8.63,7.69,10.55,88.52,990.1,24.13,207.83,59.87,53.21,-8.2,2018,1,24,2
2018-01-24 03:00:00,1.84,8.64,7.6,6.33,9.64,85.17,990.2,23.87,219.57,59.87,53.21,-8.2,2018,1,24,3
2018-01-24 04:00:00,1.14,7.74,6.37,4.56,8.51,80.39,990.62,23.7,230.87,59.87,53.21,-8.2,2018,1,24,4
2018-01-24 05:00:00,0.59,7.17,5.68,3.66,7.96,78.17,991.17,22.0,232.61,59.87,53.21,-8.2,2018,1,24,5
2018-01-24 06:00:00,0.16,6.7,5.31,3.42,7.83,79.39,991.5,19.17,228.26,59.87,53.21,-8.2,2018,1,24,6
2018-01-24 07:00:00,0.1,6.78,5.4,3.56,7.9,79.65,991.75,17.83,225.22,59.87,53.21,-8.2,2018,1,24,7
2018-01-24 08:00:00,0.24,6.55,5.28,3.56,7.88,80.91,992.08,17.83,223.48,59.87,53.21,-8.2,2018,1,24,8
2018-01-24 09:00:00,0.2,6.69,5.33,3.46,7.83,79.61,992.61,17.22,223.91,59.87,53.21,-8.2,2018,1,24,9
