#### Importing libraries and getting the required columns from database


In [None]:
import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import ttest_1samp
from sqlalchemy import create_engine

# Ask for the database password interactively so it is not hard-coded in the notebook.
password = getpass.getpass()

In [None]:
# Build the SQLAlchemy connection URL. The password is percent-encoded because
# characters such as '@', '/', ':' or '%' would otherwise be parsed as URL
# delimiters and break (or silently alter) the credentials.
# NOTE(review): the database is named 'energy_db' while the table queried is
# 'fires_clean' -- confirm this is the intended database.
connection_string = 'mysql+pymysql://root:' + quote(password, safe='') + '@localhost/energy_db'
engine = create_engine(connection_string)

# Fetch only the district, duration and reignition columns.
query = '''select district, duration, reignition
        from 
        fires_clean 
        '''

# Districts that are compared against the full dataset.
top5_districts = ['Viana do Castelo', 'Viseu', 'Bragança', 'Guarda', 'Vila Real']

dataset = pd.read_sql_query(query, engine)
# Rows whose district is one of the five listed above.
dataset_top5 = dataset[dataset['district'].isin(top5_districts)]

#### Can Spain be energy independent?
+ H0: Yes, it can --> Total Load Actual <= Total Clean Energy Generated
+ H1: No, it cannot --> Total Load Actual > Total Clean Energy Generated

#### Is it cheaper to produce and consume clean energy?
+ H0: Yes, it is --> Price for Clean Energy <= Price for Fossil Energy
+ H1: No, it is not --> Price for Clean Energy > Price for Fossil Energy

#### Hypothesis testing

In [None]:
# One-tailed (lower-tail) one-sample t-test:
# Null hypothesis or H0: mean number of re-ignitions in top 5 districts >= mean for portugal
# Alternative hyp or H1: mean number of re-ignitions in top 5 districts < mean for portugal

# we select a value for alpha of 0.05 (p-value threshold)
alpha = 0.05

test = dataset_top5['reignition']
mean = dataset['reignition'].mean()

# ttest_1samp returns the two-sided p-value; the sign of the statistic gives
# the direction of the difference (negative => sample mean below `mean`).
stat, pval = ttest_1samp(test, mean)

print('Stat: ', round(stat,2))
print('P-Val: ', round(pval,2),'\n')

# H0 is rejected in favour of H1 only when the statistic points in the H1
# direction (negative) AND the halved two-sided p-value is below alpha.
# Otherwise we fail to reject H0 and report the H0 statement.
if stat < 0 and pval / 2 < alpha:
    print('Mean number of re-ingitions in top 5 districts lower than Portugal')
else:
    print('Mean number of re-ingitions in top 5 districts equal or higher than Portugal')

In [None]:
# One-tailed (lower-tail) one-sample t-test:
# Null hypothesis or H0: mean number of fires in top 5 districts >= mean for portugal
# Alternative hyp or H1: mean number of fires in top 5 districts < mean for portugal

# we select a value for alpha of 0.05 (p-value threshold)
alpha = 0.05

# Sample: number of rows (fires) per district within the top-5 subset.
test = dataset_top5['district'].value_counts()
# Reference value: average number of rows per district over the whole dataset.
mean = dataset['district'].value_counts().mean()

# ttest_1samp returns the two-sided p-value; the sign of the statistic gives
# the direction of the difference (negative => sample mean below `mean`).
stat, pval = ttest_1samp(test, mean)

print('Stat: ', round(stat,2))
print('P-Val: ', round(pval,2),'\n')

# H0 is rejected in favour of H1 only when the statistic points in the H1
# direction (negative) AND the halved two-sided p-value is below alpha.
# Otherwise we fail to reject H0 and report the H0 statement.
if stat < 0 and pval / 2 < alpha:
    print('Mean number of fires in top 5 districts lower than Portugal')
else:
    print('Mean number of fires in top 5 districts equal or higher than Portugal')

#### Calculating mean values with confidence intervals


In [None]:
# Confidence interval (Student's t) for the mean of the 'duration' column
# over the whole dataset.
all_durations = dataset['duration']

confidence_level = .95
sample_mean = np.mean(all_durations)
sample_standard_error = scipy.stats.sem(all_durations)
# n - 1 degrees of freedom for a one-sample t interval
degrees_freedom = len(all_durations) - 1

confidence_interval = scipy.stats.t.interval(
    confidence_level,
    df=degrees_freedom,
    loc=sample_mean,
    scale=sample_standard_error,
)

lower_bound, upper_bound = confidence_interval
print( '95% confidence interval is ', round(lower_bound,2),'-', round(upper_bound,2))

In [None]:
# Confidence interval (Student's t) for the mean of the 'duration' column,
# restricted to the top-5 districts subset.
top5_durations = dataset_top5['duration']

confidence_level = .95
sample_mean = np.mean(top5_durations)
sample_standard_error = scipy.stats.sem(top5_durations)
# n - 1 degrees of freedom for a one-sample t interval
degrees_freedom = len(top5_durations) - 1

confidence_interval = scipy.stats.t.interval(
    confidence_level,
    df=degrees_freedom,
    loc=sample_mean,
    scale=sample_standard_error,
)

lower_bound_top5, upper_bound_top5 = confidence_interval
print( '95% confidence for top5 interval is ', round(lower_bound_top5,2),'-', round(upper_bound_top5,2))