# Deducing the concentration of PM2.5 based on AOD and weather, etc. with Machine Learning Models

> Note: This notebook is under development.

In [1]:
from osgeo import gdal

In [2]:
import sklearn

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
import math

In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [32]:
import csv

## Importing Raster Data
Raster Image I/O

In [6]:
class Grid(object):
    @staticmethod
    def read_img(_file):
        dataset = gdal.Open(_file)
        # 数据描述
        # print(dataset.GetDescription())

        # 图像的列数X与行数Y
        img_width = dataset.RasterXSize
        img_height = dataset.RasterYSize

        # 仿射矩阵
        img_geotrans = dataset.GetGeoTransform()

        # 投影
        img_proj = dataset.GetProjection()

        # 将数据写成数组，对应栅格矩阵
        img_data = dataset.ReadAsArray(0, 0, img_width, img_height)

        # 数据格式大小
        # print(img_data.shape)

        del dataset
        return img_data, img_proj, img_geotrans

    @staticmethod
    def write_img(_file, img_data, img_proj, img_geotrans, _format):
        # 判断栅格数据的数据类型
        if 'int8' in img_data.dtype.name:
            datatype = gdal.GDT_Byte
        elif 'int16' in img_data.dtype.name:
            datatype = gdal.GDT_UInt16
        else:
            datatype = gdal.GDT_Float32

        # 判读数组维数
        if len(img_data.shape) == 3:
            img_bands, img_height, img_width = img_data.shape
        else:
            img_bands, (img_height, img_width) = 1, img_data.shape

        # 创建文件
        # HFA -> .img | GTiff -> .tif
        if _format == 'tif':
            driver = gdal.GetDriverByName("GTiff")
        else:
            driver = gdal.GetDriverByName("HFA")

        dataset = driver.Create(_file, img_width, img_height, img_bands, datatype)

        # 写入仿射变换参数
        dataset.SetGeoTransform(img_geotrans)
        # 写入投影
        dataset.SetProjection(img_proj)
        # 写入数组数据
        # GetRasterBand()
        if img_bands == 1:
            dataset.GetRasterBand(1).WriteArray(img_data)
        else:
            for i in range(img_bands):
                dataset.GetRasterBand(i + 1).WriteArray(img_data[i])

        del dataset

Specifying the extent

In [7]:
x_min = -1225790
x_max = 53952
y_min = -639572
y_max = 685124

In [None]:
img_templete, proj_templete, geotrans_templete = Grid.read_img("./img_data/dem.tif")
line_num, row_num = img_templete.shape

In [None]:
print(line_num, row_num)

In [None]:
img_intercept = img_templete.copy()
img_aod = img_templete.copy()
img_t = img_templete.copy()
img_p = img_templete.copy()
img_ws = img_templete.copy()
img_rh = img_templete.copy()
img_dem = img_templete.copy()
img_ndvi = img_templete.copy()

In [None]:
DEM = img_templete
NDVI, proj, trans = Grid.read_img("./img_data/ndvi.tif")
imageDictTemplate = {
    'Intercept': img_intercept,
    'AOD': img_aod,
    'T': img_t,
    'P': img_p,
    'WS': img_ws,
    'RH': img_rh
}

In [None]:
from sklearn.ensemble import RandomForestClassifier
def RandomForestRegressionDriver(imageDict, )
    global source_data, img_intercept, img_aod, img_t, 
        img_p, img_ws, img_rh, img_dem, img_ndvi, img_local_r,
        line_num, row_num
    

In [None]:
from scipy.stats.stats import pearsonr

In [None]:
help(pearsonr)

In [None]:
import csv
help(csv)

In [None]:
def ComputePearsonR():
    fileList = ['14-1', '14-4', '14-7', '15-1', '15-7', '16-1', '16-4', '16-7']
    for file in fileList:
        with open('./table/data-'+ file + '.csv', newline='') as csvfile:
            x_aod = []
            x_at = []
            x_dem = []
            x_rh = []
            x_pr = []
            y = []
            table = csv.DictReader(csvfile, delimiter=',', quotechar='|')
            for row in table:
                x_aod.append(float(row['AOD']))
                x_at.append(float(row['AirTemp']))
                x_dem.append(float(row['DEM']))
                x_rh.append(float(row['RH']))
                x_pr.append(float(row['SeaLevelPr']))
                y.append(float(row['pm2_5']))
            r1 = pearsonr(x_aod, y)
            r2 = pearsonr(x_at, y)
            r3 = pearsonr(x_dem, y)
            r4 = pearsonr(x_rh, y)
            r5 = pearsonr(x_pr, y)
            print(file, r1[1] < 0.05, r2[1] < 0.05, r3[1] < 0.05, r4[1] < 0.05, r5[1] < 0.05)

ComputePearsonR()

In [8]:
from scipy import stats

In [19]:
def readImageDataAsString(filePath):
    data = Grid.read_img(filePath)[0]
    shape = np.shape(data)
    array = data.ravel()
    print("Original Shape: \t" + str(shape))
    print("Flattened array size: \t" + str(np.shape(array)[0]))
    print("Array:\t" + str(array))
    return array, shape

In [23]:
ndviArray, shape = readImageDataAsString("./img_data/ndvi.tif")
demArray = readImageDataAsString("./img_data/dem.tif")[0]

Original Shape: 	(2649, 2559)
Flattened array size: 	6778791
Array:	[ 2158  1953  1982 ...  7987  6740 -3000]
Original Shape: 	(2649, 2559)
Flattened array size: 	6778791
Array:	[1458 1454 1458 ...  123   97  104]


In [24]:
print(ndviArray)
print(ndviArray[6778790])

[ 2158  1953  1982 ...  7987  6740 -3000]
-3000


In [25]:
def importAllData(date, hour):
    imageDict = {
        "aod": readImageDataAsString("./img_data/aod-" + str(date) + "-" + str(hour) + ".tif"),
        "t": readImageDataAsString("./img_data/t-" + str(date) + "-" + str(hour) + ".tif"),
        "rh": readImageDataAsString("./img_data/rh-" + str(date) + "-" + str(hour) + ".tif"),
        "p": readImageDataAsString("./img_data/p-" + str(date) + "-" + str(hour) + ".tif"),
        "ws": readImageDataAsString("./img_data/ws-" + str(date) + "-" + str(hour) + ".tif")
    }
    return imageDict

In [63]:
def loadSample(date, hour):       
    with open('./table/data-'+ str(date) + "-" + str(hour) + '.csv', newline='') as csvfile:
        x_aod = []
        x_at = []
        x_dem = []
        x_rh = []
        x_pr = []
        x_ndvi = []
        x_ws = []
        y = []
        table = csv.DictReader(csvfile, delimiter=',', quotechar='|')
        for row in table:
            x_aod.append(float(row['AOD']))
            x_at.append(float(row['AirTemp']))
            x_dem.append(float(row['DEM']))
            x_rh.append(float(row['RH']))
            x_pr.append(float(row['SeaLevelPr']))
            x_ndvi.append(float(row['NDVI']))
            x_ws.append(float(row['WindSpeed']))
            y.append(float(row['pm2_5']))
        xt = [np.ones(8), 
              np.array(x_ndvi), 
              np.array(x_dem), 
              np.array(x_aod), 
              np.array(x_at), 
              np.array(x_rh), 
              np.array(x_pr), 
              np.array(x_ws)]
        xt_np = np.mat(xt)
        x = np.transpose(xt_np)
        y_np = np.transpose(np.array(y))
        print(np.shape(x))
        print(x)
#         print(xt_np)
        print(y_np)
        return x, xt_np, y_np

In [64]:
loadSample(14,1)

(8, 1)
[[array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])]
 [array([[ 1581.],
       [ 2117.],
       [ 2512.],
       [ 2463.],
       [ 4064.],
       [ 2266.],
       [ 1942.],
       [ 1781.],
       [ 2406.],
       [ 2059.],
       [ 1786.],
       [ 2541.],
       [ 3105.],
       [ 2383.],
       [ 1797.],
       [ 2359.],
       [ 2243.],
       [ 1940.],
       [ 2539.],
       [ 1710.],
       [ 1845.],
       [ 1887.],
       [ 1557.],
       [ 1723.],
       [ 1882.],
       [ 1494.],
       [ 1170.],
       [ 1188.],
       [ 1069.],
       [ 1624.],
       [ 1705.],
       [ 1591.],
       [ 1325.],
       [ 1092.],
       [ 1236.],
       [ 1385.],
       [ 2091.],
       [ 1475.],
       [ 1899.],
       [ 1290.],
       [ 1999.],
       [ 1940.],
       [ 1558.],
       [ 1846.],
       [ 2147.],
       [ 2065.],
       [ 2390.],
       [ 2953.],
       [ 1918.],
       [ 3641.],
       [ 1658.],
       [ 1910.],

(matrix([[array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])],
         [array([[ 1581.],
        [ 2117.],
        [ 2512.],
        [ 2463.],
        [ 4064.],
        [ 2266.],
        [ 1942.],
        [ 1781.],
        [ 2406.],
        [ 2059.],
        [ 1786.],
        [ 2541.],
        [ 3105.],
        [ 2383.],
        [ 1797.],
        [ 2359.],
        [ 2243.],
        [ 1940.],
        [ 2539.],
        [ 1710.],
        [ 1845.],
        [ 1887.],
        [ 1557.],
        [ 1723.],
        [ 1882.],
        [ 1494.],
        [ 1170.],
        [ 1188.],
        [ 1069.],
        [ 1624.],
        [ 1705.],
        [ 1591.],
        [ 1325.],
        [ 1092.],
        [ 1236.],
        [ 1385.],
        [ 2091.],
        [ 1475.],
        [ 1899.],
        [ 1290.],
        [ 1999.],
        [ 1940.],
        [ 1558.],
        [ 1846.],
        [ 2147.],
        [ 2065.],
        [ 2390.],
        [ 2953.],
   

In [54]:
def OLS(date, hour):
    # regression
    x_s, xt_s, y_s = loadSample(date, hour)
    beta = np.matmul(xt_s, x_s)
    beta = np.linalg.inv(beta)
    beta = np.matmul(beta, np.xt_s)
    beta = np.matmul(y_s)
    print(beta)
    
    # prediction
    img = importAllData(date, hour)
    xt = [np.ones(6778791), ndviArray, demArray, img['aod'], img['t'], img['rh'], img['p'], img['ws']]
    x = np.transpose(xt)
    y = np.matmul(beta, x)
    
    print(y)
    return y

In [52]:
y_14_1 = OLS(14, 1)
print(y_14_1)

TypeError: ufunc 'matmul' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''