# Давайте посмотрим на данные

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns

%matplotlib inline

In [3]:
# Load the raw CSV export and preview its first rows to see what the columns
# look like before any clean-up.
raw_data = pd.read_csv(filepath_or_buffer="data/Moscow_vdnh_dolgoprudniy.csv")
raw_data.head(n=5)

Unnamed: 0,STN---,WBAN,YEARMODA,TEMP,Unnamed: 5,DEWP,.1,SLP,.2,STP,...,WDSP,.5,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT,Unnamed: 22
0,276120,99999,19361231,16.0,4,9999.9,0,1022.9,4,9999.9,...,9.0,4,13.0,999.9,27.0*,5.0,0.00I,999.9,0,
1,276120,99999,19370101,33.5,4,9999.9,0,1020.9,4,9999.9,...,13.0,4,13.0,999.9,34.0*,18.0,99.99,999.9,10000,
2,276120,99999,19370102,28.8,4,9999.9,0,1020.4,4,9999.9,...,999.9,0,999.9,999.9,32.0,27.0*,0.00I,999.9,0,
3,276120,99999,19370103,29.0,4,9999.9,0,1013.2,4,9999.9,...,17.0,4,23.9,999.9,30.0,27.0,99.99,999.9,11000,
4,276120,99999,19370104,30.0,4,9999.9,0,1013.2,4,9999.9,...,13.3,4,18.1,999.9,32.0,28.0,99.99,999.9,1000,


In [30]:
# Praise the Documentation-sama!
# Field-by-field description of the dataset, pasted from its documentation.
# The readable column names used below are parsed out of this text.
_docs = '''STN---  1-6       Int.   Station number (WMO/DATSAV3 number)
                         for the location.

WBAN    8-12      Int.   WBAN number where applicable--this is the
                         historical "Weather Bureau Air Force Navy"
                         number - with WBAN being the acronym.

YEAR    15-18     Int.   The year.

MODA    19-22     Int.   The month and day.

TEMP    25-30     Real   Mean temperature for the day in degrees
                         Fahrenheit to tenths.  Missing = 9999.9
                       

Count   32-33     Int.   Number of observations used in 
                         calculating mean temperature.

DEWP    36-41     Real   Mean dew point for the day in degrees
                         Fahrenheit to tenths.  Missing = 9999.9
                         
Count   43-44     Int.   Number of observations used in 
                         calculating mean dew point.  

SLP     47-52     Real   Mean sea level pressure for the day
                         in millibars to tenths.  Missing =       
                         9999.9
Count   54-55     Int.   Number of observations used in 
                         calculating mean sea level pressure.

STP     58-63     Real   Mean station pressure for the day
                         in millibars to tenths.  Missing =       
                         9999.9
Count   65-66     Int.   Number of observations used in 
                         calculating mean station pressure.  

VISIB   69-73     Real   Mean visibility for the day in miles
                         to tenths.  Missing = 999.9
                         
Count   75-76     Int.   Number of observations used in 
                         calculating mean visibility.      

WDSP    79-83     Real   Mean wind speed for the day in knots
                         to tenths.  Missing = 999.9 
                        
Count   85-86     Int.   Number of observations used in 
                         calculating mean wind speed.

MXSPD   89-93     Real   Maximum sustained wind speed reported 
                         for the day in knots to tenths.
                         Missing = 999.9
                         

GUST    96-100    Real   Maximum wind gust reported for the day
                         in knots to tenths.  Missing = 999.9
                         

MAX     103-108   Real   Maximum temperature reported during the 
                         day in Fahrenheit to tenths--time of max 
                         temp report varies by country and        
                         region, so this will sometimes not be    
                         the max for the calendar day.  Missing = 
                         9999.9     
                         
Flag    109-109   Char   Blank indicates max temp was taken from the
                         explicit max temp report and not from the              
                         'hourly' data.  * indicates max temp was 
                         derived from the hourly data (i.e., highest
                         hourly or synoptic-reported temperature).

MIN     111-116   Real   Minimum temperature reported during the 
                         day in Fahrenheit to tenths--time of min 
                         temp report varies by country and        
                         region, so this will sometimes not be  
                         the min for the calendar day.  Missing = 
                         9999.9
                        
Flag    117-117   Char   Blank indicates min temp was taken from the
                         explicit min temp report and not from the              
                         'hourly' data.  * indicates min temp was 
                         derived from the hourly data (i.e., lowest
                         hourly or synoptic-reported temperature).

PRCP    119-123   Real   Total precipitation (rain and/or melted
                         snow) reported during the day in inches
                         and hundredths; will usually not end 
                         with the midnight observation--i.e., 
                         may include latter part of previous day.
                         .00 indicates no measurable              
                         precipitation (includes a trace).        
                         Missing = 99.99
                         Note:  Many stations do not report '0' on
                         days with no precipitation--therefore,  
                         '99.99' will often appear on these days.
                         Also, for example, a station may only
                         report a 6-hour amount for the period 
                         during which rain fell.
                         See Flag field for source of data.
Flag    124-124   Char   A = 1 report of 6-hour precipitation 
                             amount.
                         B = Summation of 2 reports of 6-hour 
                             precipitation amount.
                         C = Summation of 3 reports of 6-hour 
                             precipitation amount.
                         D = Summation of 4 reports of 6-hour 
                             precipitation amount.
                         E = 1 report of 12-hour precipitation
                             amount.
                         F = Summation of 2 reports of 12-hour
                             precipitation amount.
                         G = 1 report of 24-hour precipitation
                             amount.
                         H = Station reported '0' as the amount
                             for the day (eg, from 6-hour reports),
                             but also reported at least one
                             occurrence of precipitation in hourly
                             observations--this could indicate a
                             trace occurred, but should be considered
                             as incomplete data for the day.
                         I = Station did not report any precip data
                             for the day and did not report any
                             occurrences of precipitation in its hourly
                             observations--it's still possible that
                             precip occurred but was not reported.

SNDP    126-130   Real   Snow depth in inches to tenths--last     
                         report for the day if reported more than
                         once.  Missing = 999.9
                        Note:  Most stations do not report '0' on
                         days with no snow on the ground--therefore,
                         '999.9' will often appear on these days.

FRSHTT  133-138   Int.   Indicators (1 = yes, 0 = no/not          
                         reported) for the occurrence during the 
                         day of:
                         Fog ('F' - 1st digit).
                         Rain or Drizzle ('R' - 2nd digit).
                         Snow or Ice Pellets ('S' - 3rd digit).
                         Hail ('H' - 4th digit).
                         Thunder ('T' - 5th digit).
                         Tornado or Funnel Cloud ('T' - 6th       
                         digit).
'''


def column_names_from_docs(docs):
    """Return the field names listed in a fixed-width format description.

    A field name is the first word of a line that starts in column 0; blank
    lines and indented continuation lines are skipped. The generic ``Count``
    and ``Flag`` entries are qualified with the name of the field that
    precedes them (e.g. ``TEMP_Count``, ``MAX_Flag``).

    Args:
        docs: The format description as one multi-line string.

    Returns:
        The field names as a list of strings, in order of appearance.
    """
    names = []
    for line in docs.splitlines():
        # Only lines that begin with a non-blank character introduce a field.
        if not line or line[0].isspace():
            continue
        # split() keeps a space-less line intact; slicing with str.find()
        # would silently cut its last character because find() returns -1.
        word = line.split()[0]
        if word in ('Count', 'Flag') and names:
            word = names[-1] + '_' + word
        names.append(word)
    return names


# A bit of a gotcha: the flags are not separate columns in the CSV, they are
# glued to the value itself (e.g. "27.0*"), so they get no column name here.
adeq_cols = [name for name in column_names_from_docs(_docs)
             if not name.endswith('Flag')]

# YEAR and MODA arrive as the single YEARMODA column, so they share one name.
year_pos = adeq_cols.index('YEAR')
adeq_cols[year_pos:year_pos + 2] = ['DATE']

adeq_cols

['STN---',
 'WBAN',
 'DATE',
 'TEMP',
 'TEMP_Count',
 'DEWP',
 'DEWP_Count',
 'SLP',
 'SLP_Count',
 'STP',
 'STP_Count',
 'VISIB',
 'VISIB_Count',
 'WDSP',
 'WDSP_Count',
 'MXSPD',
 'GUST',
 'MAX',
 'MIN',
 'PRCP',
 'SNDP',
 'FRSHTT']

In [53]:
# Re-label the raw frame with the readable names; the trailing unnamed column
# is dropped first so that the number of columns matches adeq_cols.
data = pd.DataFrame(raw_data.drop(columns=['Unnamed: 22']).values, columns=adeq_cols)

# Columns whose values may carry a one-character flag glued to the number:
# '*' for MAX/MIN and a letter 'A'..'I' for PRCP (see the format description).
flagged = ['MAX', 'MIN', 'PRCP']
flags = set('ABCDEFGHI') | {'*'}

for name in flagged:
    # Go through str so that a column pandas parsed as plain floats (no flag
    # anywhere in it) is handled too instead of failing on indexing a float.
    text = data[name].astype(str)
    last_char = text.str[-1]
    has_flag = last_char.isin(flags)
    # '_' marks "no flag"; the flag character is stripped from the value.
    data[name + '_Flag'] = last_char.where(has_flag, '_')
    data[name] = text.str[:-1].where(has_flag, text)

# There are also piles of nines used in place of NaNs. The sentinels are keyed
# by column name (taken from the "Missing = ..." notes in the description), so
# they cannot drift out of step with the column order.
# PRCP uses 99.99; the previous value 99.9 never matched any row.
sentinels = {
    'TEMP': 9999.9, 'DEWP': 9999.9, 'SLP': 9999.9, 'STP': 9999.9,
    'VISIB': 999.9, 'WDSP': 999.9, 'MXSPD': 999.9, 'GUST': 999.9,
    'MAX': 9999.9, 'MIN': 9999.9, 'PRCP': 99.99, 'SNDP': 999.9,
}
numeric_cols = [name for name in adeq_cols
                if name not in ('STN---', 'WBAN', 'DATE', 'FRSHTT')]
# Count columns have no sentinel; NaN never compares equal, so their
# *_is_missing column is all zeros (kept so the output schema is unchanged).
missing = [sentinels.get(name, np.nan) for name in numeric_cols]
for miss, name in zip(missing, numeric_cols):
    data[name] = data[name].astype(float)
    data[name + '_is_missing'] = (data[name] == miss).astype(int)

# The last magic column: six 0/1 indicator digits packed into one integer.
# Reading the decimal digits as a binary number restores the leading zeros
# that were lost when the CSV value was parsed as an integer.
FRSHTT = ['Fog', 'Rain or Drizzle', 'Snow or Ice Pellets', 'Hail', 'Thunder', 'Tornado or Funnel Cloud']
frshtt_bits = data['FRSHTT'].map(lambda code: int(str(code), 2))
for i, label in enumerate(FRSHTT):
    # The first indicator is the most significant of the six bits.
    shift = len(FRSHTT) - i - 1
    data['Is_' + label] = frshtt_bits.map(lambda bits, shift=shift: (bits >> shift) & 1)
data = data.drop(columns=['FRSHTT'])

# The first two columns are not needed at all: per the author they hold the
# same value on every row.
data = data.drop(columns=['STN---', 'WBAN'])

data.head().T

Unnamed: 0,0,1,2,3,4
DATE,19361231,19370101,19370102,19370103,19370104
TEMP,16,33.5,28.8,29,30
TEMP_Count,4,4,4,4,4
DEWP,9999.9,9999.9,9999.9,9999.9,9999.9
DEWP_Count,0,0,0,0,0
SLP,1022.9,1020.9,1020.4,1013.2,1013.2
SLP_Count,4,4,4,4,4
STP,9999.9,9999.9,9999.9,9999.9,9999.9
STP_Count,0,0,0,0,0
VISIB,3,2.5,4,2.5,2.2


In [54]:
# Persist the cleaned frame for later steps. With pandas' defaults the row
# index is written as the first, unnamed column.
data.to_csv(path_or_buf="data/Moscow_weather_preparsed.csv")