In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from datetime import date
import re
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
# get the Data
def get_data(path, displ=False):
    ## import data
    df = pd.read_csv(path, parse_dates=['date'], infer_datetime_format=True)

    # sort data
    df = df.sort_values(by=["number_sta", "date"])

    # set number_sta as category
    df["number_sta"] = df["number_sta"].astype("category")
    if displ:
        display("df :", df.head())

    return df

In [3]:
def get_observations(x, displ=False):
    ## shift X
    #get the observation baseline
    obs = x[{"number_sta", "date", "precip"}]
    # obs.set_index('date', inplace=True)

    #if any NaN on the day, then the value is NaN (24 values per day)
    # obs = obs.groupby('number_sta').resample('D')#.agg(pd.Series.sum, min_count = 24)
    obs['date'] = obs['date'].astype('category')
    obs['number_sta'] = obs['number_sta'].astype('category')
    obs['baseline_obs'] = obs.groupby(['number_sta'])['precip'].shift(1)

    obs = obs.sort_values(by=["number_sta", "date"])
    del obs['precip']
    obs = obs.rename(columns={'baseline_obs': 'precip'})
    # obs_new = obs.reset_index()

    if displ:
        display(obs)

    return obs

In [4]:
def clean_data(df, to_be_droped=["Id"], remove_na=False, displ=False):
    """
    Resolve NaN values, Remove unexplotable features.
    : return : df : DataFrames, no NaN
    """
    # remove nan, id and number_sta for now
    for key in to_be_droped:
        try:
            df = df.drop(key, axis=1)
        except KeyError:
            print("Warning : ", "'" + key + "'", "not found in df's columns.")

    #remove row with all Na
    if remove_na:
        if type(remove_na) == str :
            df = df.dropna(axis=0, how=remove_na)
        else :
            df = df.dropna(axis=0, how="all")

    if displ:
        display("df :", df.head(5))

    return df

In [5]:
x_df = get_data(path='../Train/Train/X_station_train.csv').dropna(how="any")
y_df = get_data(path='../Train/Train/Y_train.csv')

In [6]:
obs = get_observations(x_df)

In [7]:
x_clean = clean_data(x_df, ["date", "Id"])
x_df = clean_data(x_df, ["Id"])
obs_clean = clean_data(obs, ["date"])
display(x_df)

Unnamed: 0,number_sta,date,ff,t,td,hu,dd,precip
0,14066001,2016-01-01 00:00:00,3.05,279.28,277.97,91.4,200.0,0.0
1,14066001,2016-01-01 01:00:00,2.57,278.76,277.45,91.4,190.0,0.0
2,14066001,2016-01-01 02:00:00,2.26,278.27,277.02,91.7,181.0,0.0
3,14066001,2016-01-01 03:00:00,2.62,277.98,276.95,93.0,159.0,0.0
4,14066001,2016-01-01 04:00:00,2.99,277.32,276.72,95.9,171.0,0.0
...,...,...,...,...,...,...,...,...
4409469,95690001,2017-12-30 19:00:00,9.10,286.68,283.44,80.8,239.0,0.0
4409470,95690001,2017-12-30 20:00:00,8.58,286.39,283.21,81.1,231.0,0.0
4409471,95690001,2017-12-30 21:00:00,8.74,286.28,283.40,82.6,226.0,0.0
4409472,95690001,2017-12-30 22:00:00,9.04,286.21,283.29,82.4,224.0,0.0


In [8]:
obs_clean.reset_index(drop=True, inplace=True)
x_clean.reset_index(drop=True, inplace=True)
x_df.reset_index(drop=True, inplace=True)
na_ids = np.where(pd.isnull(obs_clean).any(1))[0]

In [9]:
obs_clean.dropna(inplace=True)
x_clean.drop(index=na_ids, inplace=True)
x_df.drop(index=na_ids, inplace=True)

obs_clean.reset_index(drop=True, inplace=True)
x_clean.reset_index(drop=True, inplace=True)
x_df.reset_index(drop=True, inplace=True)

display(obs_clean)
display(x_clean)
display(x_df)

Unnamed: 0,number_sta,precip
0,14066001,0.0
1,14066001,0.0
2,14066001,0.0
3,14066001,0.0
4,14066001,0.0
...,...,...
2263960,95690001,0.0
2263961,95690001,0.0
2263962,95690001,0.0
2263963,95690001,0.0


Unnamed: 0,number_sta,ff,t,td,hu,dd,precip
0,14066001,2.57,278.76,277.45,91.4,190.0,0.0
1,14066001,2.26,278.27,277.02,91.7,181.0,0.0
2,14066001,2.62,277.98,276.95,93.0,159.0,0.0
3,14066001,2.99,277.32,276.72,95.9,171.0,0.0
4,14066001,2.50,277.18,276.83,97.5,164.0,0.0
...,...,...,...,...,...,...,...
2263960,95690001,9.10,286.68,283.44,80.8,239.0,0.0
2263961,95690001,8.58,286.39,283.21,81.1,231.0,0.0
2263962,95690001,8.74,286.28,283.40,82.6,226.0,0.0
2263963,95690001,9.04,286.21,283.29,82.4,224.0,0.0


Unnamed: 0,number_sta,date,ff,t,td,hu,dd,precip
0,14066001,2016-01-01 01:00:00,2.57,278.76,277.45,91.4,190.0,0.0
1,14066001,2016-01-01 02:00:00,2.26,278.27,277.02,91.7,181.0,0.0
2,14066001,2016-01-01 03:00:00,2.62,277.98,276.95,93.0,159.0,0.0
3,14066001,2016-01-01 04:00:00,2.99,277.32,276.72,95.9,171.0,0.0
4,14066001,2016-01-01 05:00:00,2.50,277.18,276.83,97.5,164.0,0.0
...,...,...,...,...,...,...,...,...
2263960,95690001,2017-12-30 19:00:00,9.10,286.68,283.44,80.8,239.0,0.0
2263961,95690001,2017-12-30 20:00:00,8.58,286.39,283.21,81.1,231.0,0.0
2263962,95690001,2017-12-30 21:00:00,8.74,286.28,283.40,82.6,226.0,0.0
2263963,95690001,2017-12-30 22:00:00,9.04,286.21,283.29,82.4,224.0,0.0


In [10]:
# obs_clean.set_index("number_sta", inplace=True)
# obs_clean = obs_clean["precip"]
# x_clean.reset_index(drop=True, inplace=True)
# x_clean.set_index("number_sta", inplace=True)

In [11]:
# linear regression
lin_reg = LinearRegression()
lin_reg.fit(x_clean, obs_clean)
prediction = lin_reg.predict(x_clean)

In [12]:
prediction = pd.DataFrame(prediction, columns=["number_sta","precip"])
prediction.iloc[:,0] = np.array(prediction.iloc[:,0],dtype=int)
print(prediction)
# prediction

         number_sta    precip
0          14066001  0.062349
1          14066001  0.057333
2          14066001  0.067506
3          14066001  0.089148
4          14066001  0.088908
...             ...       ...
2263960    95690000  0.131871
2263961    95690000  0.123784
2263962    95690000  0.131579
2263963    95690000  0.135426
2263964    95690000  0.124015

[2263965 rows x 2 columns]


In [13]:
prediction["date"] = x_df["date"]
print(prediction)

         number_sta    precip                date
0          14066001  0.062349 2016-01-01 01:00:00
1          14066001  0.057333 2016-01-01 02:00:00
2          14066001  0.067506 2016-01-01 03:00:00
3          14066001  0.089148 2016-01-01 04:00:00
4          14066001  0.088908 2016-01-01 05:00:00
...             ...       ...                 ...
2263960    95690000  0.131871 2017-12-30 19:00:00
2263961    95690000  0.123784 2017-12-30 20:00:00
2263962    95690000  0.131579 2017-12-30 21:00:00
2263963    95690000  0.135426 2017-12-30 22:00:00
2263964    95690000  0.124015 2017-12-30 23:00:00

[2263965 rows x 3 columns]


In [14]:
prediction.set_index('date',inplace = True)
display(prediction)

Unnamed: 0_level_0,number_sta,precip
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 01:00:00,14066001,0.062349
2016-01-01 02:00:00,14066001,0.057333
2016-01-01 03:00:00,14066001,0.067506
2016-01-01 04:00:00,14066001,0.089148
2016-01-01 05:00:00,14066001,0.088908
...,...,...
2017-12-30 19:00:00,95690000,0.131871
2017-12-30 20:00:00,95690000,0.123784
2017-12-30 21:00:00,95690000,0.131579
2017-12-30 22:00:00,95690000,0.135426


In [15]:
prediction_summed = prediction.groupby('number_sta').resample('D').agg(pd.Series.sum, min_count = 24)

In [16]:
display(prediction_summed)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_sta,precip
number_sta,date,Unnamed: 2_level_1,Unnamed: 3_level_1
14066001,2016-01-01,,
14066001,2016-01-02,3.375840e+08,4.149822
14066001,2016-01-03,3.375840e+08,6.834120
14066001,2016-01-04,3.375840e+08,2.228985
14066001,2016-01-05,3.375840e+08,2.069809
...,...,...,...
95690000,2017-12-26,2.296560e+09,3.722518
95690000,2017-12-27,2.296560e+09,4.310331
95690000,2017-12-28,2.296560e+09,1.094156
95690000,2017-12-29,2.296560e+09,4.130509


In [17]:
p = pd.DataFrame(prediction_summed)
del p["number_sta"]
p = p.reset_index()

p["date"] = pd.to_datetime(p["date"])

p.columns = ['number_sta', 'date', 'precip']

p['number_sta'] = p['number_sta'].astype('category')

display(p)

Unnamed: 0,number_sta,date,precip
0,14066001,2016-01-01,
1,14066001,2016-01-02,4.149822
2,14066001,2016-01-03,6.834120
3,14066001,2016-01-04,2.228985
4,14066001,2016-01-05,2.069809
...,...,...,...
96596,95690000,2017-12-26,3.722518
96597,95690000,2017-12-27,4.310331
96598,95690000,2017-12-28,1.094156
96599,95690000,2017-12-29,4.130509


In [18]:
print(y_df)

             date number_sta  Ground_truth            Id
0      2016-01-02   14066001           3.4    14066001_0
249    2016-01-03   14066001          11.7    14066001_1
499    2016-01-04   14066001           0.6    14066001_2
749    2016-01-05   14066001           0.4    14066001_3
997    2016-01-06   14066001           3.0    14066001_4
...           ...        ...           ...           ...
182747 2017-12-27   95690001           3.2  95690001_725
182997 2017-12-28   95690001           0.0  95690001_726
183247 2017-12-29   95690001           4.4  95690001_727
183497 2017-12-30   95690001           5.4  95690001_728
183746 2017-12-31   95690001           1.2  95690001_729

[183747 rows x 4 columns]


In [19]:
y_df.isna().sum(axis=0)

date                0
number_sta          0
Ground_truth    21640
Id                  0
dtype: int64