In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer

In [22]:
# koristimo train_timeseries kao dataset pošto sadrži dovoljan broj podataka
data = pd.read_csv("~/ip_timeseries/train_timeseries.csv")

In [23]:
# imamo jedan object: date
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19300680 entries, 0 to 19300679
Data columns (total 21 columns):
 #   Column       Dtype  
---  ------       -----  
 0   fips         int64  
 1   date         object 
 2   PRECTOT      float64
 3   PS           float64
 4   QV2M         float64
 5   T2M          float64
 6   T2MDEW       float64
 7   T2MWET       float64
 8   T2M_MAX      float64
 9   T2M_MIN      float64
 10  T2M_RANGE    float64
 11  TS           float64
 12  WS10M        float64
 13  WS10M_MAX    float64
 14  WS10M_MIN    float64
 15  WS10M_RANGE  float64
 16  WS50M        float64
 17  WS50M_MAX    float64
 18  WS50M_MIN    float64
 19  WS50M_RANGE  float64
 20  score        float64
dtypes: float64(19), int64(1), object(1)
memory usage: 3.0+ GB


In [24]:
# sadrži NaN vrednosti u score
data.isna().any()

fips           False
date           False
PRECTOT        False
PS             False
QV2M           False
T2M            False
T2MDEW         False
T2MWET         False
T2M_MAX        False
T2M_MIN        False
T2M_RANGE      False
TS             False
WS10M          False
WS10M_MAX      False
WS10M_MIN      False
WS10M_RANGE    False
WS50M          False
WS50M_MAX      False
WS50M_MIN      False
WS50M_RANGE    False
score           True
dtype: bool

In [25]:
# izbacujemo date
data = data.drop('date', axis = 1).reset_index(drop=True)

# T2M_RANGE, WS10M_RANGE i WS50M_RANGE su redundantne i
# mogu da se izbace pošto imamo i max i min za ove vrednosti
data = data.drop('T2M_RANGE', axis = 1)
data = data.drop('WS10M_RANGE', axis = 1)
data = data.drop('WS50M_RANGE', axis = 1)

# izbacujemo FIPS
data = data.drop('fips', axis = 1)

data.head()

Unnamed: 0,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,TS,WS10M,WS10M_MAX,WS10M_MIN,WS50M,WS50M_MAX,WS50M_MIN,score
0,0.22,100.51,9.65,14.74,13.51,13.51,20.96,11.46,14.65,2.2,2.94,1.49,4.85,6.04,3.23,
1,0.2,100.55,10.42,16.69,14.71,14.71,22.8,12.61,16.6,2.52,3.43,1.83,5.33,6.13,3.72,
2,3.65,100.15,11.76,18.49,16.52,16.52,22.73,15.32,18.41,4.03,5.33,2.66,7.53,9.52,5.87,
3,15.95,100.29,6.42,11.4,6.09,6.1,18.09,2.16,11.31,3.84,5.67,2.08,6.73,9.31,3.74,1.0
4,0.0,101.15,2.95,3.86,-3.29,-3.2,10.82,-2.66,2.65,1.6,2.5,0.52,2.94,4.85,0.65,


In [26]:
# formatiranje score-a
data = data[data['score'].notnull()].reset_index(drop=True)
data['score'] = data['score'].round().astype(int)

data

Unnamed: 0,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,TS,WS10M,WS10M_MAX,WS10M_MIN,WS50M,WS50M_MAX,WS50M_MIN,score
0,15.95,100.29,6.42,11.40,6.09,6.10,18.09,2.16,11.31,3.84,5.67,2.08,6.73,9.31,3.74,1
1,1.33,100.40,6.63,11.48,7.84,7.84,18.88,5.72,10.43,1.76,2.48,1.05,3.55,6.38,1.71,2
2,1.11,100.39,9.53,14.28,13.26,13.26,18.04,8.98,14.19,2.63,3.60,1.67,5.19,6.40,3.84,2
3,0.00,100.11,2.05,-0.78,-7.93,-7.72,5.65,-5.46,-0.61,3.35,4.59,2.28,5.75,8.03,3.96,2
4,0.00,101.00,3.36,2.06,-1.73,-1.70,11.02,-4.21,1.88,2.03,2.74,0.88,4.18,6.38,1.27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756791,0.04,82.47,2.19,-4.27,-9.36,-9.26,-1.51,-7.22,-4.38,6.93,10.27,3.08,9.42,12.59,5.62,0
2756792,0.13,82.59,0.91,-11.96,-18.65,-17.90,-6.85,-15.24,-11.98,2.27,3.97,0.95,3.07,5.16,0.83,0
2756793,0.02,82.86,1.40,-9.56,-14.18,-13.95,-2.69,-13.01,-10.95,3.64,6.59,1.28,5.33,8.35,2.13,0
2756794,0.32,83.12,2.73,-2.55,-6.90,-6.84,2.66,-7.77,-3.94,6.16,8.43,3.44,8.99,10.14,6.67,0


In [27]:
# izbacivanje autlajera
for i in data.columns:
    if i == 'score':
        continue
    data = data[(data[f'{i}'] <= data[f'{i}'].mean() + 3 * data[f'{i}'].std())
              & (data[f'{i}'] >= data[f'{i}'].mean() - 3 * data[f'{i}'].std())]

In [28]:
data

Unnamed: 0,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,TS,WS10M,WS10M_MAX,WS10M_MIN,WS50M,WS50M_MAX,WS50M_MIN,score
0,15.95,100.29,6.42,11.40,6.09,6.10,18.09,2.16,11.31,3.84,5.67,2.08,6.73,9.31,3.74,1
1,1.33,100.40,6.63,11.48,7.84,7.84,18.88,5.72,10.43,1.76,2.48,1.05,3.55,6.38,1.71,2
2,1.11,100.39,9.53,14.28,13.26,13.26,18.04,8.98,14.19,2.63,3.60,1.67,5.19,6.40,3.84,2
3,0.00,100.11,2.05,-0.78,-7.93,-7.72,5.65,-5.46,-0.61,3.35,4.59,2.28,5.75,8.03,3.96,2
4,0.00,101.00,3.36,2.06,-1.73,-1.70,11.02,-4.21,1.88,2.03,2.74,0.88,4.18,6.38,1.27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756791,0.04,82.47,2.19,-4.27,-9.36,-9.26,-1.51,-7.22,-4.38,6.93,10.27,3.08,9.42,12.59,5.62,0
2756792,0.13,82.59,0.91,-11.96,-18.65,-17.90,-6.85,-15.24,-11.98,2.27,3.97,0.95,3.07,5.16,0.83,0
2756793,0.02,82.86,1.40,-9.56,-14.18,-13.95,-2.69,-13.01,-10.95,3.64,6.59,1.28,5.33,8.35,2.13,0
2756794,0.32,83.12,2.73,-2.55,-6.90,-6.84,2.66,-7.77,-3.94,6.16,8.43,3.44,8.99,10.14,6.67,0


In [37]:
new_data = data.copy()

In [38]:
for i in data.columns:
    maximum = new_data[f"{i}"].max()
    minimum = new_data[f"{i}"].min()
    
    rangeOfCol = maximum - minimum
    width = int(rangeOfCol/5)
    
    min_value = int(np.floor(minimum))
    max_value = int(np.floor(maximum))
    
    intervals = [j for j in range(min_value, max_value + width, width)]
    
    new_data[f"{i}"] = pd.cut(x = new_data[f"{i}"], bins = intervals, include_lowest = True)
    
new_data

Unnamed: 0,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,TS,WS10M,WS10M_MAX,WS10M_MIN,WS50M,WS50M_MAX,WS50M_MIN,score
0,"(12.0, 16.0]","(100.0, 104.0]","(4.0, 8.0]","(2.0, 13.0]","(4.0, 13.0]","(5.0, 14.0]","(8.0, 20.0]","(-2.0, 9.0]","(4.0, 16.0]","(3.0, 4.0]","(4.0, 6.0]","(2.0, 3.0]","(6.0, 8.0]","(8.0, 10.0]","(3.0, 4.0]","(-0.001, 1.0]"
1,"(-0.001, 4.0]","(100.0, 104.0]","(4.0, 8.0]","(2.0, 13.0]","(4.0, 13.0]","(5.0, 14.0]","(8.0, 20.0]","(-2.0, 9.0]","(4.0, 16.0]","(1.0, 2.0]","(2.0, 4.0]","(1.0, 2.0]","(2.0, 4.0]","(6.0, 8.0]","(1.0, 2.0]","(1.0, 2.0]"
2,"(-0.001, 4.0]","(100.0, 104.0]","(8.0, 12.0]","(13.0, 24.0]","(13.0, 22.0]","(5.0, 14.0]","(8.0, 20.0]","(-2.0, 9.0]","(4.0, 16.0]","(2.0, 3.0]","(2.0, 4.0]","(1.0, 2.0]","(4.0, 6.0]","(6.0, 8.0]","(3.0, 4.0]","(1.0, 2.0]"
3,"(-0.001, 4.0]","(100.0, 104.0]","(-0.001, 4.0]","(-9.0, 2.0]","(-14.0, -5.0]","(-13.0, -4.0]","(-4.0, 8.0]","(-13.0, -2.0]","(-8.0, 4.0]","(3.0, 4.0]","(4.0, 6.0]","(2.0, 3.0]","(4.0, 6.0]","(8.0, 10.0]","(3.0, 4.0]","(1.0, 2.0]"
4,"(-0.001, 4.0]","(100.0, 104.0]","(-0.001, 4.0]","(2.0, 13.0]","(-5.0, 4.0]","(-4.0, 5.0]","(8.0, 20.0]","(-13.0, -2.0]","(-8.0, 4.0]","(2.0, 3.0]","(2.0, 4.0]","(-0.001, 1.0]","(4.0, 6.0]","(6.0, 8.0]","(1.0, 2.0]","(-0.001, 1.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756791,"(-0.001, 4.0]","(79.999, 84.0]","(-0.001, 4.0]","(-9.0, 2.0]","(-14.0, -5.0]","(-13.0, -4.0]","(-4.0, 8.0]","(-13.0, -2.0]","(-8.0, 4.0]","(6.0, 7.0]","(10.0, 12.0]","(3.0, 4.0]","(8.0, 10.0]","(12.0, 14.0]","(5.0, 6.0]","(-0.001, 1.0]"
2756792,"(-0.001, 4.0]","(79.999, 84.0]","(-0.001, 4.0]","(-20.001, -9.0]","(-23.001, -14.0]","(-22.001, -13.0]","(-16.001, -4.0]","(-24.001, -13.0]","(-20.001, -8.0]","(2.0, 3.0]","(2.0, 4.0]","(-0.001, 1.0]","(2.0, 4.0]","(4.0, 6.0]","(-0.001, 1.0]","(-0.001, 1.0]"
2756793,"(-0.001, 4.0]","(79.999, 84.0]","(-0.001, 4.0]","(-20.001, -9.0]","(-23.001, -14.0]","(-22.001, -13.0]","(-4.0, 8.0]","(-24.001, -13.0]","(-20.001, -8.0]","(3.0, 4.0]","(6.0, 8.0]","(1.0, 2.0]","(4.0, 6.0]","(8.0, 10.0]","(2.0, 3.0]","(-0.001, 1.0]"
2756794,"(-0.001, 4.0]","(79.999, 84.0]","(-0.001, 4.0]","(-9.0, 2.0]","(-14.0, -5.0]","(-13.0, -4.0]","(-4.0, 8.0]","(-13.0, -2.0]","(-8.0, 4.0]","(6.0, 7.0]","(8.0, 10.0]","(3.0, 4.0]","(8.0, 10.0]","(10.0, 12.0]","(6.0, 7.0]","(-0.001, 1.0]"


In [39]:
new_data.to_csv("association_rules_file.csv")