Imports

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Dataframe

In [108]:
def get_processed_df(path: str) -> pd.DataFrame:
    """Read a two-column CSV file and build a dataframe with deviation columns
    Args:
      path: path to a header-less CSV file whose columns are a date (YYYY-MM-DD) and a numeric value
    Returns:
      Dataframe without NaN rows, indexed 0..n-1, with the columns "Date", "Value",
      "MedianDeviation" (absolute deviation of "Value" from its median) and
      "StdDeviation" (absolute deviation of "Value" from its mean)
    Raises:
      ValueError: if the file does not have exactly two columns or a date does not match YYYY-MM-DD
    """
    df = pd.read_csv(path, header=None)
    df.columns = ["Date", "Value"]
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

    # Drop incomplete rows and renumber the index. Building a new frame instead of
    # dropna(inplace=True, ignore_index=True) also works on pandas releases older than 2.0.
    df = df.dropna().reset_index(drop=True)

    value_mean = df["Value"].mean()
    value_median = df["Value"].median()
    df["MedianDeviation"] = (value_median - df["Value"]).abs()
    # NOTE: despite the column name, this is the distance from the MEAN, not a
    # standard deviation; the name is kept because other cells select it by name.
    df["StdDeviation"] = (value_mean - df["Value"]).abs()
    return df

In [109]:
# Load the dataset once and add the deviation columns; the cells below reuse `df`.
df = get_processed_df("dataset.csv")

In [110]:
# Show the frame built in the previous cell instead of reading and processing the CSV a second time.
print(df)

           Date    Value  MedianDeviation  StdDeviation
0    2008-01-10  24.4387          33.1753     27.304524
1    2008-01-11  24.4796          33.1344     27.263624
2    2008-01-12  24.3671          33.2469     27.376124
3    2008-01-15  24.2913          33.3227     27.451924
4    2008-01-16  24.2858          33.3282     27.457424
...         ...      ...              ...           ...
3932 2023-11-28  88.7045          31.0905     36.961276
3933 2023-11-29  88.6102          30.9962     36.866976
3934 2023-11-30  88.8841          31.2701     37.140876
3935 2023-12-01  88.5819          30.9679     36.838676
3936 2023-12-02  89.7619          32.1479     38.018676

[3937 rows x 4 columns]


Statistics

In [111]:
def get_statistical_info(df: pd.DataFrame, parametr: str) -> "pd.Series | None":
    """Get descriptive statistics for one column
    Args:
      df: Dataframe with original values
      parametr: name of the column to describe
    Returns:
      The result of describe() for the column, or None when df has no column named parametr
    """
    # A missing column is reported as None (as before), now stated explicitly
    # rather than by falling off the end of the function.
    if parametr not in df.columns:
        return None
    return df[parametr].describe()

In [112]:
# Print the summary statistics of the value column and of both deviation columns, in that order.
for column_name in ("Value", "MedianDeviation", "StdDeviation"):
    print(get_statistical_info(df, column_name))


count    3937.000000
mean       51.743224
std        20.014501
min        23.125500
25%        31.179100
50%        57.614000
75%        66.330900
max       120.378500
Name: Value, dtype: float64
count    3937.000000
mean       17.763405
std        10.928693
min         0.000000
25%         7.196200
50%        18.951000
75%        26.791300
max        62.764500
Name: MedianDeviation, dtype: float64
count    3937.000000
mean       18.260302
std         8.188828
min         0.014176
25%        12.621976
50%        19.493876
75%        22.298024
max        68.635276
Name: StdDeviation, dtype: float64


Filtration

In [113]:
def std_deviation_filtration(df: pd.DataFrame, std_deviation: float) -> pd.DataFrame:
    """Select rows by a lower bound on the "StdDeviation" column
    Args:
      df: Dataframe that contains a "StdDeviation" column
      std_deviation: smallest "StdDeviation" value a row may have to be kept
    Returns:
      Dataframe with only the rows whose "StdDeviation" is not less than std_deviation
    """
    keep_row = df["StdDeviation"].ge(std_deviation)
    return df.loc[keep_row]

In [115]:
# Show the rows whose "StdDeviation" column is at least 30.
print(std_deviation_filtration(df, 30))

           Date    Value  MedianDeviation  StdDeviation
1994 2016-01-22  83.5913          25.9773     31.848076
1997 2016-01-27  81.8394          24.2254     30.096176
3495 2022-02-25  86.9288          29.3148     35.185576
3496 2022-02-26  83.5485          25.9345     31.805276
3497 2022-03-01  93.5589          35.9449     41.815676
...         ...      ...              ...           ...
3932 2023-11-28  88.7045          31.0905     36.961276
3933 2023-11-29  88.6102          30.9962     36.866976
3934 2023-11-30  88.8841          31.2701     37.140876
3935 2023-12-01  88.5819          30.9679     36.838676
3936 2023-12-02  89.7619          32.1479     38.018676

[160 rows x 4 columns]
