# Data Wrangling


---

### Deduplikacja

In [None]:
import pandas as pd
# Toy frame for the deduplication examples: rows 0 and 1 share the same
# (A, C) pair, the other two rows are unique on those columns.
df = pd.DataFrame(
    {
        "A": ["foo", "foo", "foo", "bar"],
        "B": [0, 1, 1, 1],
        "C": ["A", "A", "B", "A"],
    }
)
df

In [None]:
# Rows count as duplicates when both A and C match; keep the first row of
# each such group and drop the later ones. The result is only displayed.
df.drop_duplicates(subset=['A', 'C'], keep="first")

In [None]:
# Same (A, C) duplicate rule, but keep the last row of each group instead
# of the first. The result is only displayed.
df.drop_duplicates(subset=['A', 'C'], keep="last")

In [None]:
# keep=False drops every row whose (A, C) pair occurs more than once,
# leaving only rows that are unique on those columns.
df.drop_duplicates(subset=['A', 'C'], keep=False)

In [None]:
# As above, then renumber the surviving rows from 0; drop=True discards the
# old index instead of keeping it as a column.
df.drop_duplicates(subset=['A', 'C'], keep=False).reset_index(drop=True)

---
# Przykład Data wrangling

In [None]:
import pandas as pd

# Location of the input CSV (a relative file path, despite the name `url`).
url = 'data/blood.csv'

# Load the comma-separated file into a DataFrame and display it.
df_blood = pd.read_csv(url, sep = ',')
df_blood

In [None]:
# Copy the column labels into a plain, mutable Python list.
columns = df_blood.columns.tolist()
columns

In [None]:
# Rename the first column to 'Date': edit the list of labels built in the
# previous cell and assign the whole list back to the frame.
columns[0] = 'Date'
df_blood.columns = columns

df_blood

In [None]:
# Show the dtype pandas inferred for every column while reading the CSV.
df_blood.dtypes

### Datetime columns

In [None]:
# Preview only: parse the Date column with an explicit year-month-day
# format. The result is displayed; df_blood itself is not modified here.
pd.to_datetime(df_blood.Date, format='%Y-%m-%d')

---
### How to write the `format` string

The format codes (`%Y`, `%m`, `%d`, …) are listed in the Python documentation:
https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

---

In [None]:
# Preview only: the same conversion without `format`, letting pandas infer
# the layout of the date strings. df_blood is not modified here.
pd.to_datetime(df_blood.Date)

In [None]:
# Replace the Date column with real datetimes, parsed with an explicit
# year-month-day format.
df_blood['Date'] = pd.to_datetime(df_blood['Date'], format='%Y-%m-%d')

df_blood

In [None]:
# Build an unbroken daily calendar spanning the first to the last date
# present in the data.
first_day = df_blood['Date'].min()
last_day = df_blood['Date'].max()
proper_dates = pd.date_range(start=first_day, end=last_day, freq='D')
proper_dates

In [None]:
# Preview the calendar as a one-column DataFrame. index=False gives a plain
# 0..n-1 index and name= labels the column directly, instead of relying on
# the implicit column label 0 and renaming it afterwards.
proper_dates.to_frame(index=False, name="Date")

In [None]:
# One row per calendar day, in a column named "Date" with a plain 0..n-1
# index (no dependence on the implicit column label 0).
all_days = proper_dates.to_frame(index=False, name="Date")

# Left join: every calendar day is kept; days that have no row in df_blood
# get missing values in the measurement columns.
df_blood = all_days.merge(df_blood, on='Date', how="left")
df_blood

In [None]:
# Re-check the column dtypes after the date conversion and the merge.
df_blood.dtypes

In [None]:
# Preview only: cast 'Morning Sys' to pandas' nullable integer dtype
# 'Int64' (capital I), which can hold missing values. Not assigned back.
df_blood['Morning Sys'].astype('Int64')

In [None]:
# Day of the month for every date, via the .dt datetime accessor.
df_blood.Date.dt.day

In [None]:
# Day of the week as a number: Monday is 0, Sunday is 6.
df_blood.Date.dt.weekday

### Categorical variables

In [None]:
# Preview only: the weekday numbers converted to a categorical dtype.
df_blood.Date.dt.weekday.astype('category')  

In [None]:
# Store the weekday number (Monday=0 ... Sunday=6) of every date as a new
# categorical column named 'Day'.
weekday_numbers = df_blood['Date'].dt.weekday
df_blood['Day'] = weekday_numbers.astype('category')

df_blood

In [None]:
# Weekday names in the order of pandas' weekday numbers (Monday=0).
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Rename through an explicit number -> name mapping. A plain list must have
# exactly as many entries as there are categories, so it raises when some
# weekday never occurs in the data; a mapping renames only the categories
# that are present and leaves anything else untouched.
df_blood['Day'] = df_blood['Day'].cat.rename_categories(dict(enumerate(days)))

df_blood

In [None]:
# Work on a frame indexed by Date. set_index removes the column from the
# data by default (drop=True), so the flag is not spelled out.
df = df_blood.set_index('Date')
df

In [None]:
# Preview only: fill the gaps in 'Morning Sys' with the column mean.
# Nothing is assigned back to df.
morning_sys = df["Morning Sys"]
morning_sys.fillna(morning_sys.mean())

In [None]:
# Forward-fill: each missing value takes the last valid value above it.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
# Gaps before the first valid value of a column stay missing.
df = df.ffill()

df

In [None]:
import warnings

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters with matplotlib for the date axis.
register_matplotlib_converters()

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices; consider narrowing it to a category.
warnings.filterwarnings('ignore')

# Move Date from the index back into a regular column. Guarded so that
# re-running the cell does not add a spurious 'index' column.
if 'Date' not in df.columns:
    df = df.reset_index()

plt.style.use("dark_background")
plt.figure(figsize=(24,12))

# One line per raw series. Assigning the result keeps the notebook from
# echoing the list of Line2D objects.
for column, color in (('Morning Dia', 'orange'), ('Morning Sys', 'green')):
    chart = plt.plot(df['Date'], df[column], color=color)

## Średnie kroczące

In [None]:
# Add a 5-row rolling mean next to each raw series. The first four rows of
# every rolling column are NaN because the window is not yet full.
window = 5
for source in ("Morning Dia", "Morning Sys"):
    df[f"rolling {source}"] = df[source].rolling(window).mean()

df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters with matplotlib for the date axis.
register_matplotlib_converters()

# Apply the style before creating the figure so the figure itself is built
# with the dark settings (same order as the first plot cell).
plt.style.use("dark_background")
plt.figure(figsize=(24,12))

# Raw readings are dashed; the rolling means use the default line style.
# x and y are passed as Series, so no `data=` argument is needed.
line_specs = (
    ('Morning Dia', {'color': 'orange', 'linestyle': '--'}),
    ('Morning Sys', {'color': 'green', 'linestyle': '--'}),
    ('rolling Morning Dia', {'color': 'yellow'}),
    ('rolling Morning Sys', {'color': 'cyan'}),
)
# Assigning the result keeps the notebook from echoing the Line2D list.
for column, line_kwargs in line_specs:
    chart = plt.plot(df['Date'], df[column], **line_kwargs)



In [None]:
# Date is a regular column at this point and the index is just a 0..n-1
# counter, so leave the index out; otherwise the CSV gets an extra unnamed
# first column.
df.to_csv('data/fixed_blood.csv', index=False)