In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
plt.style.use('ggplot')

from bokeh.charts import Line, output_file, output_notebook, show

In [2]:
# Load the three yearly repair extracts with identical reader options;
# 'logged-date' is parsed into datetimes while reading.
repairs_2014, repairs_2015, repairs_2016 = (
    pd.read_csv(csv_path, encoding="utf-8", index_col=False, parse_dates=['logged-date'])
    for csv_path in (
        'data/historical-repairs-2014-merge.csv',
        'data/historical-repairs-2015-merge.csv',
        'data/historical-repairs-2016-mini.csv',
    )
)

In [3]:
def count_repairs_by_date_all_repairs_df(df):
    """Count the repairs logged on each date.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per repair. Must contain 'logged-date' as well as the
        'property-reference', 'repair-number' and 'description-for-code'
        columns, which are discarded before aggregating.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'logged-date', with a 'number of repairs' column
        holding the number of input rows for that date. Any other column still
        present in `df` is aggregated with ``sum()`` as well.

    Raises
    ------
    KeyError
        If 'logged-date' or one of the discarded columns is missing.
    """
    tmp_df = df.drop(['property-reference', 'repair-number', 'description-for-code'], axis=1)
    # Broadcast a scalar instead of assigning pd.Series([1] * n): a Series is
    # aligned on index labels, so a frame without a default 0..n-1 index
    # (filtered, sorted, concatenated) would receive NaN and count as zero.
    tmp_df['number of repairs'] = 1
    counts = tmp_df.groupby(['logged-date'], as_index=False).sum()
    return counts

In [4]:
# One daily-count frame per yearly extract, produced in the same order as the inputs.
counts_2014_df, counts_2015_df, counts_2016_df = map(
    count_repairs_by_date_all_repairs_df,
    (repairs_2014, repairs_2015, repairs_2016),
)

In [5]:
#counts_2015_df.head()

In [6]:
#counts_2016_df.head()

In [7]:
# Stack the yearly daily-count frames (2014, then 2015, then 2016) into one frame.
# Row labels are carried over from each input unchanged.
join_daily_repairs = pd.concat(
    objs=[counts_2014_df, counts_2015_df, counts_2016_df],
    axis=0,
)

In [8]:
# Line chart of the daily repair totals across all three years.
# NOTE(review): `bokeh.charts` was split out of Bokeh into the separate
# `bkcharts` package in later releases - confirm the pinned Bokeh version.
daily_repairs_plot = Line(
    join_daily_repairs,
    x='logged-date',
    y='number of repairs',
    color='blue',
    title='Number of repairs between 2014 and 2016',
    xlabel="Date an issue was reported",
    plot_width=1000,
)

# Send Bokeh output to the notebook, then render the chart.
output_notebook()
show(daily_repairs_plot)

In [9]:
# Month number (1-12) -> English month name.
year_month = dict(enumerate(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    start=1))

In [10]:
def daily_to_monthly_counts(df, month_names=None):
    """Roll daily repair counts up into totals per calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily counts with a 'logged-date' column (datetimes, or anything
        ``pd.to_datetime`` can parse). The remaining columns are summed per
        month. The frame itself is not modified.
    month_names : dict, optional
        Maps month number (1-12) to the label used in the output. Defaults to
        the notebook-level ``year_month`` mapping.

    Returns
    -------
    pandas.DataFrame
        A 'logged month' column with one row per month label, followed by the
        per-month sums. Rows are grouped by label only, so the same month from
        different years is pooled together, and they come back sorted by label
        (alphabetically), not in calendar order.
    """
    if month_names is None:
        month_names = year_month
    # Derive the month from a separate Series and add it to the dropped copy,
    # so the caller's frame keeps its original columns and dtypes.
    month_numbers = pd.to_datetime(df['logged-date']).dt.month
    tmp_df = df.drop(['logged-date'], axis=1)
    # .values sidesteps index alignment; both objects have the same row order.
    tmp_df['logged month'] = month_numbers.map(month_names).values
    monthly_counts = tmp_df.groupby('logged month', as_index=False).sum()
    return monthly_counts

In [11]:
# Monthly totals per year; the count column is renamed to carry its year so the
# three frames can later sit side by side without clashing column names.
monthly_counts_2014_df, monthly_counts_2015_df, monthly_counts_2016_df = (
    daily_to_monthly_counts(daily_counts).rename(columns={'number of repairs': yearly_label})
    for daily_counts, yearly_label in (
        (counts_2014_df, 'number of repairs in 2014'),
        (counts_2015_df, 'number of repairs in 2015'),
        (counts_2016_df, 'number of repairs in 2016'),
    )
)

In [12]:
#monthly_counts_2014_df

In [13]:
# Line the three years up by month. The join key is given once: the original
# listed 'logged month' twice, which only repeated the same key.
# Outer joins keep a month even if one of the years has no row for it.
joined_df = pd.merge(monthly_counts_2014_df, monthly_counts_2015_df,
                     on='logged month', how='outer')

# A month absent from a year comes out of the outer join as NaN; report it as 0.
joined_df2 = pd.merge(joined_df, monthly_counts_2016_df,
                      on='logged month', how='outer').fillna(0)

In [14]:
#joined_df2

In [15]:
# Put the rows in calendar order (January .. December).
# Each row is ranked by its month number, looked up by inverting `year_month`,
# and the frame is reordered by that rank. The previous hard-coded permutation
# [4, 3, 7, 0, ...] was only right when all twelve months were present in
# alphabetical order; with a month missing it scrambled the rows or inserted
# all-NaN ones. Row labels are kept, so the nominal result is unchanged.
monthly_counts = joined_df2.loc[
    joined_df2['logged month']
    .map({name: number for number, name in year_month.items()})
    .sort_values()
    .index
]

In [16]:
#monthly_counts

In [17]:
#monthly_counts.plot(x='logged month', y=['number of repairs in 2014', 'number of repairs in 2015', 'number of repairs in 2016'],
                    #figsize=(16, 8), title='Number of repairs per months')

### Were all the repair events added to the dataset in 2016?