## Import Libraries

In [1]:
## FinTech Solution Approach/Analyst Process Data Exercise
## First step is going to be to set libraries and dependencies to use.

import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Getting Started

In [2]:
## Next step is going to be pulling in the .csv data shared with us for this exercise.
## Note:  I downloaded these files to my working directory.

features_path = Path("Features_data_set.csv")
sales_path = Path("sales_data_set.csv")
stores_path = Path("stores_data_set.csv")

In [5]:
## I'm going to read_csv's and make my data frames.
## Note:  all the .csv files have a column "Store", so I'm using that as an index.

## First, the features path.
ftrs_df = pd.read_csv(features_path, index_col = "Store")

## Then, the sales path.
sale_df = pd.read_csv(sales_path, index_col = "Store")

## Then, the stores path.
strs_df = pd.read_csv(stores_path, index_col = "Store")

In [6]:
## Let's take a look at what we're working with.
ftrs_df

In [10]:
ftrs_df.isnull().sum()

Date               0
Temperature        0
Fuel_Price         0
MarkDown1       4158
MarkDown2       5269
MarkDown3       4577
MarkDown4       4726
MarkDown5       4140
CPI              585
Unemployment     585
IsHoliday          0
dtype: int64

In [12]:
## Lots of information there. I want to know how much data I'm working with, so I'm going to get the size/shape of the df.

ftrs_df.shape

(8190, 11)

In [13]:
## What percentage of this information is null/incomplete? 
## from my notes:  df.isnull().mean()*100

ftrs_df.isnull().mean()*100

In [15]:
## Looking at the data, it seems like the high-null % columns are numerically significant. I don't want to lose that data, so I want to replace NaN with 0.

ftrs_df.iloc[:,[4,5,6,7,8]] = ftrs_df.iloc[:,[4,5,6,7,8]].fillna(0)
ftrs_df

Unnamed: 0_level_0,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,05/02/2010,42.31,2.572,,0.00,0.00,0.00,0.00,211.096358,8.106,False
1,12/02/2010,38.51,2.548,,0.00,0.00,0.00,0.00,211.242170,8.106,True
1,19/02/2010,39.93,2.514,,0.00,0.00,0.00,0.00,211.289143,8.106,False
1,26/02/2010,46.63,2.561,,0.00,0.00,0.00,0.00,211.319643,8.106,False
1,05/03/2010,46.50,2.625,,0.00,0.00,0.00,0.00,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...
45,28/06/2013,76.05,3.639,4842.29,975.03,3.00,2449.97,3169.69,0.000000,,False
45,05/07/2013,77.50,3.614,9090.48,2268.58,582.74,5797.47,1514.93,0.000000,,False
45,12/07/2013,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,0.000000,,False
45,19/07/2013,82.84,3.737,2961.49,1047.07,204.19,363.00,1059.46,0.000000,,False


In [17]:
## Get summary statistics.
ftrs_df.describe()

## Questions to be answered:
<ul>
    <li>Mean temp.</li>
    <li>Mean fuel price.</li>
    <li>Sort CPI from highest to lowest.</li>
    <li>How do holidays affect CPI? Plot as a bar chart.</li>
    <li>How does unemployment affect CPI?</li>
</ul>

In [7]:
## Let's look at the rest of these files.
sale_df

Unnamed: 0_level_0,Dept,Date,Weekly_Sales,IsHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,05/02/2010,24924.50,False
1,1,12/02/2010,46039.49,True
1,1,19/02/2010,41595.55,False
1,1,26/02/2010,19403.54,False
1,1,05/03/2010,21827.90,False
...,...,...,...,...
45,98,28/09/2012,508.37,False
45,98,05/10/2012,628.10,False
45,98,12/10/2012,1061.02,False
45,98,19/10/2012,760.01,False


In [8]:
strs_df

Unnamed: 0_level_0,Type,Size
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A,151315
2,A,202307
3,B,37392
4,A,205863
5,B,34875
6,A,202505
7,B,70713
8,A,155078
9,B,125833
10,B,126512
