## Key Objective:

#### To demonstrate how one can use Python (Jupyter notebooks) to combine multiple files (e.g. Excel workbooks, in this case) for easier data manipulation, cleaning and analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Shell escape: list the contents of the project folder
!ls ../Combining_Multiple_Files

[34mdata[m[m         script.ipynb


In [3]:
# Shell escape: list the workbooks available in the data folder
!ls ../Combining_Multiple_Files/data

customer-status.xlsx sales-jan-2014.xlsx
sales-feb-2014.xlsx  sales-mar-2014.xlsx


In [4]:
import glob

In [8]:
# Preview which workbooks match the sales*.xlsx pattern.
# glob does not sort its results, so the order shown depends on the file system.
glob.glob("../Combining_Multiple_Files/data/sales*.xlsx")

['../Combining_Multiple_Files/data/sales-feb-2014.xlsx',
 '../Combining_Multiple_Files/data/sales-mar-2014.xlsx',
 '../Combining_Multiple_Files/data/sales-jan-2014.xlsx']

#### Build a single DataFrame by appending all the individual sales files

In [6]:
# Read every monthly sales workbook and stack them into one frame.
#
# The frames are collected in a list and concatenated once:
# ``DataFrame.append`` was removed in pandas 2.0, and calling it inside a
# loop re-copied the accumulated data on every iteration.
# The matches are sorted because glob returns them in arbitrary order, which
# would otherwise make the row order differ between machines and runs.
frames = []
for file in sorted(glob.glob("../Combining_Multiple_Files/data/sales*.xlsx")):
    df = pd.read_excel(file)
    frames.append(df)

# ``pd.concat`` raises on an empty list, so keep the previous behaviour of
# ending up with an empty DataFrame when no workbook matches the pattern.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [7]:
# Summary statistics for the combined sales data.
# NOTE: this raises "Cannot describe a DataFrame without columns" when
# all_data is empty, i.e. when no sales workbook was loaded in the cell above.
all_data.describe()

ValueError: Cannot describe a DataFrame without columns

In [None]:
# Peek at the first rows of the combined data
all_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the combined data
all_data.info()

#### Convert date column from object to datetime 

In [None]:
# Parse the raw 'date' values into pandas datetimes, then store them back
# in the same column.
parsed_dates = pd.to_datetime(all_data["date"])
all_data["date"] = parsed_dates

In [None]:
# Load the customer status lookup workbook and display it.
status_path = "../Combining_Multiple_Files/data/customer-status.xlsx"
status = pd.read_excel(status_path)
status

#### Merge all_data and status

In [None]:
# Left join: keep every sales row and attach the matching status record.
# No `on=` key is passed, so pandas joins on all column names that the two
# frames have in common.
all_data_status = all_data.merge(status, how="left")
all_data_status.head()

#### Doing Sanity Checks - investigating specific accounts

In [None]:
# Select the rows that belong to account 737550 and show the first few
account_mask = all_data_status["account number"] == 737550
all_data_status[account_mask].head()

##### This account number was not in our status file, hence the NaNs. For this specific case, let's label all missing accounts as bronze, using `fillna`

In [None]:
# Label every account that has no entry in the status file as 'bronze'.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form
# operates on an intermediate object (chained assignment), which pandas warns
# about and which leaves the frame unchanged under copy-on-write.
all_data_status['status'] = all_data_status['status'].fillna('bronze')
all_data_status.head()

In [None]:
# Re-run the account check to confirm the missing status values were filled
account_mask = all_data_status["account number"] == 737550
all_data_status[account_mask].head()

In [None]:
# Show the installed pandas version (useful when reproducing this notebook)
pd.__version__

#### Change 'status' to category data type

In [None]:
# Convert the 'status' labels to the pandas categorical dtype and store the
# converted column back on the frame.
status_as_category = all_data_status["status"].astype("category")
all_data_status["status"] = status_as_category

In [None]:
# Display the first rows of the merged frame
all_data_status.head()

In [None]:
# Check the dtype of every column ('status' should now be category)
all_data_status.dtypes

#### Sorting the dataframe by status column

In [None]:
# Order the rows by the 'status' column and show the first few
all_data_status.sort_values(by="status").head()

#### The above operation sorted in alphabetical order
#### Let's change it so that it sorts by our own custom order (the Olympic ordering: gold, silver, bronze)

In [None]:
# Re-declare the categories in the order gold > silver > bronze so that
# sorting by 'status' follows this order instead of alphabetical order.
# The result is assigned back because the `inplace=` argument of
# `cat.set_categories` was removed in pandas 2.0.
all_data_status["status"] = all_data_status["status"].cat.set_categories(
    ["gold", "silver", "bronze"]
)

In [None]:
# Sort by 'status' again; categorical columns sort by their category order
all_data_status.sort_values(by="status").head()

#### Get some summary info on the status

In [None]:
# For a categorical column, describe() reports count, unique, top and freq
all_data_status["status"].describe()

#### Some more useful info about the data

In [None]:

# Sum, mean and standard deviation of the sales columns for each status tier.
# - The columns are selected with a list: selecting several GroupBy columns
#   with a bare tuple (["a", "b", "c"] without the inner brackets) is no
#   longer supported by pandas.
# - Aggregations are given by name: passing np.sum / np.mean / np.std emits a
#   FutureWarning in recent pandas, and the names give the same column labels.
# - observed=False keeps every declared status category in the result, which
#   is the historical default for categorical group keys.
all_data_status.groupby(["status"], observed=False)[["quantity", "unit price", "ext price"]].agg(["sum", "mean", "std"])