# Read a CSV-File into a DataFrame and Describe

## References
Official Pandas documentation on reading CSV-files: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html<br>
Chapter on reading CSV-files in Wes McKinneys book: https://github.com/wesm/pydata-book/blob/2nd-edition/ch06.ipynb (First part)

## Kaggle Rossmann Data


On kaggle.com, you can find a data set provided by the company Rossmann for a challenge.
This data is expected to be available in a folder on the same level as the folder containing these Jupyter notebooks. <p>
The data consists of 3 files
<ul>
<li>store.csv</li>
<li>train.csv</li>
<li>test.csv</li>
</ul>
<br>
From their sizes, you can infer that it might be save to load the data on the Rossmann stores completely.<br>
You might want to be more careful with the other data. 

In [1]:
import pandas as pd

In [2]:
fileStore = "../ZZ_Data/Kaggle_Rossmann/store.csv"
fileTrain = "../ZZ_Data/Kaggle_Rossmann/train.csv"
fileTest = "../ZZ_Data/Kaggle_Rossmann/test.csv"

In [3]:
df = pd.read_csv(fileStore)
df.head(3)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"


## Limit number of data read from file

In [4]:
df = pd.read_csv(fileTrain, nrows = 3)
df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1


In [5]:
colsIwannaUse = ["Store", "Date", "Customers"]
df = pd.read_csv(fileTrain, usecols = colsIwannaUse, nrows = 5)
df

Unnamed: 0,Store,Date,Customers
0,1,2015-07-31,555
1,2,2015-07-31,625
2,3,2015-07-31,821
3,4,2015-07-31,1498
4,5,2015-07-31,559


In [6]:
%%time
colsIwannaUse = ["Store", "Date", "Customers", "Sales"]
### df = pd.read_csv(fileTrain) Took about half a second
df = pd.read_csv(fileTrain, parse_dates=["Date"], dtype={"Store":'str',"Customers":'int64',"Sales":'float'}, usecols = colsIwannaUse) 
df.tail()
df

CPU times: user 253 ms, sys: 101 ms, total: 354 ms
Wall time: 363 ms


Unnamed: 0,Store,Date,Sales,Customers
0,1,2015-07-31,5263.0,555
1,2,2015-07-31,6064.0,625
2,3,2015-07-31,8314.0,821
3,4,2015-07-31,13995.0,1498
4,5,2015-07-31,4822.0,559
...,...,...,...,...
1017204,1111,2013-01-01,0.0,0
1017205,1112,2013-01-01,0.0,0
1017206,1113,2013-01-01,0.0,0
1017207,1114,2013-01-01,0.0,0


## Delimiter comma and header in top row can, but need not be specified

In [8]:
df = pd.read_csv(fileStore, delimiter = ",", header=0)
df.head(3)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"


## Converter

In [26]:
def roundSales(x):
    x = round(float(x),-2)
    x = int(x)
    return x
def dayOfWeek(x):
    if x == "1":
        x = "Mo"
    elif x == "2":
        x = "Th"
    elif x == "3":
        x = "Th"        
    elif x == "4":
        x = "Th"
    elif x == "5":
        x = "Fr"
    elif x == "6":
        x = "Sa"        
    else:
        x = "So"
    return x

colsIwannaUse = ["Store", "Date", "Customers", "Sales", "DayOfWeek"]

df = pd.read_csv(fileTrain, parse_dates=["Date"], converters={"DayOfWeek": dayOfWeek, "Sales": roundSales}, dtype={"Store":'str',"Customers":'int64'}, usecols = colsIwannaUse) 

df[2000:20000:1000]

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers
2000,886,Th,2015-07-30,7700,655
3000,771,Th,2015-07-29,8500,774
4000,656,Th,2015-07-28,4000,411
5000,541,Mo,2015-07-27,10200,1088
6000,426,So,2015-07-26,0,0
7000,311,Sa,2015-07-25,5500,560
8000,196,Fr,2015-07-24,4400,598
9000,81,Th,2015-07-23,6600,512
10000,1081,Th,2015-07-23,5600,956
11000,966,Th,2015-07-22,3600,469


In [18]:
df.describe()

Unnamed: 0,Sales,Customers
count,1017209.0,1017209.0
mean,5773.804,633.1459
std,3850.021,464.4117
min,0.0,0.0
25%,3700.0,405.0
50%,5700.0,609.0
75%,7900.0,837.0
max,41600.0,7388.0
