#### **Tip: Create time series data for testing**

In [2]:
import pandas as pd

In [1]:

num_rows = 365 * 24 #8760

df = pd.util.testing.makeTimeDataFrame(num_rows, freq='D') 
# H for hours

df.head()

Unnamed: 0,A,B,C,D
2000-01-01,1.586495,0.33272,-0.803394,-1.611213
2000-01-02,-0.291597,0.726422,0.090076,-0.892616
2000-01-03,0.352464,0.538417,0.069968,1.151359
2000-01-04,0.051614,1.316925,0.307857,-0.165411
2000-01-05,-0.722768,0.275185,0.64705,-0.5824


### **Tip: named aggregation - groupby**
https://twitter.com/justmarkham/status/1164167735275921408

In [3]:
df = pd.read_csv('http://bit.ly/kaggletrain')

df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [7]:
# Old approach
df_agg = df.groupby('Pclass').agg({'Age':['mean','max'], 'Survived':'mean'})

In [10]:
df_agg.columns = ['avg_age','max_age','survival_rate']
df_agg.head()

Unnamed: 0_level_0,avg_age,max_age,survival_rate
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,38.233441,80.0,0.62963
2,29.87763,70.0,0.472826
3,25.14062,74.0,0.242363


In [6]:
# tip : Named aggregation
df.groupby('Pclass').agg( avg_age=('Age','mean'), max_age=('Age','max'), survival_rate=('Survived','mean'))

Unnamed: 0_level_0,avg_age,max_age,survival_rate
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,38.233441,80.0,0.62963
2,29.87763,70.0,0.472826
3,25.14062,74.0,0.242363


### **Tip: Explode method in pandas**
https://twitter.com/justmarkham/status/1161001631942684672

In [18]:
# Tip 1:
data_dict = {
    'sandwich': ['PB&J','jam', 'hut'],
    'ingredients':[
        ['peanut butter','jelly'],
        ['bacon','lettuce','tomato'],
        ['swiss cheese']]
}
df = pd.DataFrame(data_dict, index=['a','b','c'])
df.head()

Unnamed: 0,sandwich,ingredients
a,PB&J,"[peanut butter, jelly]"
b,jam,"[bacon, lettuce, tomato]"
c,hut,[swiss cheese]


In [19]:
df.explode('ingredients')

Unnamed: 0,sandwich,ingredients
a,PB&J,peanut butter
a,PB&J,jelly
b,jam,bacon
b,jam,lettuce
b,jam,tomato
c,hut,swiss cheese


In [40]:
# Tip 2

data_dict = {
    'sandwich': ['PB&J','jam', 'hut'],
    'ingredients':['peanut butter,jelly', 'bacon,lettuce', 'tomato,swiss cheese']
}
df = pd.DataFrame(data_dict, index=['a','b','c'])
df

Unnamed: 0,sandwich,ingredients
a,PB&J,"peanut butter,jelly"
b,jam,"bacon,lettuce"
c,hut,"tomato,swiss cheese"


In [41]:
df = df.assign(ingredients=df.ingredients.str.split(','))
df = df.explode('ingredients')
df

Unnamed: 0,sandwich,ingredients
a,PB&J,peanut butter
a,PB&J,jelly
b,jam,bacon
b,jam,lettuce
c,hut,tomato
c,hut,swiss cheese


### **Tip:Need to create a DataFrame for testing?**
https://twitter.com/justmarkham/status/1148940650492170241

In [6]:
import pandas as pd

random_df = pd.util.testing.makeDataFrame()
print(random_df.head())

missing_df = pd.util.testing.makeMissingDataframe()
print(missing_df.head())

timeframe_df = pd.util.testing.makeTimeDataFrame()
print(timeframe_df.head())

mixed_df = pd.util.testing.makeMissingDataframe()
print(mixed_df.head())

                   A         B         C         D
fQUcEoSk6b  0.830116 -0.722361 -1.340578 -0.373693
puM3JjStoR  0.251520 -0.896158 -1.256201  0.365754
QgWpd5tjAb -0.392964 -0.051787  1.206860  0.483436
nTJLnLi8hb  0.581854 -0.452810  0.959416 -1.014599
QymbvQhpeA -0.060198 -0.511830 -0.570265 -0.315263
                   A         B         C         D
dyWtnF9Yzg -0.393404 -0.613947  0.084021       NaN
kXMYae95tG       NaN -0.940373       NaN -0.265041
BKWW6Xz7xe -1.043502 -0.215892  0.425937 -0.501606
RvxoCAeLim -0.893657 -0.527121 -0.433215 -0.920998
Dw5FnWsUcu       NaN -0.544391  0.696066 -1.114050
                   A         B         C         D
2000-01-03  1.115528  1.807331  0.557472  0.428877
2000-01-04  0.092019 -0.133650 -1.212409 -1.505503
2000-01-05 -1.770422  0.730711  0.412049  1.702897
2000-01-06  0.981157 -0.978873  0.362647  0.855069
2000-01-07  1.527640  0.155008 -0.715592  1.257967
                   A         B         C         D
NgeD6v5IUL  0.807675 -0.739810 

### **Tip: Need to convert a column from continuous to categorical?**
https://twitter.com/justmarkham/status/1146040449678925824

In [1]:
import pandas as pd

df = pd.read_csv('http://bit.ly/kaggletrain')

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
df_cat = pd.cut(df.Survived, bins=[0, 25, 50, 75, 100], labels=['A','B','C','D'])

In [10]:
df_cat

0      NaN
1        A
2        A
3        A
4      NaN
      ... 
886    NaN
887      A
888    NaN
889      A
890    NaN
Name: Survived, Length: 891, dtype: category
Categories (4, object): [A < B < C < D]

### **Tip:**