In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [13]:
# pd.melt is similar to tidyr's gather function

In [14]:
name = ['daniel','john','jane']
trA = [np.NaN, 12, 24]
trB = [42, 31, 27]

In [15]:
df = pd.DataFrame({'name': name, 'treatment A': trA, 'treatment B': trB})
print(df)

     name  treatment A  treatment B
0  daniel          NaN           42
1    john         12.0           31
2    jane         24.0           27


In [16]:
# we call pd.melt
melted = pd.melt(frame=df, id_vars='name', value_vars=['treatment A', 'treatment B'], var_name='treatment', value_name='result')
print(melted)

     name    treatment  result
0  daniel  treatment A     NaN
1    john  treatment A    12.0
2    jane  treatment A    24.0
3  daniel  treatment B    42.0
4    john  treatment B    31.0
5    jane  treatment B    27.0


In [17]:
# the df.pivot() method is similar to tidyr's spread function

In [18]:
melted.pivot(index='name', columns = 'treatment', values = 'result')   # note we call pivot on the data frame itself

treatment,treatment A,treatment B
name,Unnamed: 1_level_1,Unnamed: 2_level_1
daniel,,42.0
jane,24.0,27.0
john,12.0,31.0


In [19]:
# creating a new variable

In [20]:
print(melted)

     name    treatment  result
0  daniel  treatment A     NaN
1    john  treatment A    12.0
2    jane  treatment A    24.0
3  daniel  treatment B    42.0
4    john  treatment B    31.0
5    jane  treatment B    27.0


In [21]:
# you can create a new variable by defining a new column
melted['gender'] = ['m','m','f','m','m','f']

In [22]:
print(melted)

     name    treatment  result gender
0  daniel  treatment A     NaN      m
1    john  treatment A    12.0      m
2    jane  treatment A    24.0      f
3  daniel  treatment B    42.0      m
4    john  treatment B    31.0      m
5    jane  treatment B    27.0      f


In [23]:
melted['trt'] = melted.treatment.str[-1]  # The last letter from the treatment column, as an abreviation for the treatment
print(melted)

     name    treatment  result gender trt
0  daniel  treatment A     NaN      m   A
1    john  treatment A    12.0      m   A
2    jane  treatment A    24.0      f   A
3  daniel  treatment B    42.0      m   B
4    john  treatment B    31.0      m   B
5    jane  treatment B    27.0      f   B


In [24]:
# pd.concat to concatenate tables:

name2 = ['amy','betty','carl']
trA2 = [20, 18, 10]
trB2 = [30, 38, 28]
df2 = pd.DataFrame({'name': name2, 'treatment A': trA2, 'treatment B': trB2})
print(df2)

    name  treatment A  treatment B
0    amy           20           30
1  betty           18           38
2   carl           10           28


In [25]:
print(df)

     name  treatment A  treatment B
0  daniel          NaN           42
1    john         12.0           31
2    jane         24.0           27


In [36]:
print(type(pd.concat([df, df2])))
pd.concat([df, df2])  # call pd.concat. provide it a *list* of data frames, not the dataframe directly

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name,treatment A,treatment B
0,daniel,,42
1,john,12.0,31
2,jane,24.0,27
0,amy,20.0,30
1,betty,18.0,38
2,carl,10.0,28


In [27]:
# when we use pd. concat, the original indexes are kept.
concatenated = pd.concat([df, df2]) 
concatenated.loc[0,]  # returns two rows

Unnamed: 0,name,treatment A,treatment B
0,daniel,,42
0,amy,20.0,30


In [28]:
print(concatenated.index)

Int64Index([0, 1, 2, 0, 1, 2], dtype='int64')


In [29]:
# you can reset the index during the concatenation process with ignore_index = True
concat2 = pd.concat([df, df2], ignore_index = True)
print(concat2)

     name  treatment A  treatment B
0  daniel          NaN           42
1    john         12.0           31
2    jane         24.0           27
3     amy         20.0           30
4   betty         18.0           38
5    carl         10.0           28


## importing multiple files with pattern matching
Could be useful if you are downloading several files with similar formats from a website (e.g. one for each month) 

In [30]:
import glob  # import this module

In [31]:
filenames = glob.glob('example*.csv')  # use to find filenames in your working directory that fit the pattern
print(filenames)

[]


In [32]:
list_data = []  # create empty list

for file in filenames:
    data = pd.read_csv(file, header = 0)
    list_data.append(data)

In [33]:
print(list_data)  # this is a list of data frames

[]


In [34]:
print(list_data[0])
print(type(list_data[0]))

IndexError: list index out of range

In [35]:
pd.concat(list_data, ignore_index = True)  # to make a single dataframe, we use pd.concat over the list of dataframes

ValueError: No objects to concatenate