In [6]:
import pandas as pd

# load the pew table from disk (one row per religion, one column per income bracket)
pew = pd.read_csv(filepath_or_buffer='pew.csv')

In [7]:
# preview every row, but only the first five columns
print(pew.iloc[:, :5])

                   religion  <$10k  $10-20k  $20-30k  $30-40k
0                  Agnostic     27       34       60       81
1                   Atheist     12       27       37       52
2                  Buddhist     27       21       30       34
3                  Catholic    418      617      732      670
4        Don’t know/refused     15       14       15       11
5          Evangelical Prot    575      869     1064      982
6                     Hindu      1        9        7        9
7   Historically Black Prot    228      244      236      238
8         Jehovah's Witness     20       27       24       24
9                    Jewish     19       19       25       25
10            Mainline Prot    289      495      619      655
11                   Mormon     29       40       48       51
12                   Muslim      6        7        9       10
13                 Orthodox     13       17       23       32
14          Other Christian      9        7       11       13
15      

# Pandas DataFrames have a method called .melt() that reshapes the dataframe into a tidy format. It takes a few parameters:

 - id_vars is a container (list, tuple, ndarray) naming the columns that will remain as is (the identifier variables).
 - value_vars identifies the columns you want to melt down (or unpivot). By default, it will melt all the columns not specified in the id_vars parameter.
 - var_name is a string giving the name of the new column that holds the melted column names from value_vars. By default, it will be called variable.
 - value_name is a string giving the name of the new column that holds the values taken from the melted columns. By default, it will be called value.

In [8]:
# value_vars is left out on purpose: by default melt() unpivots every
# column that is not listed in id_vars, i.e. everything except 'religion'
pew_long = pew.melt(
    id_vars='religion',
)
print(pew_long)

                  religion            variable  value
0                 Agnostic               <$10k     27
1                  Atheist               <$10k     12
2                 Buddhist               <$10k     27
3                 Catholic               <$10k    418
4       Don’t know/refused               <$10k     15
..                     ...                 ...    ...
175               Orthodox  Don't know/refused     73
176        Other Christian  Don't know/refused     18
177           Other Faiths  Don't know/refused     71
178  Other World Religions  Don't know/refused      8
179           Unaffiliated  Don't know/refused    597

[180 rows x 3 columns]


In [9]:
# form 1: melt as a DataFrame method
pew_long = pew.melt(id_vars='religion')

# form 2: the same reshape through the top-level pandas function,
# which takes the dataframe as its first argument
pew_long = pd.melt(frame=pew, id_vars='religion')

In [11]:
# name the two melted columns explicitly instead of accepting the
# default 'variable' / 'value' labels
pew_long = pew.melt(
    id_vars="religion",
    var_name="income",
    value_name="count",
)
pew_long

# The original pew table, with one column per income bracket, is also known as “wide” data. To turn it into the “long” tidy data format shown here, we have to unpivot/melt/gather (the term depends on which statistical programming language we use) our dataframe.

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [12]:
# same reshape as the previous cell, this time shown as plain text via print()
pew_long = pew.melt(
    id_vars="religion",
    var_name="income",
    value_name="count",
)
print(pew_long)

                  religion              income  count
0                 Agnostic               <$10k     27
1                  Atheist               <$10k     12
2                 Buddhist               <$10k     27
3                 Catholic               <$10k    418
4       Don’t know/refused               <$10k     15
..                     ...                 ...    ...
175               Orthodox  Don't know/refused     73
176        Other Christian  Don't know/refused     18
177           Other Faiths  Don't know/refused     71
178  Other World Religions  Don't know/refused      8
179           Unaffiliated  Don't know/refused    597

[180 rows x 3 columns]


In [14]:
billboard = pd.read_csv(filepath_or_buffer='billboard.csv')

# preview the first 5 rows and the first 16 columns
print(billboard.iloc[:5, :16])

   year        artist                    track  time date.entered  wk1   wk2  \
0  2000         2 Pac  Baby Don't Cry (Keep...  4:22   2000-02-26   87  82.0   
1  2000       2Ge+her  The Hardest Part Of ...  3:15   2000-09-02   91  87.0   
2  2000  3 Doors Down               Kryptonite  3:53   2000-04-08   81  70.0   
3  2000  3 Doors Down                    Loser  4:24   2000-10-21   76  76.0   
4  2000      504 Boyz            Wobble Wobble  3:35   2000-04-15   57  34.0   

    wk3   wk4   wk5   wk6   wk7   wk8   wk9  wk10  wk11  
0  72.0  77.0  87.0  94.0  99.0   NaN   NaN   NaN   NaN  
1  92.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2  68.0  67.0  66.0  57.0  54.0  53.0  51.0  51.0  51.0  
3  72.0  69.0  67.0  65.0  55.0  59.0  62.0  61.0  61.0  
4  25.0  17.0  17.0  31.0  36.0  49.0  53.0  57.0  64.0  


# Keep Multiple Columns Fixed

In [16]:

# id_vars accepts a list when several columns must stay fixed; every
# column not listed here is melted into a week / rating pair
billboard_long = billboard.melt(
    id_vars=[
        "year",
        "artist",
        "track",
        "time",
        "date.entered",
    ],
    var_name="week",
    value_name="rating",
)

billboard_long

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0
...,...,...,...,...,...,...,...
24087,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,wk76,
24088,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,wk76,
24089,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,wk76,
24090,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,wk76,


# Columns Contain Multiple Variables

In [17]:
ebola = pd.read_csv(filepath_or_buffer='country_timeseries.csv')

# list the column labels to see how the names are structured
print(ebola.columns)


Index(['Date', 'Day', 'Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone',
       'Cases_Nigeria', 'Cases_Senegal', 'Cases_UnitedStates', 'Cases_Spain',
       'Cases_Mali', 'Deaths_Guinea', 'Deaths_Liberia', 'Deaths_SierraLeone',
       'Deaths_Nigeria', 'Deaths_Senegal', 'Deaths_UnitedStates',
       'Deaths_Spain', 'Deaths_Mali'],
      dtype='object')


In [18]:
# first five rows of the columns at positions 0, 1, 2 and 10
# (Date, Day, Cases_Guinea and Deaths_Guinea)
print(ebola.iloc[:5, [0, 1, 2, 10]])

         Date  Day  Cases_Guinea  Deaths_Guinea
0    1/5/2015  289        2776.0         1786.0
1    1/4/2015  288        2775.0         1781.0
2    1/3/2015  287        2769.0         1767.0
3    1/2/2015  286           NaN            NaN
4  12/31/2014  284        2730.0         1739.0


In [19]:
# keep Date and Day fixed; every other column is melted into
# the default 'variable' / 'value' pair
ebola_long = ebola.melt(
    id_vars=['Date', 'Day'],
)
print(ebola_long)

            Date  Day      variable   value
0       1/5/2015  289  Cases_Guinea  2776.0
1       1/4/2015  288  Cases_Guinea  2775.0
2       1/3/2015  287  Cases_Guinea  2769.0
3       1/2/2015  286  Cases_Guinea     NaN
4     12/31/2014  284  Cases_Guinea  2730.0
...          ...  ...           ...     ...
1947   3/27/2014    5   Deaths_Mali     NaN
1948   3/26/2014    4   Deaths_Mali     NaN
1949   3/25/2014    3   Deaths_Mali     NaN
1950   3/24/2014    2   Deaths_Mali     NaN
1951   3/22/2014    0   Deaths_Mali     NaN

[1952 rows x 4 columns]


# Split and Add Columns Individually

In [21]:
# take the 'variable' column, go through its .str accessor, and split
# each label on the underscore delimiter; each element becomes a list
variable_split = ebola_long['variable'].str.split('_')
print(variable_split.head(5))

0    [Cases, Guinea]
1    [Cases, Guinea]
2    [Cases, Guinea]
3    [Cases, Guinea]
4    [Cases, Guinea]
Name: variable, dtype: object


In [22]:
# type of the split result as a whole (the container holding all the pieces)
print(type(variable_split))

<class 'pandas.core.series.Series'>


In [23]:
# type of a single entry: look at the first element of the container
print(type(variable_split.iloc[0]))

<class 'list'>


In [24]:
# index into every list through the .str accessor:
# position 0 is the status, position 1 is the country
status_values = variable_split.str[0]
country_values = variable_split.str[1]
print(status_values)

0        Cases
1        Cases
2        Cases
3        Cases
4        Cases
         ...  
1947    Deaths
1948    Deaths
1949    Deaths
1950    Deaths
1951    Deaths
Name: variable, Length: 1952, dtype: object


In [25]:
# append the two extracted series as new 'status' and 'country' columns
ebola_long = ebola_long.assign(
    status=status_values,
    country=country_values,
)
print(ebola_long)

            Date  Day      variable   value  status country
0       1/5/2015  289  Cases_Guinea  2776.0   Cases  Guinea
1       1/4/2015  288  Cases_Guinea  2775.0   Cases  Guinea
2       1/3/2015  287  Cases_Guinea  2769.0   Cases  Guinea
3       1/2/2015  286  Cases_Guinea     NaN   Cases  Guinea
4     12/31/2014  284  Cases_Guinea  2730.0   Cases  Guinea
...          ...  ...           ...     ...     ...     ...
1947   3/27/2014    5   Deaths_Mali     NaN  Deaths    Mali
1948   3/26/2014    4   Deaths_Mali     NaN  Deaths    Mali
1949   3/25/2014    3   Deaths_Mali     NaN  Deaths    Mali
1950   3/24/2014    2   Deaths_Mali     NaN  Deaths    Mali
1951   3/22/2014    0   Deaths_Mali     NaN  Deaths    Mali

[1952 rows x 6 columns]


# Split and Combine in a Single Step

In [26]:
# start again from a fresh melt so ebola_long only has the
# Date / Day / variable / value columns
ebola_long = ebola.melt(id_vars=['Date', 'Day'])

# split each label on the first underscore only (n=1) and expand the pieces
# into a dataframe with integer column labels; capping the split means the
# result never has more than two columns, even if a label ever contained a
# second underscore, so it always fits a two-column assignment
variable_split = ebola_long['variable'].str.split('_', n=1, expand=True)
print(variable_split)

           0       1
0      Cases  Guinea
1      Cases  Guinea
2      Cases  Guinea
3      Cases  Guinea
4      Cases  Guinea
...      ...     ...
1947  Deaths    Mali
1948  Deaths    Mali
1949  Deaths    Mali
1950  Deaths    Mali
1951  Deaths    Mali

[1952 rows x 2 columns]


In [27]:
# assign both columns of the expanded split in a single step:
# the first split column goes to 'status', the second to 'country'
ebola_long[['status', 'country']] = variable_split
print(ebola_long)

            Date  Day      variable   value  status country
0       1/5/2015  289  Cases_Guinea  2776.0   Cases  Guinea
1       1/4/2015  288  Cases_Guinea  2775.0   Cases  Guinea
2       1/3/2015  287  Cases_Guinea  2769.0   Cases  Guinea
3       1/2/2015  286  Cases_Guinea     NaN   Cases  Guinea
4     12/31/2014  284  Cases_Guinea  2730.0   Cases  Guinea
...          ...  ...           ...     ...     ...     ...
1947   3/27/2014    5   Deaths_Mali     NaN  Deaths    Mali
1948   3/26/2014    4   Deaths_Mali     NaN  Deaths    Mali
1949   3/25/2014    3   Deaths_Mali     NaN  Deaths    Mali
1950   3/24/2014    2   Deaths_Mali     NaN  Deaths    Mali
1951   3/22/2014    0   Deaths_Mali     NaN  Deaths    Mali

[1952 rows x 6 columns]
