In [1]:
# to scrape data from wikipedia, we need install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets)

In [2]:
# scrape table data from websites

rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets")
# read_html leder efter noget der ligner en tabel af data, og indlæser det 
df = rawdata[1]
df

Unnamed: 0,Character,Actor/Muppet performer,Description,Unnamed: 3
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...,
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn...",
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br...",
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri...",
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...,Writer Christopher Finch called Anything Muppe...
...,...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i...",
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ...",
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally...",
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...,


In [3]:
list(df)

['Character', 'Actor/Muppet performer', 'Description', 'Unnamed: 3']

## Removing an unwanted column
Below are several ways to get rid of the final column (there are more ways to do this!) If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on. Probably the easiest method is to use `pop` (method 3)

In [4]:

# method 1
df = df.drop(['Unnamed: 3'], axis = 1)
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [5]:

# method 2
df = df.loc[:,list(df)[0:3]] 
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [6]:
# method 3
# df.pop('Unnamed: 3')
# df

In [7]:
# take a column from a dataframe and assign it to a list variable
a_list = list(df['Character'])
a_list
type(a_list)

list

In [8]:
# find the first four items in the list
a_list[0:4]

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford']

In [9]:
# find the last three items in the list
a_list[-3:]

['The Two-Headed Monster', 'Wes', 'Zoe']

In [10]:
# find items in the middle of the list
a_list[7:11]

['Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]']

In [11]:
# select the first 20 items from a list
b = a_list[0:20]

In [12]:
# remove the last item in a list
b.pop()

'Buster'

In [13]:
# inpsect the list to make sure the last item was removed
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno']

In [14]:
# remove a specific item from the list
b.pop(7)

'Arlene Frantic'

In [15]:
# stick a Kermit on the end of the list
b.append('Kermit')

In [16]:
# inspect the list to make sure Kermit was added
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [17]:
# insert an item into a list at a particular position
b.insert(5, 'Kermit')
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'Kermit',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [18]:
# replace an item in a list
b[5] = 'Fozzy' #vi skifter plads 5 ud 'kermit, med fozzy
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'Fozzy',
 'AM Monsters',
 'Aristotle',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [19]:
# make a new list which adds "is a cute monster" to each item in the list

# Mindst favorit, besværlig - med to loops
string = 'is a cute monster'
new_list = [] 
c = (x + string for x in a_list)
for muppet in c:
    new_list.append(muppet)
new_list
    

['Abby Cadabbyis a cute monster',
 'Alice Snuffleupagusis a cute monster',
 'Alistair Cookieis a cute monster',
 'The Amazing Mumfordis a cute monster',
 'Anything Muppetsis a cute monster',
 'AM Monstersis a cute monster',
 'Aristotleis a cute monster',
 'Arlene Franticis a cute monster',
 'Baby Bearis a cute monster',
 'Barkleyis a cute monster',
 'Beautiful Day Monster[broken anchor]is a cute monster',
 'Bennett Snerfis a cute monster',
 'Bennyis a cute monster',
 'Bertis a cute monster',
 'Betty Louis a cute monster',
 'Biffis a cute monster',
 'Big Birdis a cute monster',
 'Bip Bippadottais a cute monster',
 'Brunois a cute monster',
 'Busteris a cute monster',
 'Captain Vegetableis a cute monster',
 'Clementineis a cute monster',
 'Colambois a cute monster',
 'Cookie Monsteris a cute monster',
 'Count von Countis a cute monster',
 'Countess Dahling von Dahlingis a cute monster',
 'Curly Bearis a cute monster',
 'Deenais a cute monster',
 'Dexteris a cute monster',
 'Dingersis a c

In [20]:
d = [x + ' is a cute monster' for x in a_list] # Samme som før, men mere komprimeret version

d

['Abby Cadabby is a cute monster',
 'Alice Snuffleupagus is a cute monster',
 'Alistair Cookie is a cute monster',
 'The Amazing Mumford is a cute monster',
 'Anything Muppets is a cute monster',
 'AM Monsters is a cute monster',
 'Aristotle is a cute monster',
 'Arlene Frantic is a cute monster',
 'Baby Bear is a cute monster',
 'Barkley is a cute monster',
 'Beautiful Day Monster[broken anchor] is a cute monster',
 'Bennett Snerf is a cute monster',
 'Benny is a cute monster',
 'Bert is a cute monster',
 'Betty Lou is a cute monster',
 'Biff is a cute monster',
 'Big Bird is a cute monster',
 'Bip Bippadotta is a cute monster',
 'Bruno is a cute monster',
 'Buster is a cute monster',
 'Captain Vegetable is a cute monster',
 'Clementine is a cute monster',
 'Colambo is a cute monster',
 'Cookie Monster is a cute monster',
 'Count von Count is a cute monster',
 'Countess Dahling von Dahling is a cute monster',
 'Curly Bear is a cute monster',
 'Deena is a cute monster',
 'Dexter is a c

In [21]:
another_new_list = list() # Klar favorit 
for item in a_list: 
    another_new_list.append(item + ' is a cute monster')

another_new_list

['Abby Cadabby is a cute monster',
 'Alice Snuffleupagus is a cute monster',
 'Alistair Cookie is a cute monster',
 'The Amazing Mumford is a cute monster',
 'Anything Muppets is a cute monster',
 'AM Monsters is a cute monster',
 'Aristotle is a cute monster',
 'Arlene Frantic is a cute monster',
 'Baby Bear is a cute monster',
 'Barkley is a cute monster',
 'Beautiful Day Monster[broken anchor] is a cute monster',
 'Bennett Snerf is a cute monster',
 'Benny is a cute monster',
 'Bert is a cute monster',
 'Betty Lou is a cute monster',
 'Biff is a cute monster',
 'Big Bird is a cute monster',
 'Bip Bippadotta is a cute monster',
 'Bruno is a cute monster',
 'Buster is a cute monster',
 'Captain Vegetable is a cute monster',
 'Clementine is a cute monster',
 'Colambo is a cute monster',
 'Cookie Monster is a cute monster',
 'Count von Count is a cute monster',
 'Countess Dahling von Dahling is a cute monster',
 'Curly Bear is a cute monster',
 'Deena is a cute monster',
 'Dexter is a c

In [22]:
# reset list b to original first 20 items from list a
b = a_list[0:20]
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Buster']

In [23]:
# make a list of your favorite monsters, 
# and then make a new list which only includes the monsters 
# from list b that are also in your favorites list

fave_monsters = ['Barkley', 'Elmo', 'Big Bird']
new_favemonsters_list = list()
for name in b:
    if name in fave_monsters:
        new_favemonsters_list.append(name)

print(new_favemonsters_list)

# another way to do it 

favs = ['Barkley', 'Elmo', 'Big Bird']
c = [x for x in b if x in favs]
print(c)

['Barkley', 'Big Bird']
['Barkley', 'Big Bird']


In [24]:
# make a new list which includes the monsters from list b that are not in your favorites list
not_faves = list()
for name in b:
    if name not in fave_monsters:
        not_faves.append(name)
print(not_faves)

# smider bare not in i koden (i løkken) ovenfor


['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Bip Bippadotta', 'Bruno', 'Buster']


## More fun with lists

In [25]:
# add 10 to each number in d
d = [3, 5, 8789, 66]
e = [x + 10 for x in d]
print(e)

[13, 15, 8799, 76]


In [26]:
# divide each number in d by 2
f = [x / 2 for x in d]
f

[1.5, 2.5, 4394.5, 33.0]

## Dataframe manipulation

In [27]:
# download the student sleep data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv")
df

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [28]:
# find the number of rows and columns in the dataframe
df.shape

(7, 6)

In [29]:
# make a new dataframe df1 which only includes the first 4 rows of the original dataframe
df1 = df.iloc[0:4]
df1

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [30]:
# make another new dataframe df2 which only includes rows 5 through the end of the original dataframe
df2 = df.iloc[4:df.shape[0]]
df2

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [31]:
# make a third dataframe df3 with df2 on top of df1 (hint: use pd.concat)

df3 = pd.concat([df2, df1])
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [32]:
# overwrite df3 with df1 and df2 back in their original order
df3 = df
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [33]:
# make a new column called "average" which is the mean of the other columns for each row
df3['average'] = df3.mean(axis = 1)
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6,average
0,10,8,4,12,10,6,8.333333
1,7,8,3,14,5,6,7.166667
2,7,7,5,11,8,8,7.666667
3,8,9,6,10,9,5,7.833333
4,2,6,6,12,5,6,6.166667
5,5,7,5,14,6,7,7.333333
6,6,7,6,12,9,7,7.833333


In [34]:
# remove the average column from the dataframe
df3.pop('average')
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [35]:
# make a list of the means of each column in the dataframe
column_means = list(df3.mean())
column_means

[6.428571428571429,
 7.428571428571429,
 5.0,
 12.142857142857142,
 7.428571428571429,
 6.428571428571429]

In [36]:
# make a list of the means of each row in the dataframe
colnames = list(df3)
colnames

['Student 1', 'Student 2', 'Student 3', 'Student 4', 'Student 5', 'Student 6']

In [37]:
list(zip(colnames, column_means))

[('Student 1', 6.428571428571429),
 ('Student 2', 7.428571428571429),
 ('Student 3', 5.0),
 ('Student 4', 12.142857142857142),
 ('Student 5', 7.428571428571429),
 ('Student 6', 6.428571428571429)]

In [38]:
student_means = dict(zip(colnames, column_means))
student_means

{'Student 1': 6.428571428571429,
 'Student 2': 7.428571428571429,
 'Student 3': 5.0,
 'Student 4': 12.142857142857142,
 'Student 5': 7.428571428571429,
 'Student 6': 6.428571428571429}

In [39]:
print('Student 4\'s average:', round(student_means['Student 4'], 3))

Student 4's average: 12.143


In [40]:
# make a dataframe the mean hours of sleep for each student
df_means = pd.DataFrame(zip(colnames, column_means))
df_means.columns = ['Students', 'Sleep Hours']
df_means

Unnamed: 0,Students,Sleep Hours
0,Student 1,6.428571
1,Student 2,7.428571
2,Student 3,5.0
3,Student 4,12.142857
4,Student 5,7.428571
5,Student 6,6.428571


In [41]:
# transpose the dataframe
df_transposed = df3.transpose()
df_transposed

Unnamed: 0,0,1,2,3,4,5,6
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [42]:
colnames = list(df_transposed)
colnames

[0, 1, 2, 3, 4, 5, 6]

In [43]:
# using list comprehension to take each columns name, 
# add 1 to it, make it into a string, put day in front of that, and do it to all 

newcols = ['Day ' + str(x+1) for x in colnames]
newcols

['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5', 'Day 6', 'Day 7']

In [44]:
df_transposed.columns = newcols
df_transposed

Unnamed: 0,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [45]:
df_transposed.index.name = 'student'
df_transposed

Unnamed: 0_level_0,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [46]:
df_transposed.reset_index(inplace = True)
df_transposed

Unnamed: 0,student,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
0,Student 1,10,7,7,8,2,5,6
1,Student 2,8,8,7,9,6,7,7
2,Student 3,4,3,5,6,6,5,6
3,Student 4,12,14,11,10,12,14,12
4,Student 5,10,5,8,9,5,6,9
5,Student 6,6,6,8,5,6,7,7


In [47]:
df_long = pd.melt(df_transposed, id_vars = 'student')
df_long

Unnamed: 0,student,variable,value
0,Student 1,Day 1,10
1,Student 2,Day 1,8
2,Student 3,Day 1,4
3,Student 4,Day 1,12
4,Student 5,Day 1,10
5,Student 6,Day 1,6
6,Student 1,Day 2,7
7,Student 2,Day 2,8
8,Student 3,Day 2,3
9,Student 4,Day 2,14
