In [1]:
# to scrape data from wikipedia, we need install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

Collecting lxml
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.3.0
Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets)

In [2]:
# scrape table data from websites

rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets")
df = rawdata[1]
df

Unnamed: 0,Character,Actor/Muppet performer,Description,Unnamed: 3
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...,
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn...",
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br...",
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri...",
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...,Writer Christopher Finch called Anything Muppe...
...,...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i...",
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ...",
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally...",
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...,


In [3]:
list(df)

['Character', 'Actor/Muppet performer', 'Description', 'Unnamed: 3']

## Removing an unwanted column
Below are several ways to get rid of the final column (there are more ways to do this!) If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on. Probably the easiest method is to use `pop` (method 3)

In [4]:

# method 1
df = df.drop(['Unnamed: 3'], axis = 1)
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [None]:
# method 2
df = df.loc[:,list(df)[0:3]] 
df

In [None]:
# method 3
df.pop('Unnamed: 3')
df

In [6]:
# take a column from a dataframe and assign it to a list variable
characters = df['Character'].tolist()
print(characters)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Buster', 'Captain Vegetable', 'Clementine', 'Colambo', 'Cookie Monster', 'Count von Count', 'Countess Dahling von Dahling', 'Curly Bear', 'Deena', 'Dexter', 'Dingers', 'Don Music', 'Donald/Ronald Grump', 'Dr. Feel', 'Dr. Nobel Price', 'Elizabeth', 'Elmo', 'Elijah', 'Ernie', 'Farley', 'Fatima', 'Ferlinghetti Donizetti', 'Flo Bear', 'Forgetful Jones', 'Frazzle', 'Fred', 'Gabrielle', 'Gladys the Cow', 'Granny Bird', 'Mrs. Grouch', 'Gonger', 'Granny Fanny Nestlerode', 'Grover', 'Grundgetta', 'Gulliver', 'Guy Smiley', 'Harvey Kneeslapper', 'Herbert Birdsfoot', 'Herry Monster', 'Honkers', 'Hoots', 'Horatio', 'Humphrey', 'Ingrid', 'Jamie Fox', 'Ji-Young', 'Julia', 'Kermit[101]', 'Kingston Li

In [7]:
# find the first four items in the list
print(characters[0:4])

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie']


In [10]:
# find the last three items in the list
print(characters[-4:-1])

['The Twiddlebugs', 'The Two-Headed Monster', 'Wes']


In [None]:
# find items in the middle of the list


In [16]:
# select the first 20 items from a list
print(len(characters))
print(characters[60:62])

119
['Horatio', 'Humphrey']


In [19]:
# remove the last item in a list
print(characters)
characters.pop(-1)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Buster', 'Captain Vegetable', 'Clementine', 'Colambo', 'Cookie Monster', 'Count von Count', 'Countess Dahling von Dahling', 'Curly Bear', 'Deena', 'Dexter', 'Dingers', 'Don Music', 'Donald/Ronald Grump', 'Dr. Feel', 'Dr. Nobel Price', 'Elizabeth', 'Elmo', 'Elijah', 'Ernie', 'Farley', 'Fatima', 'Ferlinghetti Donizetti', 'Flo Bear', 'Forgetful Jones', 'Frazzle', 'Fred', 'Gabrielle', 'Gladys the Cow', 'Granny Bird', 'Mrs. Grouch', 'Gonger', 'Granny Fanny Nestlerode', 'Grover', 'Grundgetta', 'Gulliver', 'Guy Smiley', 'Harvey Kneeslapper', 'Herbert Birdsfoot', 'Herry Monster', 'Honkers', 'Hoots', 'Horatio', 'Humphrey', 'Ingrid', 'Jamie Fox', 'Ji-Young', 'Julia', 'Kermit[101]', 'Kingston Li

In [None]:
# inpsect the list to make sure the last item was removed
print(characters)

In [25]:
# remove a specific item from the list
characters.remove('Horatio')
print(characters)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Buster', 'Captain Vegetable', 'Clementine', 'Colambo', 'Cookie Monster', 'Count von Count', 'Countess Dahling von Dahling', 'Curly Bear', 'Deena', 'Dexter', 'Dingers', 'Don Music', 'Donald/Ronald Grump', 'Dr. Feel', 'Dr. Nobel Price', 'Elizabeth', 'Elmo', 'Elijah', 'Ernie', 'Farley', 'Fatima', 'Ferlinghetti Donizetti', 'Flo Bear', 'Forgetful Jones', 'Frazzle', 'Fred', 'Gabrielle', 'Gladys the Cow', 'Granny Bird', 'Mrs. Grouch', 'Gonger', 'Granny Fanny Nestlerode', 'Grover', 'Grundgetta', 'Guy Smiley', 'Harvey Kneeslapper', 'Herbert Birdsfoot', 'Herry Monster', 'Honkers', 'Hoots', 'Humphrey', 'Ingrid', 'Jamie Fox', 'Ji-Young', 'Julia', 'Kermit[101]', 'Kingston Livingston III', 'Lefty',

In [26]:
# stick a Kermit on the end of the list
characters.append('Kermit')

In [27]:
# inspect the list to make sure Kermit was added
print(characters)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Buster', 'Captain Vegetable', 'Clementine', 'Colambo', 'Cookie Monster', 'Count von Count', 'Countess Dahling von Dahling', 'Curly Bear', 'Deena', 'Dexter', 'Dingers', 'Don Music', 'Donald/Ronald Grump', 'Dr. Feel', 'Dr. Nobel Price', 'Elizabeth', 'Elmo', 'Elijah', 'Ernie', 'Farley', 'Fatima', 'Ferlinghetti Donizetti', 'Flo Bear', 'Forgetful Jones', 'Frazzle', 'Fred', 'Gabrielle', 'Gladys the Cow', 'Granny Bird', 'Mrs. Grouch', 'Gonger', 'Granny Fanny Nestlerode', 'Grover', 'Grundgetta', 'Guy Smiley', 'Harvey Kneeslapper', 'Herbert Birdsfoot', 'Herry Monster', 'Honkers', 'Hoots', 'Humphrey', 'Ingrid', 'Jamie Fox', 'Ji-Young', 'Julia', 'Kermit[101]', 'Kingston Livingston III', 'Lefty',

In [29]:
# insert an item into a list at a particular position
characters.insert(3, 'Froggy')

In [30]:
# replace an item in a list
characters[3]= 'Kermit2'

In [37]:
# make a new list which adds "is a cute monster" to each item in the list

cute_character_list = []
for i in characters:
    cute_characters = i + ' is a cute monster' 
    cute_character_list.append(cute_characters) 

print(cute_character_list)


['Abby Cadabby is a cute monster', 'Alice Snuffleupagus is a cute monster', 'Alistair Cookie is a cute monster', 'Kermit2 is a cute monster', 'Froggy is a cute monster', 'The Amazing Mumford is a cute monster', 'Anything Muppets is a cute monster', 'AM Monsters is a cute monster', 'Aristotle is a cute monster', 'Arlene Frantic is a cute monster', 'Baby Bear is a cute monster', 'Barkley is a cute monster', 'Beautiful Day Monster[broken anchor] is a cute monster', 'Bennett Snerf is a cute monster', 'Benny is a cute monster', 'Bert is a cute monster', 'Betty Lou is a cute monster', 'Biff is a cute monster', 'Big Bird is a cute monster', 'Bip Bippadotta is a cute monster', 'Bruno is a cute monster', 'Buster is a cute monster', 'Captain Vegetable is a cute monster', 'Clementine is a cute monster', 'Colambo is a cute monster', 'Cookie Monster is a cute monster', 'Count von Count is a cute monster', 'Countess Dahling von Dahling is a cute monster', 'Curly Bear is a cute monster', 'Deena is a 

In [39]:
# reset list b to original first 20 items from list a
cute_character_list[0:20] = characters[0:20]
print(cute_character_list)

['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'Kermit2', 'Froggy', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Aristotle', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno is a cute monster', 'Buster is a cute monster', 'Captain Vegetable is a cute monster', 'Clementine is a cute monster', 'Colambo is a cute monster', 'Cookie Monster is a cute monster', 'Count von Count is a cute monster', 'Countess Dahling von Dahling is a cute monster', 'Curly Bear is a cute monster', 'Deena is a cute monster', 'Dexter is a cute monster', 'Dingers is a cute monster', 'Don Music is a cute monster', 'Donald/Ronald Grump is a cute monster', 'Dr. Feel is a cute monster', 'Dr. Nobel Price is a cute monster', 'Elizabeth is a cute monster', 'Elmo is a cute monster', 'Elijah is a cute monster', 'Ernie is a cute monster', 'Farley is a cute monster', 'Fatima i

In [41]:
# make a list of your favorite monsters, and then make a new list which only includes the monsters 
# from list b that are also in your favorites list
favs = ['Mama Bear', 'Kermit', 'Slimey', 'Cookie monster', 'Rainbow Monster', 'Elmo', 'Aristotle', 'Cookie Monsters baby cousin']
new_list = []
for i in characters: 
    if i in favs:
        new_list.append(i)

print(new_list)

['Aristotle', 'Elmo', 'Mama Bear', 'Slimey', 'Kermit']


In [42]:
# make a new list which includes the monsters from list b that are not in your favorites list
new_list2 = []
for i in characters: 
    if not i in favs:
        new_list2.append(i)

print(new_list2)



['Abby Cadabby', 'Alice Snuffleupagus', 'Alistair Cookie', 'Kermit2', 'Froggy', 'The Amazing Mumford', 'Anything Muppets', 'AM Monsters', 'Arlene Frantic', 'Baby Bear', 'Barkley', 'Beautiful Day Monster[broken anchor]', 'Bennett Snerf', 'Benny', 'Bert', 'Betty Lou', 'Biff', 'Big Bird', 'Bip Bippadotta', 'Bruno', 'Buster', 'Captain Vegetable', 'Clementine', 'Colambo', 'Cookie Monster', 'Count von Count', 'Countess Dahling von Dahling', 'Curly Bear', 'Deena', 'Dexter', 'Dingers', 'Don Music', 'Donald/Ronald Grump', 'Dr. Feel', 'Dr. Nobel Price', 'Elizabeth', 'Elijah', 'Ernie', 'Farley', 'Fatima', 'Ferlinghetti Donizetti', 'Flo Bear', 'Forgetful Jones', 'Frazzle', 'Fred', 'Gabrielle', 'Gladys the Cow', 'Granny Bird', 'Mrs. Grouch', 'Gonger', 'Granny Fanny Nestlerode', 'Grover', 'Grundgetta', 'Guy Smiley', 'Harvey Kneeslapper', 'Herbert Birdsfoot', 'Herry Monster', 'Honkers', 'Hoots', 'Humphrey', 'Ingrid', 'Jamie Fox', 'Ji-Young', 'Julia', 'Kermit[101]', 'Kingston Livingston III', 'Lefty',

## More fun with lists

In [44]:
# add 10 to each number in d

d = [1, 2, 3, 4, 5]
d = [x + 10 for x in d]




In [45]:
# divide each number in d by 2
d = [x / 2 for x in d]
print(d)


[5.5, 6.0, 6.5, 7.0, 7.5]


## Dataframe manipulation

In [None]:
# download the student sleep data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv")
df

In [None]:
# find the number of rows and columns in the dataframe


In [None]:
# make a new dataframe df1 which only includes the first 4 rows of the original dataframe


In [None]:
# make another new dataframe df2 which only includes rows 5 through the end of the original dataframe


In [None]:
# make a third dataframe df3 with df2 on top of df1 (hint: use pd.concat)


In [None]:
# overwrite df3 with df1 and df2 back in their original order


In [None]:
# make a new column called "average" which is the mean of the other columns for each row
df3['average'] = df3.mean(axis = 1)
df3

In [None]:
# remove the average column from the dataframe
df3.pop('average')
df3

In [None]:
# make a list of the means of each column in the dataframe
column_means = list(df3.mean())
column_means

In [None]:
# make a list of the means of each row in the dataframe
colnames = list(df3)
colnames

In [None]:
list(zip(colnames, column_means))

In [None]:
student_means = dict(zip(colnames, column_means))
student_means

In [None]:
print('Student 4\'s average:', round(student_means['Student 4'], 3))

In [None]:
# make a dataframe the mean hours of sleep for each student
df_means = pd.DataFrame(zip(colnames, column_means))
df_means.columns = ['Students', 'Sleep Hours']
df_means

In [None]:
# transpose the dataframe
df_transposed = df3.transpose()
df_transposed

In [None]:
colnames = list(df_transposed)
colnames

In [None]:
newcols = ['Day ' + str(x+1) for x in colnames]
newcols

In [None]:
df_transposed.columns = newcols
df_transposed

In [None]:
df_transposed.index.name = 'student'
df_transposed

In [None]:
df_transposed.reset_index(inplace = True)
df_transposed

In [None]:
df_long = pd.melt(df_transposed, id_vars = 'student')
df_long