In [2]:
# to scrape data from wikipedia, we need install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets)

In [None]:
# scrape table data from websites
rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets")
df = rawdata[1]
df

## Removing an unwanted column
Below are several ways to get rid of the final column (there are more ways to do this!) If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on. Probably the easiest method is to use `pop` (method 3)

In [None]:

# method 1
df = df.drop(['Unnamed: 3'], axis = 1)
df

In [None]:

# method 2
df = df.loc[:,list(df)[0:3]] 
df

In [None]:
# method 3
df.pop('Unnamed: 3')
df

In [None]:
# take a column from a dataframe and assign it to a list variable
a = df['Character'] 

# omg its not a list 
type(a)

# now it is
a = list(a)

# could also do it more simple like this
a = list(df['Character'])

# lets check it out
a

In [None]:
# find the first four items in the list
a[0:4]

In [None]:
# find the last three items in the list
a[-3:]

In [None]:
# find items in the middle of the list
a[7:11]

In [None]:
# select the first 20 items from a list
b = a[0:20]
b

In [None]:
# remove the last item in a list
b.pop()
b

In [None]:
# inpsect the list to make sure the last item was removed


In [None]:
# remove a specific item from the list
b.pop(7)
b

In [None]:
# stick a Kermit on the end of the list
b.append('kermit')
b

In [None]:
# inspect the list to make sure Kermit was added
b

In [None]:
# insert an item into a list at a particular position
b.insert(5, 'kermit')

In [None]:
# replace an item in a list
b[5] = 'Fozzy'
b

In [None]:
# make a new list which adds "is a cute monster" to each item in the list

nadia_list = []
for i in b:
    new = i + " is a cute monster"
    nadia_list.append(new)

print(nadia_list)

In [None]:
# reset list b to original first 20 items from list a
b = a[0:20]
b

In [None]:
# make a list of your favorite monsters, and then make a new list which only includes the monsters 
# from list b that are also in your favorites list
my_fave_list = ['Baby Bear', 'Big Bird', 'Bruno', 'Anything Muppets']

# this for loop checks to see if the list b contains the items in my fave list and if they are it makes a new list
new_list = []
for i in b:
    if i in my_fave_list:
        new_list.append(i)

new_list
    

In [None]:
# alternative method that my smol brain can't comprehend
my_fave_list = ['Baby Bear', 'Big Bird', 'Bruno', 'Anything Muppets']
c =[x for x in b if x in my_fave_list]
c

In [None]:
# make a new list which includes the monsters from list b that are not in your favorites list
new_list_not_fave = []
for i in b:
    if i not in my_fave_list:
        new_list_not_fave.append(i)

new_list_not_fave

## More fun with lists

In [None]:
# add 10 to each number in d
d = [12, 2, 56, 89, 110]

e = [x + 10 for x in d]

e

In [None]:
# divide each number in d by 2
f = [x/2 for x in d]

f

## Dataframe manipulation

In [3]:
# download the student sleep data
df = pd.read_csv("https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv")
df

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [4]:
# find the number of rows and columns in the dataframe
df.shape

(7, 6)

In [5]:
# make a new dataframe df1 which only includes the first 4 rows of the original dataframe
df1 = df.iloc[0:4]
df1

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [6]:
# make another new dataframe df2 which only includes rows 5 through the end of the original dataframe
df2 = df.iloc[-5:]

In [7]:
# make a third dataframe df3 with df2 on top of df1 (hint: use pd.concat)
df3 = pd.concat([df2, df1])
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [8]:
# overwrite df3 with df1 and df2 back in their original order
df3 = df
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [9]:
# make a new column called "average" which is the mean of the other columns for each row
df3['average'] = df3.mean(axis = 1)
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6,average
0,10,8,4,12,10,6,8.333333
1,7,8,3,14,5,6,7.166667
2,7,7,5,11,8,8,7.666667
3,8,9,6,10,9,5,7.833333
4,2,6,6,12,5,6,6.166667
5,5,7,5,14,6,7,7.333333
6,6,7,6,12,9,7,7.833333


In [10]:
# remove the average column from the dataframe
df3.pop('average')
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [11]:
# make a list of the means of each column in the dataframe
column_means = list(df3.mean())
column_means

[6.428571428571429,
 7.428571428571429,
 5.0,
 12.142857142857142,
 7.428571428571429,
 6.428571428571429]

In [12]:
# make a list of the means of each row in the dataframe
colnames = list(df3)
colnames

['Student 1', 'Student 2', 'Student 3', 'Student 4', 'Student 5', 'Student 6']

In [13]:
list(zip(colnames, column_means))

[('Student 1', 6.428571428571429),
 ('Student 2', 7.428571428571429),
 ('Student 3', 5.0),
 ('Student 4', 12.142857142857142),
 ('Student 5', 7.428571428571429),
 ('Student 6', 6.428571428571429)]

In [14]:
student_means = dict(zip(colnames, column_means))
student_means

{'Student 1': 6.428571428571429,
 'Student 2': 7.428571428571429,
 'Student 3': 5.0,
 'Student 4': 12.142857142857142,
 'Student 5': 7.428571428571429,
 'Student 6': 6.428571428571429}

In [None]:
print('Student 4\'s average:', round(student_means['Student 4'], 3))

In [None]:
# make a dataframe the mean hours of sleep for each student
df_means = pd.DataFrame(zip(colnames, column_means))
df_means.columns = ['Students', 'Sleep Hours']
df_means

In [None]:
# transpose the dataframe
df_transposed = df3.transpose()
df_transposed

In [None]:
colnames = list(df_transposed)
colnames

In [None]:
newcols = ['Day ' + str(x+1) for x in colnames]
newcols

In [None]:
df_transposed.columns = newcols
df_transposed

In [None]:
df_transposed.index.name = 'student'
df_transposed

In [None]:
df_transposed.reset_index(inplace = True)
df_transposed

In [None]:
df_long = pd.melt(df_transposed, id_vars = 'student')
df_long