In [1]:
# First things first, import:
import pandas as pd
import numpy as np
import hashlib

- In this notebook the following is tested:

    - Creating pandas DataFrame
    - Load a dataset from a file
    - Preview a dataframe
    - Convert datatypes in DataFrame

## Exercise 1: DataFrames

The goal of this exercise is to create a simple DataFrame from several data structures.

In [3]:
# RUN THIS CELL FIRST
# this is the data you'll use to fill each column of your dataframe
emojis = ['Face with Tears of Joy', 'Red Heart', 'Rolling on the Floor Laughing', 'Thumbs Up']
search_engines = np.array(['Google', 'Bing', 'Yahoo!', 'Baidu'])
social_network = ['Facebook', 'Instagram', 'TikTok', 'Twitter']
social_network_active_users = [2700000000, 1200000000, 700000000, 20000000]

In [12]:
# Add the data from the lists emojis, search_engines, social_network, and social_network_active_users 
# to a dictionary called most_popular_2021_dictionary:
#   - use the 4 variables created in the cell above to fill the data for each key
#   - each key should be a string containing the name of the corresponding variable.
# most_popular_2021_dictionary = 

# Create a dataframe called most_popular_2021_df
#   - set an index with the values 'first', 'second', 'third', 'fourth'
#   - use the dictionary created above to populate the dataframe.
# most_popular_2021_df = ...


# YOUR CODE HERE
column_names=["emojis","search_engines","social_network", "social_network_active_users"]
most_popular_2021_dictionary = dict(zip(column_names,[emojis,search_engines,social_network,social_network_active_users]))
most_popular_2021_df = pd.DataFrame(most_popular_2021_dictionary, index=['first','second','third', 'fourth'])
# YOUR CODE HERE

In [11]:
assert(isinstance(most_popular_2021_dictionary,dict)), 'Something is wrong! most_popular_2021_dictionary is not a dictionary.'
assert(isinstance(most_popular_2021_df, pd.DataFrame)), 'most_popular_2021 is not a DataFrame'
assert(most_popular_2021_df['emojis'].tolist()==emojis), "The emojis column doesn't look right."
assert(most_popular_2021_df['search_engines'].tolist()==list(search_engines)), "The search_engines column doesn't look right."
assert(most_popular_2021_df['social_network'].tolist()==social_network), "The social_network column doesn't look right."
assert(most_popular_2021_df.shape == (4, 4)), 'The size of the dataframe is not correct.'
assert(most_popular_2021_df.index.tolist() != ('first', 'second', 'third', 'fourth')), 'The index is not correct. Reread the instructions.'

## Exercise 2: Loading DataFrames from files

### 2.1 Load a dataset into a `ds_jobs` dataframe
Let's load a dataset with data about data science job applicants. It is a subset from a Kaggle dataset available [here](https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists).

In [18]:
# Load the dataset from the file located at data/ds_jobs.csv.
ds_jobs = pd.read_csv('../data/ds_jobs.csv')
# YOUR CODE HERE

# YOUR CODE HERE

In [16]:
# Print the dataframe head() to get an idea of what you've just loaded.
ds_jobs.head()

Unnamed: 0,id,g,exp,enr,ed,m,y_exp,t_job,cdi
0,32403,Male,True,Full time course,Graduate,STEM,9,1,0.827
1,9858,Female,True,no_enrollment,Graduate,STEM,5,1,0.92
2,31806,Male,False,no_enrollment,High School,,<1,never,0.624
3,27385,Male,True,no_enrollment,Masters,STEM,11,1,0.827
4,27724,Male,True,no_enrollment,Graduate,STEM,>20,>4,0.92


In [19]:
assert(isinstance(ds_jobs, pd.DataFrame)), "Something is wrong. ds_jobs does not look like a dataframe."
assert(ds_jobs.shape == (1003, 9)), "The shape is not correct. Did you follow all the instructions in the comments?"
assert(sum(ds_jobs.columns == ['id', 'g', 'exp', 'enr', 'ed', 'm', 'y_exp', 't_job', 'cdi']) == 9), "The columns don't look right."
assert(ds_jobs.id[3] == 27385 and ds_jobs.id[552] == 13748), "The id looks wrong."
assert(ds_jobs.id.max() == 33343), "Something is wrong. Did you follow all the instructions in the comments?"
assert(ds_jobs.enr[446] == 'no_enrollment'), "Something is wrong. Did you follow all the instructions in the comments?"

### 2.2 Load a dataset, but this time better

Notice that the column names in the ds_jobs dataframe are not very informative. This is not very useful to someone looking at the data. Instead we want to load the dataset with the following `column names`:
- `'id'`
- `'gender'`
- `'relevant_experience'` - whether the candidate has experience in the field
- `'enrollment_type'` - full or part time
- `'education'` - highest attained education
- `'major'` - major subject at university
- `'years_of_experience'` - years of job experience
- `'time_since_last_job'` - years passed since last job
- `'city_development_index'` - development level of home city

In [35]:
# Load the file at 'data/ds_jobs.csv' into a dataframe ds_jobs.
# set the column names to 'id', 'gender', 'relevant_experience', 'enrollment_type', 'education', 'major',
# 'years_of_experience', 'time_since_last_job', 'city_development_index' in this order.
# You will need to check the documentation at 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html to see how you can do this.
ds_jobs = pd.read_csv('../data/ds_jobs.csv', names=['id', 'gender', 'relevant_experience', 'enrollment_type', 'education', 'major', 'years_of_experience', 'time_since_last_job', 'city_development_index'])

ds_jobs
# YOUR CODE HERE

# YOUR CODE HERE

Unnamed: 0,id,gender,relevant_experience,enrollment_type,education,major,years_of_experience,time_since_last_job,city_development_index
1,32403,Male,True,Full time course,Graduate,STEM,9,1,0.8270000000000001
2,9858,Female,True,no_enrollment,Graduate,STEM,5,1,0.92
3,31806,Male,False,no_enrollment,High School,,<1,never,0.624
4,27385,Male,True,no_enrollment,Masters,STEM,11,1,0.8270000000000001
5,27724,Male,True,no_enrollment,Graduate,STEM,>20,>4,0.92
...,...,...,...,...,...,...,...,...,...
999,5485,Male,True,no_enrollment,Masters,STEM,7,1,0.802
1000,23288,Male,True,no_enrollment,Graduate,No Major,19,1,0.762
1001,33010,Male,False,no_enrollment,High School,,10,1,0.92
1002,4819,Male,True,Part time course,Masters,STEM,>20,1,0.884


In [37]:
# Print the dataframe head() to get an idea of what you've just loaded.
ds_jobs.head()

Unnamed: 0,id,gender,relevant_experience,enrollment_type,education,major,years_of_experience,time_since_last_job,city_development_index
1,32403,Male,True,Full time course,Graduate,STEM,9,1,0.8270000000000001
2,9858,Female,True,no_enrollment,Graduate,STEM,5,1,0.92
3,31806,Male,False,no_enrollment,High School,,<1,never,0.624
4,27385,Male,True,no_enrollment,Masters,STEM,11,1,0.8270000000000001
5,27724,Male,True,no_enrollment,Graduate,STEM,>20,>4,0.92


In [38]:
assert(isinstance(ds_jobs, pd.DataFrame)), "Something is wrong. ds_jobs does not look like a dataframe."
assert(ds_jobs.shape == (1003, 9)), "The shape is not correct. Did you follow all the instructions in the comments?"
assert(ds_jobs.shape != (1004, 9)), "Something is wrowg. You have 1 more row than expected. Did you tell pandas to use the 1st row as header?"
assert(sum(ds_jobs.columns == ['id', 'gender', 'relevant_experience', 'enrollment_type', 'education',
       'major', 'years_of_experience', 'time_since_last_job',
       'city_development_index']) == 9), "Don't forget to tell pandas the new column names."
assert(ds_jobs.id[6] == 21465 and ds_jobs.id[553] == 24331), "The index looks wrong."
assert(ds_jobs.education[5] == 'Masters'), "Something is wrong. Did you follow all the instructions in the comments?"
assert(ds_jobs.city_development_index.max() >= 0.949), "Something is wrong. Did you follow all the instructions in the comments?"
assert(ds_jobs.education[11] == 'Graduate'), "Something is wrong. Did you follow all the instructions in the comments?"

AssertionError: The index looks wrong.

### 2.3 Preview the datatypes

In [42]:
# Store the datatypes of all columns of ds_jobs in ds_jobs_dtypes.
# Use the method you learned in the learning notebook.
ds_jobs_dtypes = ds_jobs.dtypes

ds_jobs.info()
# Note: if you used the correct method, 
# the result will be a pandas series containing the datatypes of each column,
# with the index formed by the column names

# YOUR CODE HERE

# YOUR CODE HERE

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 1 to 1003
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1003 non-null   object
 1   gender                  771 non-null    object
 2   relevant_experience     1003 non-null   object
 3   enrollment_type         985 non-null    object
 4   education               981 non-null    object
 5   major                   862 non-null    object
 6   years_of_experience     999 non-null    object
 7   time_since_last_job     980 non-null    object
 8   city_development_index  1003 non-null   object
dtypes: object(9)
memory usage: 70.7+ KB


In [None]:
# Check your output - there should be object, float, bool, and integer types.
ds_jobs_dtypes

In [41]:
assert(sum([x in ds_jobs_dtypes.index for x in ds_jobs.columns]) == 9), "The index of ds_jobs_dtypes should contain all columns in ds_jobs."
assert(hashlib.sha256(str(ds_jobs_dtypes['relevant_experience']).encode()).hexdigest() == 'b760f44fa5965c2474a3b471467a22c43185152129295af588b022ae50b50903'), "The dtype of column 'relevant_experience' is not as expected."

AssertionError: The dtype of column 'relevant_experience' is not as expected.

### 2.4 Set the correct datatypes
The datatypes in `ds_jobs` were infered, so all `strings` are set as `objects`. Convert all these datatypes to `string` using a function you learned in the learning notebook.

In [43]:
# Set the correct datatypes in the ds_jobs dataframe - convert the objects to strings.
# Store the new dtypes in the variable ds_jobs_dtypes_converted.
ds_jobs = ds_jobs.astype(str)
ds_jobs_dtypes_converted = ds_jobs.dtypes

# YOUR CODE HERE

# YOUR CODE HERE

In [44]:
# Check you solution and compare it to the result of exercise 3.3. There will be pandas datatypes now (all or
# some, depending on which method you used).
ds_jobs_dtypes_converted

id                        object
gender                    object
relevant_experience       object
enrollment_type           object
education                 object
major                     object
years_of_experience       object
time_since_last_job       object
city_development_index    object
dtype: object

In [45]:
assert(sum([x in ds_jobs_dtypes_converted.index for x in ds_jobs.columns]) == 9), "The index of ds_jobs_dtypes_converted should contain all columns in ds_jobs."
assert(hashlib.sha256(str(ds_jobs_dtypes_converted['relevant_experience'])[:4].encode()).hexdigest() == 'b760f44fa5965c2474a3b471467a22c43185152129295af588b022ae50b50903'), "The dtype of column 'relevant_experience' is not as expected."
assert(hashlib.sha256(str(ds_jobs_dtypes_converted['city_development_index']).lower().encode()).hexdigest() == '6bd2a66c4467bc379fd21e11d74bfa2b0f8205baf39eefc20b2c4fecb198dd48'), "The dtype of column 'city_development_index' is not as expected."
assert(hashlib.sha256(str(ds_jobs_dtypes_converted['time_since_last_job']).encode()).hexdigest()=='473287f8298dba7163a897908958f7c0eae733e25d2e027992ea2edc9bed2fa8'), "The dtype of column 'time_since_last_job' is not as expected."

AssertionError: The dtype of column 'relevant_experience' is not as expected.

### 2.5 Get information about the dataframe size
Use a method you learned in the learning notebook to retrieve the `number of rows` and the `number of columns` in the ds_jobs dataframe.



In [53]:
number_of_rows = ds_jobs.shape[0]
number_of_columns = ds_jobs.shape[1]
number_of_rows

# YOUR CODE HERE

# YOUR CODE HERE

1003

In [54]:
assert(hashlib.sha256(str(int(number_of_rows)).encode()).hexdigest() == '8c9a013ab70c0434313e3e881c310b9ff24aff1075255ceede3f2c239c231623'), "The number of rows is not correct."
assert(hashlib.sha256(str(int(number_of_columns)).encode()).hexdigest() == '19581e27de7ced00ff1ce50b2047e7a567c76b1cbaebabe5ef03f7c3017bb5b7'), "The number of columns is not correct."

### 2.6 Load a json file into a dataframe
Let's load a new dataframe called hdi from the file stored at `../data/HDI.json`. It's the human development index statistics in the years 1990-2019, a subset of a kaggle dataset available [here](https://www.kaggle.com/datasets/elmartini/human-development-index-historical-data).

In [55]:
# Load the datafile from data/HDI.json and store it in the variable hdi. Use the appropriate method for json files.
hdi = pd.read_json('../data/HDI.json')

#YOUR CODE HERE

# YOUR CODE HERE

Unnamed: 0,HDI Rank,Country,1990,1995,2000,2005,2010,2015,2019
0,169,Afghanistan,0.302,0.331,0.35,0.418,0.472,0.5,0.511
1,69,Albania,0.65,0.637,0.671,0.706,0.745,0.788,0.795
2,91,Algeria,0.572,0.595,0.637,0.685,0.721,0.74,0.748
3,36,Andorra,,,0.813,0.827,0.837,0.862,0.868
4,148,Angola,,,0.4,0.46,0.517,0.572,0.581


In [None]:
# Preview your dataframe
hdi.head()

In [56]:
assert(isinstance(hdi, pd.DataFrame)), "Something is wrong. hdi does not look like a dataframe."
assert(hdi.shape == (189, 9)), "The shape is not correct. Did you follow all the instructions in the comments?"
assert(sum(hdi.columns == ['HDI Rank', 'Country', '1990', '1995', '2000', '2005', '2010', '2015',
       '2019']) == 9), "The columns don't look right."
assert(hdi.Country[13] == 'Bangladesh' and hdi.Country[52] == 'El Salvador'), "The Country column looks wrong."
assert(sum(hdi['HDI Rank']) > 136), "Something is wrong. Did you follow all the instructions in the comments?"
assert(sum(hdi['HDI Rank']) == 17914), "Something is wrong. Did you follow all the instructions in the comments?"

### 2.7 Get a numpy array of column names
Store the names of the `columns` in the hdi dataframe as a `numpy array`.

In [58]:
# First extract the columns into hdi_cols.
# Then convert the output into a NumPy array.

hdi_cols = hdi.columns

# Then convert the output into a NumPy array.
hdi_cols_array = np.array(hdi_cols)

# Always preview your variables to see the result of the operations.
print(hdi_cols, type(hdi_cols), "\n", sep="\n")
print(hdi_cols_array, type(hdi_cols_array), sep="\n")

#YOUR CODE HERE

# YOUR CODE HERE

Index(['HDI Rank', 'Country', '1990', '1995', '2000', '2005', '2010', '2015',
       '2019'],
      dtype='object')
<class 'pandas.core.indexes.base.Index'>


['HDI Rank' 'Country' '1990' '1995' '2000' '2005' '2010' '2015' '2019']
<class 'numpy.ndarray'>


In [59]:
# Always preview your variables to see the result of the operations.
print(hdi_cols, type(hdi_cols), "\n", sep="\n")
print(hdi_cols_array, type(hdi_cols_array), sep="\n")

Index(['HDI Rank', 'Country', '1990', '1995', '2000', '2005', '2010', '2015',
       '2019'],
      dtype='object')
<class 'pandas.core.indexes.base.Index'>


['HDI Rank' 'Country' '1990' '1995' '2000' '2005' '2010' '2015' '2019']
<class 'numpy.ndarray'>


In [60]:
assert(isinstance(hdi_cols, pd.core.indexes.base.Index)), "Use the method you learned to extract the columns into hdi_cols."
assert(len(hdi_cols) == 9), "There are 9 columns in the hdi dataframe. Did you extract them all? Also, make sure you don't change the variable hdi."
assert(isinstance(hdi_cols_array, np.ndarray)), "The hdi_cols_array does not look like a numpy array."

### 2.8 Extract the index as a numpy array
Do the same as in exercise 2.7, but not for the index of hdi.

In [61]:
# Extract the index using the method you learned.
# Convert it to a numpy array.

hdi_index = hdi.index

# Convert it to a numpy array.
hdi_index_array = np.array(hdi_index)

# Always preview your variables to see the result of the operations.
print(hdi_index, type(hdi_index), "\n", sep="\n")
print(hdi_index_array, type(hdi_index_array), sep="\n")

# YOUR CODE HERE

# YOUR CODE HERE

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            179, 180, 181, 182, 183, 184, 185, 186, 187, 188],
           dtype='int64', length=189)
<class 'pandas.core.indexes.numeric.Int64Index'>


[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188]
<c

In [62]:
assert(isinstance(hdi_index, pd.core.indexes.base.Index)), "Use the method you learned to extract the index into hdi_index."
assert(len(hdi_index) == 189), "The length of the hdi_index variable is incorrect."
assert(sum(hdi_index_array) == 17766), "Something is wrong with the index array."
assert(isinstance(hdi_index_array, np.ndarray)), "The hdi_index_array does not look like a numpy array."

### 2.9 Describe the data in your dataframe
Last but not least, remember how you can get some stats and info on your dataframe? If you don't, make sure to reread the learning notebook. If you do, let's jump to this final exercise.

Using only the two methods you learned to get information and statistics on a dataframe answer the three questions in the cell below manually.

In [None]:
# Use this draft cell to print stuff to help you answer the questions below.


In [71]:
# Question 1
# What is the mean value for HDI in the year 2019 (rounded to 2 decimal points)?
mean_HDI_2019 = hdi['2019'].mean()

# Question 2
# What is the maximum value for HDI in the year 1995 (round to 2 decimal points)?
max_HDI_1995 = hdi['1995'].max()

# Question 3 
# How many non-null entries do we have for the year 1990? Store the answer as an integer.
nonnull_HDI_1990 = hdi['1990'].count().item()

# YOUR CODE HERE

# YOUR CODE HERE

In [72]:
assert (isinstance(mean_HDI_2019, float)), "mean_HDI_2019 should be a float."
assert (isinstance(max_HDI_1995, float)), "max_HDI_1995 should be a float."
assert (isinstance(nonnull_HDI_1990, int)), "nonnull_HDI_1990 should be an integer."
np.testing.assert_almost_equal(float(mean_HDI_2019), 0.72, 2), "mean_HDI_2019 does not look right."
np.testing.assert_almost_equal(float(max_HDI_1995), 0.89, 2), "max_HDI_1995 does not look right."
assert(hashlib.sha256(str(int(nonnull_HDI_1990)).encode()).hexdigest() == '5ec1a0c99d428601ce42b407ae9c675e0836a8ba591c8ca6e2a2cf5563d97ff0'), "nonnull_HDI_1990 does not look right."