# DATA CLEANING

### String to Int | Delete $ (Strip)

In [None]:
# Remove $ from Revenue column
sales['Revenue'] = sales['Revenue'].str.strip('$')
sales['Revenue'] = sales['Revenue'].astype('int')

In [None]:
# Verify that Revenue is now an integer
assert sales['Revenue'].dtype == 'int'

## The assert statement

In [19]:
# This will pass
assert 1+1 == 2

## Numeric or categorical?

In [None]:
# Convert to categorical
df["marriage_status"] = df["marriage_status"].astype('category')
df.describe()

## Can future sign-ups exist?

In [None]:
# Import date time
import datetime as dt
today_date = dt.date. today()
user_signups[user_signups|'subscription_date'] > dt. date. today ()1

### DROP VALUES (TWO WAYS)

In [None]:
# Drop values using filtering
movies = movies[movies['avg_rating'] ≤ 5]
# Drop values using .drop()
movies.drop (movies[movies['avg_rating'] > 5].index, inplace =

## Check deleting values

In [None]:
# Assert results
assert movies['avg_rating'].max() <=

## Third Way - Replace Values

In [None]:
# Convert avg_rating > 5 to 5
movies.loc[movies['avg_rating'] ≥ 5, 'avg_rating'] = 5

# Cleaning Date

## Convert to Date

In [None]:
# Convert to date
user_signupsI'subscription_date'] = pd.to_datetime(user_signups['subscription_date ']).dt.date

In [None]:
today = dt.date.today()

# Drop the data
# Drop values using filtering
user_signups = user_signups[user_signups['subscription_date'] ≤ today_date]

# Drop values using .drop()
user_signups.drop(user_signups[user_signups['subscription_date'] > today_date].index, inplace = True)

In [None]:
# Drop values using filtering
user_signups.loc[user_signups['subscription_date'] > today_date, 'subscription_date'] = today_date

## Duplicate

In [None]:
# Get duplicates across all columns
duplicates = height_weight.duplicated()
print(duplicates)

True, False

In [None]:
height_weight['duplicates']

### Check duplicated (3 cloumns)

In [None]:
# Column names to check for duplication
column_names = ['first_name','last_name', 'address']
duplicates = height_weight.duplicated(subset = column_names, keep = False)

In [None]:
# Output duplicate values
height_weight[duplicates].sort_values(by = 'first_name')

### Drop complete duplicates

In [None]:
# Drop duplicates
height_weight.drop_duplicates(inplace = True)

### A statistical measure to combine each set of duplicated values

#### The groupby and agg() methods

In [None]:
# Group by column names and produce statistical summaries
column_names = ['first_name'|'last_name', 'address']
summaries = {'height': 'max', 'weight': 'mean'}
height_weight = height_weight.groupby(by = column_names).agg(summaries).reset_index()

In [None]:
# Make sure aggregation is done
duplicates = height_weight.duplicated(subset = column_names, keep = False)
height_weight[duplicates].sort_values(by = 'first_name')

# Membership constraints

### Finding inconsistent categories (not in category)

In [None]:
inconsistent_categories = set(study_data['blood_type').difference(categories['blood_type'])
print(inconsistent_categories)

In [None]:
# Get and print rows with inconsistent categories
inconsistent_rows = study_data['blood_type'].isin(inconsistent_categories)
study_data[inconsistent_rows]

### Dropping inconsistent categories

In [None]:
# Drop inconsistent categories and get consistent data only
consistent_data = study_data[~inconsistent_rows]

### Value consistency

In [None]:
# Capitalize
marriage_status[marriage_status'] = marriage_status[ 'marriage_status'].str.upper ()
marriage_status['marriage_status'].value_counts()
# UNMARRIED MARRIED

# Lowercase
marriage_status['marriage_status'] = marriage_status[ 'marriage_status '].str.lower()
marriage_status['marriage_status'].value_counts)

### Trailing spaces: married , " married', 'unmarried ', " unmarried'.

In [None]:
# Get marriage status column
marriage_status = demographics['marriage_status']
marriage_status.value_counts)

In [None]:
# Strip all spaces
demographics = demographics['marriage_status'].str.strip()
demographics| 'marriage_status'].value_counts()

### CREATE CATEGORY COLUMN

### Create categories out of data: income_group column from income column.

In [None]:
# Using cut) - create category ranges and names
ranges = [0,200000,500000,пр.inf]
group_names = ['0-200K', '200K-500K', '500K+']
# Create income group column
demographics['income_group'1 = pd.cut(demographics['household_income'], bins=ranges,
labels=group_names)
demographicsIl'income_group', 'household_income'11

### Map categories to fewer ones: reducing categories in categorical column. operating_system column is: 'Microsoft', 'Macos', 'IOS', 'Android', 'Linux' operating_system column should become: 'DesktopS', 'Mobile0S'

In [None]:
# Create mapping dictionary and replace
mapping = {'Microsoft': 'Desktop0S', 'MacOS': 'Desktop0S', 'Linux': 'Desktop0S',
'IOS': 'MobileOS', 'Android':'Mobile0S'}
devices['operating_system'] = devices['operating_system'].replace (mapping)
devices['operating_system'].unique()
array(I'Desktop0S', 'Mobile0S'], dtype=object)

In [None]:
# Print unique values of both columns
print(airlines['dest_region'].unique())
print(airlines['dest_size'].unique())

# Lower dest_region column and then replace "eur" with "europe"
airlines['dest_region'] = airlines['dest_region'].str.lower() 
airlines['dest_region'] = airlines['dest_region'].replace({'eur':'europe'})

# Remove white spaces from `dest_size`
airlines['dest_size'] = airlines['dest_size'].str.strip()

# Verify changes have been effected
print(airlines['dest_size'].unique())
print(airlines['dest_region'].unique())

### example

In [None]:
# Create ranges for categories
label_ranges = [0, 60, 180, np.inf]
label_names = ['short', 'medium', 'long']

# Create wait_type column
airlines['wait_type'] = pd.cut(airlines['wait_min'], bins = label_ranges, 
                               labels = label_names)

# Create mappings and replace
mappings = {'Monday':'weekday', 'Tuesday':'weekday', 'Wednesday': 'weekday', 
            'Thursday': 'weekday', 'Friday': 'weekday', 
            'Saturday': 'weekend', 'Sunday': 'weekend'}

airlines['day_week'] = airlines['day'].replace(mappings)
print(airlines)

0       115.0
1       135.0
2        70.0
3       190.0
4       559.0
        ...  
2804    280.0
2805    165.0
2806     92.0
2807     95.0
2808    220.0
Name: wait_min, Length: 2477, dtype: float64

        id        day        airline        destination    dest_region  ...     cleanliness         safety        satisfaction  wait_type day_week
0     1351    Tuesday    UNITED INTL             KANSAI           Asia  ...           Clean        Neutral      Very satisfied     medium  weekday
1      373     Friday         ALASKA  SAN JOSE DEL CABO  Canada/Mexico  ...           Clean      Very safe      Very satisfied     medium  weekday
2     2820   Thursday          DELTA        LOS ANGELES        West US  ...         Average  Somewhat safe             Neutral     medium  weekday
3     1157    Tuesday      SOUTHWEST        LOS ANGELES        West US  ...           Clean      Very safe  Somewhat satsified       long  weekday
4     2992  Wednesday       AMERICAN              MIAMI        East US  ...  Somewhat clean      Very safe  Somewhat satsified       long  weekday
...    ...        ...            ...                ...            ...  ...             ...            ...                 ...        ...      ...
2804  1475    Tuesday         ALASKA       NEW YORK-JFK        East US  ...  Somewhat clean        Neutral  Somewhat satsified       long  weekday
2805  2222   Thursday      SOUTHWEST            PHOENIX        West US  ...           Clean      Very safe      Very satisfied     medium  weekday
2806  2684     Friday         UNITED            ORLANDO        East US  ...           Clean      Very safe      Very satisfied     medium  weekday
2807  2549    Tuesday        JETBLUE         LONG BEACH        West US  ...           Clean  Somewhat safe      Very satisfied     medium  weekday
2808  2162   Saturday  CHINA EASTERN            QINGDAO           Asia  ...           Clean      Very safe  Somewhat satsified       long  weekend

[2477 rows x 14 columns]

### Fixing the phone number column

In [None]:
# Replace "+" with "00"
phonesI"Phone number"] = phones[ "Phone number"].str.replace("+", "00")
phonesI"Phone number"] = phones[ "Phone number"].str.replace("-", "")

In [None]:
# Replace phone numbers with lower than 10 digits to NaN
digits = phones['Phone number'].str.len()
phones.loc[digits < 10, "Phone number"] = np.nan

In [None]:
# Find length of each row in Phone number column
sanity_check = phone['Phone number'].str.len()
# Assert minmum phone number length is 10
assert sanity_check.min) >= 10
# Assert all numbers do not have
"+" or "_"
assert phone['Phone number'].str.contains("+|-").any() == False

IF phones

Olga Robinson +(01706) -25891
1
Justina Kim
+0500-571437
2
Tamekah Henson
+0800-1111
3
Miranda Solis
+07058-879063
4
Caldwell Gilliam +(016977)-8424

In [None]:
# Replace letters with nothing
phones['Phone number'] = phones['Phone number'].str.replace(r'ID+', '')
phones.head()

### Example Dr, Mr, etc

In [None]:
# Replace "Dr." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace("Dr.", "")

# Replace "Mr." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace("Mr.", "")

# Replace "Miss" with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace("Miss", "")

# Replace "Ms." with empty string ""
airlines['full_name'] = airlines['full_name'].str.replace("Ms.", "")

# Assert that full_name has no honorifics
assert airlines['full_name'].str.contains('Ms.|Mr.|Miss|Dr.').any() == False

# Uniformity

## Temperature

In [None]:
# Import matplotlib
import matplotlib.pyplot as plt
# Create scatter plot
plt.scatter(× = 'Date', y = 'Temperature', data = temperatures)
# Create title, xlabel and ylabel
plt.title('Temperature in Celsius March 2019 - NYC')
plt.xlabel('Dates')
plt.ylabel('Temperature in Celsius')
# Show plot
plt.show()

In [None]:
temp_fah = temperatures.Loc[temperatures['Temperature'] > 40, 'Temperature']
temp_cels = (temp_fah - 32) * (5/9)
temperatures.loc[temperatures['Temperature'] > 40, 'Temperature'] = temp_cels

In [None]:
# Assert conversion is correct
assert temperatures['Temperature'].max() < 40

## Birth Dates

In [None]:
# Will work!
birthdays['Birthday'] = pd.to_datetime(birthdays['Birthday'],
# Attempt to infer format of each date
infer_datetime_format=True,
# Return NA for rows where conversion failed
errors = 'coerce')

In [None]:
birthdays['Birthday'] = birthdays['Birthday'].dt.strftime ("%d-%m-%Y"]
birthdays.head()

## Money

In [None]:
# Find values of acct_cur that are equal to 'euro'
acct_eu = banking['acct_cur'] == 'euro'

# Convert acct_amount where it is in euro to dollars
banking.loc[acct_eu, 'acct_amount'] = banking.loc[acct_eu, 'acct_amount'] * 1.1 

# Unify acct_cur column by changing 'euro' values to 'dollar'
banking.loc[acct_eu, 'acct_cur'] = 'dollar'

# Assert that only dollar currency remains
assert banking['acct_cur'].unique() == 'dollar'

# Cross field validation

### Check age and date of birth from 2 datasets 

In [None]:
# Convert to datetime and get today's date
users['Birthday'] = pd.to_datetime(users['Birthday'])
today = dt.date.today()
# For each row in the Birthday column, calculate year difference
age_manual = today.year - users['Birthday'].dt.year
# Find instances where ages match
age_equ = age_manual == users['Age']
# Find and filter out rows with inconsistent age
inconsistent_age = users[~age_equ]
consistent_age = users[age_equ]

## Check total sum from 2 datasets

In [None]:
# Store fund columns to sum against
fund_columns = ['fund_A', 'fund_B', 'fund_C', 'fund_D']

# Find rows where fund_columns row sum == inv_amount
inv_equ = banking[fund_columns].sum(axis = 1) == banking['inv_amount']

# Store consistent and inconsistent data
consistent_inv = banking[inv_equ]
inconsistent_inv = banking[~inv_equ]

# Store consistent and inconsistent data
print("Number of inconsistent investments: ", inconsistent_inv.shape[0])

banking
Out[1]:

     cust_id birth_date  age  acct_amount  inv_amount  ...   fund_B   fund_C   fund_D  account_opened last_transaction
0   870A9281 1962-06-09   62     63523.31       51295  ...   4138.0   1420.0  15632.0        02-09-18         22-02-19
1   166B05B0 1962-12-16   62     38175.46       15050  ...    938.0   6696.0   2421.0        28-02-19         31-10-18
2   BFC13E88 1990-09-12   34     59863.77       24567  ...   4590.0   8469.0   1185.0        25-04-18         02-04-18
3   F2158F66 1985-11-03   39     84132.10       23712  ...    492.0   6482.0  12830.0        07-11-17         08-11-18
4   7A73F334 1990-05-17   40    120512.00       93230  ...  51281.0  13434.0  18383.0        14-05-18         19-07-18
..       ...        ...  ...          ...         ...  ...      ...      ...      ...             ...              ...
95  CA507BA1 1974-08-10   50     12209.84        7515  ...    931.0   1451.0   4943.0        26-05-18         11-09-19
96  B99CD662 1989-12-12   35     92838.44       49089  ...   7892.0  31486.0   7258.0        04-05-17         12-03-19
97  13770971 1984-11-29   40     92750.87       27962  ...   7547.0   8486.0   8577.0        16-08-17         24-04-19
98  93E78DA3 1969-12-14   55     41942.23       29662  ...  11174.0  11650.0   5080.0        09-10-17         15-04-18
99  AC91D689 1993-05-18   31     99490.61       32149  ...  17918.0   6714.0   5333.0        01-08-17         04-08-19

[100 rows x 11 columns

Number of inconsistent investments:  8

## Check date, age, and change if not

In [None]:
# Store today's date and find ages
today = dt.date.today()
ages_manual = today.year - banking['birth_date'].dt.year

# Find rows where age column == ages_manual
age_equ = banking['age'] == ages_manual

# Store consistent and inconsistent data
consistent_ages = banking[age_equ]
inconsistent_ages = banking[~age_equ]

# Store consistent and inconsistent data
print("Number of inconsistent ages: ", inconsistent_ages.shape[0])

# Missing Values

In [None]:
# false true of the missing values
df.isna()

In [None]:
# sum of the missing values
df.isna().sum()

## Useful package for visualizing and understanding missing data

In [None]:
import missingno as msno import matplotlib.pyplot as plt
# Visualize missingness
msno.matrix(airquality)
plt. show()

In [None]:
Steps:

In [None]:
# Isolate missing and complete values aside
missing = airqualitylairquality['C02'].isna()]
complete = airquality[~airquality['C02'].isna()]

In [None]:
complete.describe()
missing.describe()

In [None]:
# Sort banking by age and visualize
banking_sorted = banking.sort_values('age')
msno.matrix(banking_sorted)
plt.show()

### Dropping missing values

In [None]:
# Drop missing values
airquality_dropped = airquality.dropna(subset = ['CO2'])

### Replacing with statistical measures

In [None]:
co2_mean = airquality['C02'].mean()
airquality_imputed = airquality.fillna({'C02': co2_mean})
airquality_imputed.head()

# Comparing strings
## Simple string comparison

In [None]:
# Lets us compare between two strings (0-100) 100 - match (means 'the same')
from thefuzz import fuzz
# Compare reeding vs reading
fuzz.WRatio('Reeding', 'Reading')

In [None]:
What if we have a lot of categories with the similar words ('Cal', 'Cali', 'California', etc)

In [None]:
# For each correct category
for state in categories['state']:
# Find potential matches in states with typoes
matches = process.extract(state, survey['state'], limit = survey.shape[0])
# For each potential match match
for potential_match in matches:
# If high similarity score
if potential_match[1] >= 80:
# Replace typo with correct category
survey.loc[survey['state'] == potential_match[0], 'state'] = state

### example

In [None]:
# Import process from thefuzz
from thefuzz import process

# Store the unique values of cuisine_type in unique_types
unique_types = restaurants['cuisine_type'].unique()

# Calculate similarity of 'asian' to all values of unique_types
print(process.extract('asian', unique_types, limit = len(unique_types)))

# Calculate similarity of 'american' to all values of unique_types
print(process.extract('american', unique_types, limit = len(unique_types)))

# Calculate similarity of 'italian' to all values of unique_types
print(process.extract('italian', unique_types, limit = len(unique_types)))

In [None]:
# Create a list of matches, comparing 'italian' with the cuisine_type column
matches = process.extract('italian', restaurants['cuisine_type'], limit=len(restaurants.cuisine_type))

# Inspect the first 5 matches
print(matches[0:5])

In [None]:
# Iterate through the list of matches to italian
for match in matches:
  # Check whether the similarity score is greater than or equal to 80
  if match[1] >= 80:
    # Select all rows where the cuisine_type is spelled this way, and set them to the correct cuisine
    restaurants.loc[restaurants['cuisine_type'] == match[0]] = 'italian'

In [None]:
# Iterate through categories
for cuisine in categories:  
  # Create a list of matches, comparing cuisine with the cuisine_type column
  matches = process.extract(cuisine, restaurants['cuisine_type'], limit=len(restaurants.cuisine_type))

  # Iterate through the list of matches
  for match in matches:
     # Check whether the similarity score is greater than or equal to 80
    if match[1] >= 80:
      # If it is, select all rows where the cuisine_type is spelled this way, and set them to the correct cuisine
      restaurants.loc[restaurants['cuisine_type'] == match[0]] = cuisine
      
# Inspect the final result
print(restaurants['cuisine_type'].unique())

# Generating pairs

In [None]:
# Import recordlinkage
import recordlinkage
# Create indexing object
indexer = recordlinkage.Index()
# Generate pairs blocked on state
indexer.block('state')
pairs = indexer.index(census_A, census_B)

In [None]:
# Create a Compare object
compare_cl = recordlinkage.Compare()
# Find exact matches for pairs of date_of_birth and state
compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
compare_cl.exact('state', 'state' , label=' state')
# Find similar matches for pairs of surname and address_1 using string similarity
compare_cl.string('surname', 'surname', threshold=0.85, label='surname')
compare_cl.string('address_1', 'address_1', threshold=0.85, label='address_1')
# Find matches
potential_matches = compare_cl.compute(pairs, census_A, census_B)

In [None]:
# Isolate matches with matching values for 3 or more columns
matches = potential_matches[potential_matches.sum(axis = 1) >= 3]
# Get index for matching census_B rows only
duplicate_rows = matches.index.get_level_values(1)
# Finding new rows in census_B
census_B_new = census_B[~census_B.index.isin(duplicate_rows)]
# Link the DataFrames!
full_census = census_A.append(census_B_new)

#
#

# Iterating over iterables: next

In [70]:
word = 'Da'
it = iter (word)
next(it)

'D'

In [72]:
print(*it)

a


## example

In [75]:
# Create a list of strings: flash
flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen']

# Print each list item in flash using a for loop
for person in flash:
    print(person)

# Create an iterator for flash: superhero
superhero = iter(flash)

# Print each item from the iterator
print(next(superhero))
print(next(superhero))
print(next(superhero))
print(next(superhero))

jay garrick
barry allen
wally west
bart allen
jay garrick
barry allen
wally west
bart allen


In [77]:
# Create an iterator for range(3): small_value
small_value = iter(range(3))

# Print the values in small_value
print(next(small_value))
print(next(small_value))
print(next(small_value))

# Loop over range(3) and print the values
for num in range(3):
    print(num)

# Create an iterator for range(10 ** 100): googol
googol = iter(range(10 ** 100))

# Print the first 5 values from googol
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))

0
1
2
0
1
2
0
1
2
3
4


## enumerate and unpack

In [86]:
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
for index, value in enumerate(avengers):
    print(index, value)

0 hawkeye
1 iron man
2 thor
3 quicksilver


In [88]:
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
for index, value in enumerate(avengers, start = 10):
    print(index, value)

10 hawkeye
11 iron man
12 thor
13 quicksilver


In [1]:
# Create a list of strings: mutants
mutants = ['charles xavier', 
            'bobby drake', 
            'kurt wagner', 
            'max eisenhardt', 
            'kitty pryde']

# Create a list of tuples: mutant_list
mutant_list = list(enumerate(mutants))

# Print the list of tuples
print(mutant_list)

# Unpack and print the tuple pairs
for index1, value1 in enumerate(mutants):
    print(index1, value1)

# Change the start index
for index2, value2 in enumerate(mutants, start=1):
    print(index2, value2)

[(0, 'charles xavier'), (1, 'bobby drake'), (2, 'kurt wagner'), (3, 'max eisenhardt'), (4, 'kitty pryde')]
0 charles xavier
1 bobby drake
2 kurt wagner
3 max eisenhardt
4 kitty pryde
1 charles xavier
2 bobby drake
3 kurt wagner
4 max eisenhardt
5 kitty pryde


In [None]:
# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))

# Print the list of tuples
print(mutant_data)

# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)

# Print the zip object
print(mutant_zip)

# Unpack the zip object and print the tuple values
for value1, value2, value3 in mutant_zip:
    print(value1, value2, value3)

# Using iterators to load large files into memory (in chunks)

In [None]:
#Iterating over data 
import pandas as pd
result = ［］
for chunk in pd.read_csv('data.csv', chunksize=1000):
result.append(sum(chunk['x']))
total = sum(result)
print(total)

In [None]:
import pandas as pd
total = 0
for chunk in pd.read_csv('data.csv', chunksize=1000):
total += sum(chunk['x'])
print(total)

In [None]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Iterate over the file chunk by chunk
for chunk in pd.read_csv('tweets.csv', chunksize=10):

    # Iterate over the column in DataFrame
    for entry in chunk['lang']:
        if entry in counts_dict.keys():
            counts_dict[entry] += 1
        else:
            counts_dict[entry] = 1

# Print the populated dictionary
print(counts_dict)


In [None]:
# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize=c_size):

        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            if entry in counts_dict.keys():
                counts_dict[entry] += 1
            else:
                counts_dict[entry] = 1

    # Return counts_dict
    return counts_dict

# Call count_entries(): result_counts
result_counts = count_entries('tweets.csv', 10, 'lang')

# Print result_counts
print(result_counts)

# A list comprehension

In [1]:
nums = [12, 8, 21, 3, 16]
new_nums = [num + 1 for num in nums]
print (new_nums)

[13, 9, 22, 4, 17]


In [3]:
result = [num for num in range(11)]
print (result)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [5]:
pairs_2 = [(num1, num2) for num1 in range(0, 2) for num2 in range(6, 8)]
print (pairs_2)

[(0, 6), (0, 7), (1, 6), (1, 7)]


In [7]:
# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
    print(row)


[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


In [9]:
#Conditionals on the iterable
[num ** 2 for num in range(10) if num % 2 == 0]

[0, 4, 16, 36, 64]

In [13]:
# Conditionals on the output expression
[num ** 2 if num % 2 == 0 else 0 for num in range(10)]

[0, 0, 4, 0, 16, 0, 36, 0, 64, 0]

In [15]:
pos_neg = {num: -num for num in range(9)}
print(pos_neg)

{0: 0, 1: -1, 2: -2, 3: -3, 4: -4, 5: -5, 6: -6, 7: -7, 8: -8}


In [17]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)

['samwise', 'aragorn', 'legolas', 'boromir']


In [None]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >=7 else '' for member in fellowship]

# Print the new list
print(new_fellowship)


In [19]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create dict comprehension: new_fellowship
new_fellowship = { member:len(member) for member in fellowship }

# Print the new dictionary
print(new_fellowship)

{'frodo': 5, 'samwise': 7, 'merry': 5, 'aragorn': 7, 'legolas': 7, 'boromir': 7, 'gimli': 5}


In [23]:
# Lazy valuations
result = (num for num in range(6))

0


In [27]:
 print (next(result))

2


In [31]:
def num_sequence(n) :
# Generate values from 0 to n.
i = 0
while i < n:
yield i 
i += 1

IndentationError: expected an indented block after function definition on line 1 (4293202636.py, line 3)

In [33]:
# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the
    length of the strings in input_list."""

    # Yield the length of a string
    for person in input_list:
        yield len(person)

# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)

6
5
5
6
7


## Examples

In [None]:
# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time)

['23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19']

In [38]:
# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']

# Print the extracted times
print(tweet_clock_time)

NameError: name 'df' is not defined

  ['23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19']

In [None]:
# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""

    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)

    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)

    # Return the dictionary
    return rs_dict

# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names,row_vals)

# Print rs_fxn
print(rs_fxn)

In [None]:
# Import the pandas package
import pandas as pd

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)

# Print the head of the DataFrame
df.head()


In [None]:
# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for j in range(1000):

        # Split the current line into a list: line
        line = file.readline().split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # If the column value is in the dict, increment its value
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1

        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)

In [None]:
# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily."""

    # Loop indefinitely until the end of the file
    while True:

        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file
        if not data:
            break

        # Yield the line of data
        yield data

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))

In [None]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Iterate over the generator from read_large_file()
    for line in read_large_file(file):

        row = line.split(',')
        first_col = row[0]

        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

# Print            
print(counts_dict)

In [None]:
# Import the pandas package
import pandas as pd

# Initialize reader object: df_reader
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)

# Print two chunks
print(next(df_reader))
print(next(df_reader))


                                 CountryName CountryCode                  IndicatorName      IndicatorCode  Year   Value
0                                 Arab World         ARB  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  31.285
1                     Caribbean small states         CSS  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  31.597
2             Central Europe and the Baltics         CEB  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  44.508
3    East Asia & Pacific (all income levels)         EAS  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  22.471
4      East Asia & Pacific (developing only)         EAP  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  16.918
5                                  Euro area         EMU  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  62.097
6  Europe & Central Asia (all income levels)         ECS  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  55.379
7    Europe & Central Asia (developing only)         ECA  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  38.066
8                             European Union         EUU  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  61.213
9   Fragile and conflict affected situations         FCS  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  17.892
                                      CountryName CountryCode                  IndicatorName      IndicatorCode  Year   Value
10         Heavily indebted poor countries (HIPC)         HPC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  12.236
11                                    High income         HIC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  62.680
12                           High income: nonOECD         NOC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  56.108
13                              High income: OECD         OEC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  64.285
14  Latin America & Caribbean (all income levels)         LCN  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  49.285
15    Latin America & Caribbean (developing only)         LAC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  44.863
16   Least developed countries: UN classification         LDC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960   9.616
17                            Low & middle income         LMY  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  21.273
18                                     Low income         LIC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  11.498
19                            Lower middle income         LMC  Urban population (% of total)  SP.URB.TOTL.IN.ZS  1960  19.811

In [None]:
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Get the first DataFrame chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)

# Check out the head of the DataFrame
print(df_urb_pop.head())

# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']

# Zip DataFrame columns of interest: pops
pops = zip(df_pop_ceb['Total Population'], 
           df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Print pops_list
print(pops_list)

                               CountryName CountryCode  Year  Total Population  Urban population (% of total)
0                               Arab World         ARB  1960         9.250e+07                         31.285
1                   Caribbean small states         CSS  1960         4.191e+06                         31.597
2           Central Europe and the Baltics         CEB  1960         9.140e+07                         44.508
3  East Asia & Pacific (all income levels)         EAS  1960         1.042e+09                         22.471
4    East Asia & Pacific (developing only)         EAP  1960         8.965e+08                         16.918
[(91401583.0, 44.5079211390026), (92237118.0, 45.206665319194), (93014890.0, 45.866564696018), (93845749.0, 46.5340927663649), (94722599.0, 47.2087429803526)]

In [None]:
# Use list comprehension to create new DataFrame column 'Total Urban Population'
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

In the previous exercises, you've only processed the data from the first DataFrame chunk. 
### This time, you will aggregate the results over all the DataFrame chunks in the dataset. 
This basically means you will be processing the entire dataset now. This is neat because you're going to be able to process the entire large dataset by just working on smaller pieces of it!

In [None]:
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Initialize empty DataFrame: data
data = pd.DataFrame()

# Iterate over each DataFrame chunk
for df_urb_pop in urb_pop_reader:

    # Check out specific country: df_pop_ceb
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']

    # Zip DataFrame columns of interest: pops
    pops = zip(df_pop_ceb['Total Population'],
                df_pop_ceb['Urban population (% of total)'])

    # Turn zip object into list: pops_list
    pops_list = list(pops)

    # Use list comprehension to create new DataFrame column 'Total Urban Population'
    df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
    
    # Concatenate DataFrame chunk to the end of data: data
    data = pd.concat([data, df_pop_ceb])

# Plot urban population data
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

In [None]:
# Define plot_pop()
def plot_pop(filename, country_code):

    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Initialize empty DataFrame: data
    data = pd.DataFrame()
    
    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Check out specific country: df_pop_ceb
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]

        # Zip DataFrame columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                    df_pop_ceb['Urban population (% of total)'])

        # Turn zip object into list: pops_list
        pops_list = list(pops)

        # Use list comprehension to create new DataFrame column 'Total Urban Population'
        df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
        
        # Concatenate DataFrame chunk to the end of data: data
        data = pd.concat([data, df_pop_ceb])

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

# Set the filename: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')

# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')