# Introduction to Data Wrangling with Python

In [1]:
# import libraries
import pandas as pd

In [2]:
# create a list
ssn = list(pd.read_csv('datasets/ssn.csv'))
ssn

['218-68-9955',
 '165-73-3124',
 '432-47-4043',
 '563-93-1393',
 '153-93-3401',
 '670-09-7369',
 '123-05-9652',
 '812-13-2476',
 '726-13-1007',
 '825-05-4836']

In [3]:
# accessing the first element
ssn[0]

'218-68-9955'

In [4]:
# accessing the last element
ssn[-1]

'825-05-4836'

In [5]:
# Access the first three elements of ssn
ssn[0:3]

['218-68-9955', '165-73-3124', '432-47-4043']

In [6]:
# Access the last two elements of ssn
ssn[-2:]

['726-13-1007', '825-05-4836']

In [7]:
# Reverse the elements in the list
ssn[-1::-1]

['825-05-4836',
 '726-13-1007',
 '812-13-2476',
 '123-05-9652',
 '670-09-7369',
 '153-93-3401',
 '563-93-1393',
 '432-47-4043',
 '165-73-3124',
 '218-68-9955']

In [8]:
# Create a list using the append method
ssn_2 = []
for element in ssn:
    ssn_2.append(element)

ssn_2

['218-68-9955',
 '165-73-3124',
 '432-47-4043',
 '563-93-1393',
 '153-93-3401',
 '670-09-7369',
 '123-05-9652',
 '812-13-2476',
 '726-13-1007',
 '825-05-4836']

In [9]:
# Generating a new list by list comprehension
ssn_3 = ["soc: " + x for x in ssn_2]
ssn_3

['soc: 218-68-9955',
 'soc: 165-73-3124',
 'soc: 432-47-4043',
 'soc: 563-93-1393',
 'soc: 153-93-3401',
 'soc: 670-09-7369',
 'soc: 123-05-9652',
 'soc: 812-13-2476',
 'soc: 726-13-1007',
 'soc: 825-05-4836']

In [10]:
# Search all the social security numbers with the number 5 in them
numbers = [x for x in ssn_3 if "5" in x]
numbers

['soc: 218-68-9955',
 'soc: 165-73-3124',
 'soc: 563-93-1393',
 'soc: 153-93-3401',
 'soc: 123-05-9652',
 'soc: 825-05-4836']

In [11]:
# Generate a list by adding the two lists
ssn_4 = ["102-90-0314" , "247-17-2338" , "318-22-2760"]
ssn_5 = ssn_4 + ssn
ssn_5

['102-90-0314',
 '247-17-2338',
 '318-22-2760',
 '218-68-9955',
 '165-73-3124',
 '432-47-4043',
 '563-93-1393',
 '153-93-3401',
 '670-09-7369',
 '123-05-9652',
 '812-13-2476',
 '726-13-1007',
 '825-05-4836']

In [12]:
# Extend a string using the extend keyword
ssn_2.extend(ssn_4)
ssn_2

['218-68-9955',
 '165-73-3124',
 '432-47-4043',
 '563-93-1393',
 '153-93-3401',
 '670-09-7369',
 '123-05-9652',
 '812-13-2476',
 '726-13-1007',
 '825-05-4836',
 '102-90-0314',
 '247-17-2338',
 '318-22-2760']

In [13]:
# Loop over the first list and create a nested list inside that loop that goes over the second list
for x in ssn_2:
    for y in ssn_5:
        print(str(x) + ' , ' + str(y))

218-68-9955 , 102-90-0314
218-68-9955 , 247-17-2338
218-68-9955 , 318-22-2760
218-68-9955 , 218-68-9955
218-68-9955 , 165-73-3124
218-68-9955 , 432-47-4043
218-68-9955 , 563-93-1393
218-68-9955 , 153-93-3401
218-68-9955 , 670-09-7369
218-68-9955 , 123-05-9652
218-68-9955 , 812-13-2476
218-68-9955 , 726-13-1007
218-68-9955 , 825-05-4836
165-73-3124 , 102-90-0314
165-73-3124 , 247-17-2338
165-73-3124 , 318-22-2760
165-73-3124 , 218-68-9955
165-73-3124 , 165-73-3124
165-73-3124 , 432-47-4043
165-73-3124 , 563-93-1393
165-73-3124 , 153-93-3401
165-73-3124 , 670-09-7369
165-73-3124 , 123-05-9652
165-73-3124 , 812-13-2476
165-73-3124 , 726-13-1007
165-73-3124 , 825-05-4836
432-47-4043 , 102-90-0314
432-47-4043 , 247-17-2338
432-47-4043 , 318-22-2760
432-47-4043 , 218-68-9955
432-47-4043 , 165-73-3124
432-47-4043 , 432-47-4043
432-47-4043 , 563-93-1393
432-47-4043 , 153-93-3401
432-47-4043 , 670-09-7369
432-47-4043 , 123-05-9652
432-47-4043 , 812-13-2476
432-47-4043 , 726-13-1007
432-47-4043 

In [14]:
# import data
car_models = list(pd.read_csv('datasets/car_models.csv'))
car_models

['Escalade ',
 ' X5 M',
 'D150',
 'Camaro',
 'F350',
 'Aurora',
 'S8',
 'E350',
 'Tiburon',
 'F-Series Super Duty ']

In [15]:
# Iterate over a list
list_1 = [x for x in car_models]
for i in list_1:
    print(i)

Escalade 
 X5 M
D150
Camaro
F350
Aurora
S8
E350
Tiburon
F-Series Super Duty 


In [16]:
# Use the sort method with reverse=True
list_1 = [*range(0,21,1)]
list_1.sort(reverse=True)
list_1

[20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [17]:
# Use the reverse method directly
list_1.reverse()
list_1

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [18]:
# import library
import random

In [21]:
# Use the randint method to generate some random integers and add them to a list
list_1 = [random.randint(0,30) for x in range (0,100)]
print(list_1)

[19, 19, 13, 10, 28, 22, 6, 8, 15, 21, 16, 21, 29, 21, 5, 25, 6, 0, 11, 4, 17, 12, 26, 2, 6, 27, 15, 2, 13, 23, 16, 2, 26, 12, 26, 25, 0, 21, 14, 3, 14, 9, 23, 21, 9, 23, 13, 3, 14, 24, 6, 0, 5, 0, 3, 30, 29, 20, 7, 0, 16, 20, 18, 8, 20, 17, 4, 5, 22, 26, 21, 25, 7, 12, 24, 25, 5, 18, 5, 16, 15, 0, 28, 30, 21, 24, 13, 24, 13, 0, 4, 1, 3, 19, 0, 8, 9, 10, 22, 26]


In [22]:
# let's find the log of the elements of list_1
import math
list_2 = [math.log(x+1,10) for x in list_1]
print(list_2)

[1.301029995663981, 1.301029995663981, 1.1461280356782377, 1.041392685158225, 1.462397997898956, 1.3617278360175928, 0.8450980400142567, 0.9542425094393249, 1.2041199826559246, 1.3424226808222062, 1.2304489213782739, 1.3424226808222062, 1.4771212547196624, 1.3424226808222062, 0.7781512503836435, 1.414973347970818, 0.8450980400142567, 0.0, 1.0791812460476247, 0.6989700043360187, 1.2552725051033058, 1.1139433523068367, 1.4313637641589871, 0.47712125471966244, 0.8450980400142567, 1.447158031342219, 1.2041199826559246, 0.47712125471966244, 1.1461280356782377, 1.380211241711606, 1.2304489213782739, 0.47712125471966244, 1.4313637641589871, 1.1139433523068367, 1.4313637641589871, 1.414973347970818, 0.0, 1.3424226808222062, 1.1760912590556811, 0.6020599913279623, 1.1760912590556811, 1.0, 1.380211241711606, 1.3424226808222062, 1.0, 1.380211241711606, 1.1461280356782377, 0.6020599913279623, 1.1760912590556811, 1.3979400086720375, 0.8450980400142567, 0.0, 0.7781512503836435, 0.0, 0.60205999132796

In [30]:
# Consider the dictionary
stocks = {"Solar Capital Ltd.":"$920.44M",    
        "Zoe's Kitchen, Inc.":"$262.32M",
        "Toyota Motor Corp Ltd Ord":"$156.02B",    
        "Nuveen Virginia Quality Municipal Income Fund":"$238.33M",
        "Kinross Gold Corporation":"$5.1B",
        "Vulcan Materials Company":"$17.1B",
        "Hi-Crush Partners LP":"$955.69M",
        "Lennox International, Inc.":"$8.05B",
        "WMIH Corp.":"$247.66M",
        "Comerica Incorporated":"n/a"
}

In [31]:
# Remove the $ character from the stocks dictionary
for key, value in stocks.items():
    stocks[key] = value.replace("$", "")

stocks

{'Solar Capital Ltd.': '920.44M',
 "Zoe's Kitchen, Inc.": '262.32M',
 'Toyota Motor Corp Ltd Ord': '156.02B',
 'Nuveen Virginia Quality Municipal Income Fund': '238.33M',
 'Kinross Gold Corporation': '5.1B',
 'Vulcan Materials Company': '17.1B',
 'Hi-Crush Partners LP': '955.69M',
 'Lennox International, Inc.': '8.05B',
 'WMIH Corp.': '247.66M',
 'Comerica Incorporated': 'n/a'}

In [34]:
# Iterate over the stocks dictionary again and split the value into a list with price (val) and 
# multiplier (mult) as separate elements where a single value is assigned to each key
for key, value in stocks.items():
    mult = value[-1]
    stocks[key] = [value[:-1], mult]

stocks

{'Solar Capital Ltd.': ['920.44', 'M'],
 "Zoe's Kitchen, Inc.": ['262.32', 'M'],
 'Toyota Motor Corp Ltd Ord': ['156.02', 'B'],
 'Nuveen Virginia Quality Municipal Income Fund': ['238.33', 'M'],
 'Kinross Gold Corporation': ['5.1', 'B'],
 'Vulcan Materials Company': ['17.1', 'B'],
 'Hi-Crush Partners LP': ['955.69', 'M'],
 'Lennox International, Inc.': ['8.05', 'B'],
 'WMIH Corp.': ['247.66', 'M'],
 'Comerica Incorporated': ['n/', 'a']}

In [35]:
# generate a random list with duplicate values
import random

list_1 = [random.randint(0,30) for x in range(100)]

In [36]:
# Create a unique valued list from list_1
list(dict.fromkeys(list_1).keys())

[21,
 3,
 16,
 4,
 24,
 14,
 25,
 18,
 2,
 20,
 12,
 8,
 29,
 10,
 6,
 0,
 5,
 1,
 19,
 27,
 9,
 23,
 7,
 30,
 28,
 26,
 13,
 22,
 11,
 15]

In [37]:
# Create list_1 with five elements
dict_1 = {"key1": 1, "key2": ["list_element1", 34], "key3": "value3",
          "key4": {"subkey1": "v1"}, "key5": 4.5}

In [38]:
# use the del function and specify the element we want to delete
del dict_1["key2"]
dict_1

{'key1': 1, 'key3': 'value3', 'key4': {'subkey1': 'v1'}, 'key5': 4.5}

In [41]:
# Generate a dict that has 0 to 9 as the keys and the square of the key as the values
list_1 = [x for x in range(10)]
dict_1 = {x:x**2 for x in list_1}
dict_1

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}

In [42]:
# Generate a dictionary using the dict function
dict_2 = dict([('Tom', 100), ('Dick', 200), ('Harry', 300)])
dict_2

{'Tom': 100, 'Dick': 200, 'Harry': 300}

In [43]:
# generate a dictionary using the dict function
dict_3 = dict(Tom=100, Dick=200, Harry=300)
dict_3

{'Tom': 100, 'Dick': 200, 'Harry': 300}

In [45]:
# Create a string called str_1
str_1 = 'Hello World'
str_1

'Hello World'

In [49]:
# Create a string and convert it into a list using the split method
str_1 = "Name, Age, Sex, Address"
list_1 = str_1.split(',')
list_1

['Name', ' Age', ' Sex', ' Address']

In [50]:
# Combine this list into another string using the join method
s = ' | '
s.join(list_1)

'Name |  Age |  Sex |  Address'