In [1]:
# "pandas" is the module name
# "pd" is the alias for "pandas"

import pandas as pd

In [2]:
# Build a DataFrame from a list of dictionaries: each dictionary is one row,
# and its keys become the column names.
# NOTE(review): "Missoury" looks like a misspelling of "Missouri"; it is kept
# unchanged here so the data matches the recorded output - confirm and fix.
df_lod = pd.DataFrame(
    data=[
        {"City": city, "State": state}
        for city, state in (
            ("Jersey City", "New Jersey"),
            ("Morristown", "New Jersey"),
            ("St. Louis", "Missoury"),
            ("Rome City", "Indiana"),
        )
    ]
)
df_lod

Unnamed: 0,City,State
0,Jersey City,New Jersey
1,Morristown,New Jersey
2,St. Louis,Missoury
3,Rome City,Indiana


In [4]:
# Creating a new dataFrame from a dictionary whose values are all scalars
# NOTE this raises ValueError: when every value is a scalar and no index is
# given, pandas cannot tell how many rows to build.
# (Mixing a list with a scalar, e.g. {"City": ["Doylestown"], "State": "Pennsylvania"},
# does NOT fail - the scalar is broadcast to the length of the list.)
# The exception is caught and printed so the notebook still runs top to bottom.
try:
    df_dol = pd.DataFrame({"City": "Doylestown", "State": "Pennsylvania"})
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

In [5]:
# Build a DataFrame from a dictionary of lists: each key is a column name and
# each list holds that column's values. Every column is given as a list here,
# so all columns have an explicit (and equal) length.
df_dol = pd.DataFrame(
    data={
        "City": ["Doylestown"],
        "State": ["Pennsylvania"],
    }
)
df_dol

Unnamed: 0,City,State
0,Doylestown,Pennsylvania


In [6]:
import numpy as np

In [7]:
# Draw one random integer from [20000, 300000): the lower bound is inclusive,
# the upper bound is exclusive.
np.random.randint(low=20000, high=300000)

140890

In [8]:
# One row per case number (10..49), each with a random account number drawn
# from [200000, 300000) (upper bound exclusive).
# All account numbers are drawn in a single vectorized call instead of one
# Python-level call per row; size is derived from the frame so the two
# columns cannot get out of step, and dtype is pinned so the column is int64
# on every platform.
df_1 = pd.DataFrame({"case number": range(10, 50)})
df_1["accounts"] = np.random.randint(200000, 300000, size=len(df_1), dtype=np.int64)


In [9]:
# One row per case number (10..49), each with a random dollar amount stored as
# text: a "$" prefix followed by an integer drawn from [200, 1000000).
df_2 = pd.DataFrame(
    {
        "case number": range(10, 50),
        "value": [f"${np.random.randint(200, 1000000)}" for _ in range(10, 50)],
    }
)
df_2.head()

Unnamed: 0,case number,value
0,10,$463298
1,11,$867327
2,12,$885136
3,13,$115269
4,14,$847979


In [10]:
# Combine the two tables on their shared "case number" key: both frames are
# re-indexed by it, then the accounts are joined onto the values
# (DataFrame.join aligns on the index and is a left join by default).
df_3 = (
    df_2
    .set_index("case number")
    .join(other=df_1.set_index("case number"))
)
df_3.head()

Unnamed: 0_level_0,value,accounts
case number,Unnamed: 1_level_1,Unnamed: 2_level_1
10,$463298,275525
11,$867327,217081
12,$885136,268104
13,$115269,253537
14,$847979,249752


In [11]:
# Strip the literal "$" prefix from the text amounts and convert them to floats.
# regex=False is essential: read as a regular expression, "$" is the
# end-of-string anchor, so with regex=True recent pandas versions remove
# nothing and float("$463298") raises ValueError.
df_2['Value (float)'] = df_2['value'].str.replace("$", "", regex=False).map(float)
df_2.head()

Unnamed: 0,case number,value,Value (float)
0,10,$463298,463298.0
1,11,$867327,867327.0
2,12,$885136,885136.0
3,13,$115269,115269.0
4,14,$847979,847979.0


In [12]:
# Creating categories using bins

# N bin edges define N-1 consecutive intervals
bins = [200,100000,300000,500000,700000,900000,1000000]

# one label (category) per interval, so N-1 labels
group_labels=["200-100K","100K-300K","300K-500K","500K-700K","700K-900K","900K-1M"]

# pd.cut builds right-closed intervals, (200, 100000], (100000, 300000], ...
# By default the very first edge is excluded, so a value of exactly 200 (the
# smallest amount the generator above can produce) would match no bin and
# become NaN. include_lowest=True closes the first interval on the left.
df_2["Wealth ranges"] = pd.cut(
    df_2['Value (float)'],
    bins,
    labels=group_labels,
    include_lowest=True,
)
df_2.head()

Unnamed: 0,case number,value,Value (float),Wealth ranges
0,10,$463298,463298.0,300K-500K
1,11,$867327,867327.0,700K-900K
2,12,$885136,885136.0,700K-900K
3,13,$115269,115269.0,100K-300K
4,14,$847979,847979.0,700K-900K


In [13]:
# Order the rows from the smallest to the largest numeric value
# (returns a sorted copy; df_2 itself is left unchanged)
df_2.sort_values(by="Value (float)", ascending=True)


Unnamed: 0,case number,value,Value (float),Wealth ranges
32,42,$10411,10411.0,200-100K
8,18,$57505,57505.0,200-100K
27,37,$77793,77793.0,200-100K
18,28,$78530,78530.0,200-100K
3,13,$115269,115269.0,100K-300K
26,36,$126586,126586.0,100K-300K
22,32,$144266,144266.0,100K-300K
15,25,$154497,154497.0,100K-300K
17,27,$171428,171428.0,100K-300K
25,35,$176725,176725.0,100K-300K


In [14]:
# Order the rows from the largest to the smallest numeric value
# (returns a sorted copy; df_2 itself is left unchanged)
df_2.sort_values(by="Value (float)", ascending=False)

Unnamed: 0,case number,value,Value (float),Wealth ranges
16,26,$886813,886813.0,700K-900K
2,12,$885136,885136.0,700K-900K
39,49,$875250,875250.0,700K-900K
33,43,$874207,874207.0,700K-900K
1,11,$867327,867327.0,700K-900K
4,14,$847979,847979.0,700K-900K
38,48,$817791,817791.0,700K-900K
10,20,$813918,813918.0,700K-900K
36,46,$765227,765227.0,700K-900K
35,45,$738874,738874.0,700K-900K


In [15]:
# Grouping data by wealth range and counting the rows in each range.
# "Wealth ranges" is a categorical column; observed=False is passed explicitly
# so that ranges containing no cases are still listed (with a count of 0)
# instead of depending on the library default for categorical groupers.
df_2.groupby("Wealth ranges", observed=False).count()
# Alternative aggregations (swap in for the line above); numeric_only=True
# restricts them to the numeric columns so the text column "value" is skipped:
#df_2.groupby("Wealth ranges", observed=False).sum(numeric_only=True)
# df_2.groupby("Wealth ranges", observed=False).mean(numeric_only=True)


Unnamed: 0_level_0,case number,value,Value (float)
Wealth ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200-100K,4,4,4
100K-300K,12,12,12
300K-500K,6,6,6
500K-700K,7,7,7
700K-900K,11,11,11
900K-1M,0,0,0
