In [None]:
# What is the point of Pandas?
# A python package for data manipulation and can also be used for data visualization

# Pandas is built on Numpy and Matplotlib
# It is designed to work with tabular / rectangular data

# in pandas, rectangular data is presented as a dataframe, for comparision in SQL this is just a visualized table


In [14]:
import pandas as pd

data = {
    "region": ["East South Central", "Pacific", "Mountain", "West South Central", "Pacific", "Mountain", "New England",
               "South Atlantic", "South Atlantic", "South Atlantic", "South Atlantic", "Pacific", "Mountain",
               "East North Central", "East North Central", "West North Central", "West North Central", 
               "East South Central", "West South Central", "New England", "South Atlantic", "New England", 
               "East North Central", "West North Central", "East South Central", "West North Central", "Mountain", 
               "West North Central", "Mountain", "New England", "Mid-Atlantic", "Mountain", "Mid-Atlantic", 
               "South Atlantic", "West North Central", "East North Central", "West South Central", "Pacific", 
               "Mid-Atlantic", "New England", "South Atlantic", "West North Central", "East South Central", 
               "West South Central", "Mountain", "New England", "South Atlantic", "Pacific", "South Atlantic", 
               "East North Central", "Mountain"],
    "state": ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
              "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
              "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
              "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", 
              "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", 
              "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", 
              "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"],
    "individuals": [2570, 1434, 7259, 2280, 109008, 7607, 2280, 708, 3770, 21443, 6943, 4131, 1297, 6752, 3776, 
                    1711, 1443, 2735, 2540, 1450, 4914, 6811, 5209, 3993, 1024, 3776, 983, 1745, 7058, 835, 6048, 
                    1949, 39827, 6451, 467, 6929, 2823, 11139, 8163, 747, 3082, 836, 6139, 19199, 1904, 780, 3928, 
                    16424, 1021, 2740, 434],
    "family_members": [864, 582, 2606, 432, 20964, 3250, 1696, 374, 3134, 9587, 2556, 2399, 715, 3891, 1482, 1038, 
                       773, 953, 519, 1066, 2230, 13257, 3142, 3250, 328, 2107, 422, 676, 486, 615, 3350, 602, 
                       52070, 2817, 75, 3320, 1048, 3337, 5349, 354, 851, 323, 1744, 6111, 972, 511, 2047, 5880, 
                       222, 2167, 205],
    "state_pop": [4887681, 735139, 7158024, 3009733, 39461588, 5691287, 3571520, 965479, 701547, 21244317, 10511131, 
                  1420593, 1750536, 12723071, 6695497, 3148618, 2911359, 4461153, 4659690, 1339057, 6035802, 6882635, 
                  9984072, 5606249, 2981020, 6121623, 1060665, 1925614, 3027341, 1353465, 8886025, 2092741, 19530351, 
                  10381615, 758080, 11676341, 3940235, 4181886, 12800922, 1058287, 5084156, 878698, 6771631, 28628666, 
                  3153550, 624358, 8501286, 7523869, 1804291, 5807406, 577601]
}

homelessness = pd.DataFrame(data)

# homelessness is a DataFrame containing estimates of homelessness in each U.S. state in 2018. 
# The individual column is the number of homeless individuals not part of a family with children. 
# The family_members column is the number of homeless individuals part of a family with children. 
# The state_pop column is the state's total population.


In [10]:
# returns the first few rows (the “head” of the DataFrame).
print(homelessness.head())


               region       state  individuals  family_members  state_pop
0  East South Central     Alabama         2570             864    4887681
1             Pacific      Alaska         1434             582     735139
2            Mountain     Arizona         7259            2606    7158024
3  West South Central    Arkansas         2280             432    3009733
4             Pacific  California       109008           20964   39461588


In [11]:
#  shows information on each of the columns, such as the data type and number of missing values.
print(homelessness.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   region          51 non-null     object
 1   state           51 non-null     object
 2   individuals     51 non-null     int64 
 3   family_members  51 non-null     int64 
 4   state_pop       51 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 2.1+ KB
None


In [12]:
# returns the number of rows and columns of the DataFrame.
print(homelessness.shape)

(51, 5)


In [13]:
# calculates a few summary statistics for each column.
print(homelessness.describe())

         individuals  family_members     state_pop
count      51.000000       51.000000  5.100000e+01
mean     7225.784314     3504.882353  6.405637e+06
std     15991.025083     7805.411811  7.327258e+06
min       434.000000       75.000000  5.776010e+05
25%      1446.500000      592.000000  1.777414e+06
50%      3082.000000     1482.000000  4.461153e+06
75%      6781.500000     3196.000000  7.340946e+06
max    109008.000000    52070.000000  3.946159e+07


In [55]:
#  A two-dimensional NumPy array of values.
print(homelessness.values)

[['East South Central' 'Alabama' 2570 864 4887681]
 ['Pacific' 'Alaska' 1434 582 735139]
 ['Mountain' 'Arizona' 7259 2606 7158024]
 ['West South Central' 'Arkansas' 2280 432 3009733]
 ['Pacific' 'California' 109008 20964 39461588]
 ['Mountain' 'Colorado' 7607 3250 5691287]
 ['New England' 'Connecticut' 2280 1696 3571520]
 ['South Atlantic' 'Delaware' 708 374 965479]
 ['South Atlantic' 'District of Columbia' 3770 3134 701547]
 ['South Atlantic' 'Florida' 21443 9587 21244317]
 ['South Atlantic' 'Georgia' 6943 2556 10511131]
 ['Pacific' 'Hawaii' 4131 2399 1420593]
 ['Mountain' 'Idaho' 1297 715 1750536]
 ['East North Central' 'Illinois' 6752 3891 12723071]
 ['East North Central' 'Indiana' 3776 1482 6695497]
 ['West North Central' 'Iowa' 1711 1038 3148618]
 ['West North Central' 'Kansas' 1443 773 2911359]
 ['East South Central' 'Kentucky' 2735 953 4461153]
 ['West South Central' 'Louisiana' 2540 519 4659690]
 ['New England' 'Maine' 1450 1066 1339057]
 ['South Atlantic' 'Maryland' 4914 2230 

In [52]:
# An index of columns: the column names.
print(homelessness.columns)

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')


In [53]:
# An index for the rows: either row numbers or row names.
print(homelessness.index)

RangeIndex(start=0, stop=51, step=1)


In [54]:
homelessness_ind = homelessness.sort_values("individuals")
print(homelessness_ind.head())

                region         state  individuals  family_members  state_pop
50            Mountain       Wyoming          434             205     577601
34  West North Central  North Dakota          467              75     758080
7       South Atlantic      Delaware          708             374     965479
39         New England  Rhode Island          747             354    1058287
45         New England       Vermont          780             511     624358


In [56]:
homelessness_fam = homelessness.sort_values("family_members", ascending = False)
print(homelessness_fam.head())

                region          state  individuals  family_members  state_pop
32        Mid-Atlantic       New York        39827           52070   19530351
4              Pacific     California       109008           20964   39461588
21         New England  Massachusetts         6811           13257    6882635
9       South Atlantic        Florida        21443            9587   21244317
43  West South Central          Texas        19199            6111   28628666


In [22]:
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending = [True, False])
print(homelessness_reg_fam.head())

                region      state  individuals  family_members  state_pop
13  East North Central   Illinois         6752            3891   12723071
35  East North Central       Ohio         6929            3320   11676341
22  East North Central   Michigan         5209            3142    9984072
49  East North Central  Wisconsin         2740            2167    5807406
14  East North Central    Indiana         3776            1482    6695497


In [24]:
individuals = homelessness["individuals"]
print(individuals.head())


0      2570
1      1434
2      7259
3      2280
4    109008
Name: individuals, dtype: int64


In [26]:
state_fam = homelessness[["state", "family_members"]]
print(state_fam.head())

        state  family_members
0     Alabama             864
1      Alaska             582
2     Arizona            2606
3    Arkansas             432
4  California           20964


In [27]:
ind_state = homelessness[["state", "individuals"]]
print(ind_state.head())

        state  individuals
0     Alabama         2570
1      Alaska         1434
2     Arizona         7259
3    Arkansas         2280
4  California       109008


In [40]:
ind_gt_10k_filter = homelessness["individuals"] > 10000
print(ind_gt_10k_filter.head())

0    False
1    False
2    False
3    False
4     True
Name: individuals, dtype: bool


In [45]:
ind_gt_10k_ = homelessness[homelessness["individuals"] > 10000]
print(ind_gt_10k)

   region state  individuals  family_members  state_pop
0     NaN   NaN          NaN             NaN        NaN
1     NaN   NaN          NaN             NaN        NaN
2     NaN   NaN          NaN             NaN        NaN
3     NaN   NaN          NaN             NaN        NaN
4     NaN   NaN     109008.0             NaN        NaN
5     NaN   NaN          NaN             NaN        NaN
6     NaN   NaN          NaN             NaN        NaN
7     NaN   NaN          NaN             NaN        NaN
8     NaN   NaN          NaN             NaN        NaN
9     NaN   NaN      21443.0             NaN        NaN
10    NaN   NaN          NaN             NaN        NaN
11    NaN   NaN          NaN             NaN        NaN
12    NaN   NaN          NaN             NaN        NaN
13    NaN   NaN          NaN             NaN        NaN
14    NaN   NaN          NaN             NaN        NaN
15    NaN   NaN          NaN             NaN        NaN
16    NaN   NaN          NaN             NaN    

In [46]:
mountain_reg =  homelessness[homelessness['region'] == 'Mountain']
print(mountain_reg)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona         7259            2606    7158024
5   Mountain    Colorado         7607            3250    5691287
12  Mountain       Idaho         1297             715    1750536
26  Mountain     Montana          983             422    1060665
28  Mountain      Nevada         7058             486    3027341
31  Mountain  New Mexico         1949             602    2092741
44  Mountain        Utah         1904             972    3153550
50  Mountain     Wyoming          434             205     577601


In [49]:
fam_lt_1k_pac = homelessness[(homelessness['family_members'] < 1000) & (homelessness['region'] == "Pacific")]
print(fam_lt_1k_pac)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska         1434             582     735139


In [51]:
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]

mojave_homelessness =  homelessness[homelessness['state'].isin(canu)]
print(mojave_homelessness)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona         7259            2606    7158024
4    Pacific  California       109008           20964   39461588
28  Mountain      Nevada         7058             486    3027341
44  Mountain        Utah         1904             972    3153550


In [59]:
homelessness['total'] = homelessness['individuals'] + homelessness['family_members']
print(homelessness[['total', 'individuals', 'family_members']].head())

    total  individuals  family_members
0    3434         2570             864
1    2016         1434             582
2    9865         7259            2606
3    2712         2280             432
4  129972       109008           20964


In [64]:
homelessness['total'] = homelessness['individuals'] + homelessness['family_members']
homelessness['percent_homeless'] = (homelessness['total'] / homelessness['state_pop']) * 100
print(homelessness[['total', 'state_pop', 'state', 'percent_homeless']].head())

    total  state_pop       state  percent_homeless
0    3434    4887681     Alabama          0.070258
1    2016     735139      Alaska          0.274234
2    9865    7158024     Arizona          0.137817
3    2712    3009733    Arkansas          0.090108
4  129972   39461588  California          0.329363


In [67]:
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
print(homelessness)

                region                 state  individuals  family_members  \
0   East South Central               Alabama         2570             864   
1              Pacific                Alaska         1434             582   
2             Mountain               Arizona         7259            2606   
3   West South Central              Arkansas         2280             432   
4              Pacific            California       109008           20964   
5             Mountain              Colorado         7607            3250   
6          New England           Connecticut         2280            1696   
7       South Atlantic              Delaware          708             374   
8       South Atlantic  District of Columbia         3770            3134   
9       South Atlantic               Florida        21443            9587   
10      South Atlantic               Georgia         6943            2556   
11             Pacific                Hawaii         4131            2399   

In [78]:
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness['indiv_per_10k'] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values('indiv_per_10k', ascending = False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
print(high_homelessness_srt [['state','indiv_per_10k']])

                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363
