# Series and Columns

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the Titanic passenger data, then check what kind of object comes back
# when a single column is selected with square brackets.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")
age_column = titanic["age"]
type(age_column)

pandas.core.series.Series

In [3]:
# Add up every value stored in the "survived" column.
survived_column = titanic["survived"]
survived_column.sum()

500

In [4]:
# Load the house-sales data and report the largest value in "price".
houses = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
sale_prices = houses["price"]
sale_prices.max()

7700000.0

In [5]:
# Smallest value in the "price" column.
lowest_price = houses["price"].min()
lowest_price

75000.0

## numeric_only=True

In [7]:
# Column-by-column totals. numeric_only=True limits the reduction to the
# numeric columns, so non-numeric ones do not appear in the result.
houses.sum(axis="index", numeric_only=True)

id               9.899406e+13
price            1.167293e+10
bedrooms         7.285400e+04
bathrooms        4.570625e+04
sqft_living      4.495287e+07
sqft_lot         3.265069e+08
floors           3.229650e+04
waterfront       1.630000e+02
view             5.064000e+03
condition        7.368800e+04
grade            1.654880e+05
sqft_above       3.865249e+07
sqft_basement    6.300385e+06
yr_built         4.259933e+07
yr_renovated     1.824186e+06
zipcode          2.119759e+09
lat              1.027915e+06
long            -2.641409e+06
sqft_living15    4.293536e+07
sqft_lot15       2.759646e+08
dtype: float64

In [10]:
# Dimensions of the whole DataFrame as (rows, columns).
n_rows, n_cols = houses.shape
(n_rows, n_cols)

(21613, 21)

In [12]:
# A single column is one-dimensional, so its shape is a 1-tuple (rows,).
bedrooms_column = houses["bedrooms"]
bedrooms_column.shape

(21613,)

In [16]:
# Read the Netflix titles file, using "|" as the field separator.
netflix = pd.read_csv(filepath_or_buffer="netflix_titles.csv", sep="|")

# .values exposes the data held by a Series as a plain array,
# without the index labels attached.
title_column = netflix["title"]
title_column.values

array(['Dick Johnson Is Dead', 'Blood & Water', 'Ganglands', ...,
       'Zombieland', 'Zoom', 'Zubaan'], dtype=object)

In [17]:
# .index holds the labels that accompany the column's values
# (for a column taken from a DataFrame, these are the row labels).
title_series = netflix["title"]
title_series.index

RangeIndex(start=0, stop=8807, step=1)

In [18]:
# A DataFrame reduction such as min() also returns a Series, but its index is
# made of the column names rather than the row labels.
numeric_minimums = netflix.min(numeric_only=True)
numeric_minimums.index

Index(['Unnamed: 0', 'release_year'], dtype='object')

In [19]:
# Displaying the full result of min() shows why its index differs from a
# column's index: the column names are on the left (the index) and each
# column's minimum is on the right (the values).
#
# A bare netflix.min() depends on pandas silently skipping the columns it
# cannot reduce (text columns that also contain NaN). That behaviour is
# deprecated and raises TypeError on pandas 2.x, so those columns are dropped
# explicitly before reducing.
# NOTE: dropna(axis="columns") removes every column with a missing value; in
# this dataset the numeric columns have no missing values, so the surviving
# columns are the same ones the bare call reported.
netflix.dropna(axis="columns").min()

  netflix.min()


Unnamed: 0                                                      0
show_id                                                        s1
type                                                        Movie
title                                                      #Alive
release_year                                                 1925
listed_in                                      Action & Adventure
description     "Bridgerton" cast members share behind-the-sce...
dtype: object

## Important Series Methods

In [22]:
# Summary statistics for every column; include="all" covers the
# non-numeric columns as well as the numeric ones.
titanic_summary = titanic.describe(include="all")
titanic_summary

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309.0,1309.0,1309,1309,1309,1309.0,1309.0,1309,1309.0,1309,1309,1309,1309,1309
unique,,,1307,2,99,,,929,282.0,187,4,28,122,370
top,,,"Connolly, Miss. Kate",male,?,,,CA. 2343,8.05,?,S,?,?,?
freq,,,2,843,263,,,11,60.0,1014,914,823,1188,564
mean,2.294882,0.381971,,,,0.498854,0.385027,,,,,,,
std,0.837836,0.486055,,,,1.041658,0.86556,,,,,,,
min,1.0,0.0,,,,0.0,0.0,,,,,,,
25%,2.0,0.0,,,,0.0,0.0,,,,,,,
50%,3.0,0.0,,,,0.0,0.0,,,,,,,
75%,3.0,1.0,,,,1.0,0.0,,,,,,,


## unique() and nunique()

In [23]:
# The distinct values that occur in the "rating" column, returned as an array.
rating_values = netflix["rating"]
rating_values.unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [24]:
# How many distinct values the "rating" column contains.
distinct_rating_count = netflix["rating"].nunique()
distinct_rating_count

17

In [None]:
# By default, NaN is not counted:
# nunique() drops null values unless dropna=False is passed.

In [26]:
# dropna=True is the default: NaN is left out of the distinct count.
rating_column = netflix["rating"]
rating_column.nunique(dropna=True)

17

In [27]:
# With dropna=False, NaN is counted as one more distinct value.
rating_column = netflix["rating"]
rating_column.nunique(dropna=False)

18

## nlargest & nsmallest

In [33]:
# Load the bestsellers data used in the nlargest / nsmallest examples.
bs = pd.read_csv(filepath_or_buffer="bestsellers.csv")

In [36]:
# The five highest user ratings. n=5 is the default for nlargest();
# it is written out here to make that visible.
user_ratings = bs["User Rating"]
user_ratings.nlargest(n=5)

40    4.9
41    4.9
81    4.9
82    4.9
83    4.9
Name: User Rating, dtype: float64

In [38]:
# The ten lowest user ratings, smallest first.
user_ratings = bs["User Rating"]
user_ratings.nsmallest(n=10)

353    3.3
132    3.6
106    3.8
107    3.8
22     3.9
392    3.9
393    3.9
135    4.0
136    4.0
137    4.0
Name: User Rating, dtype: float64

In [39]:
# DataFrame.nlargest returns complete rows (all columns), ordered from the
# largest value of the given column downwards; here, the ten most expensive
# houses. The pandas documentation describes it as equivalent to
# sort_values(..., ascending=False).head(n), but more performant.
houses.nlargest(n=10, columns=["price"])

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7252,6762700020,20141013T000000,7700000.0,6,8.0,12050,27600,2.5,0,3,...,13,8570,3480,1910,1987,98102,47.6298,-122.323,3940,8800
3914,9808700762,20140611T000000,7062500.0,5,4.5,10040,37325,2.0,1,2,...,11,7680,2360,1940,2001,98004,47.65,-122.214,3930,25449
9254,9208900037,20140919T000000,6885000.0,6,7.75,9890,31374,2.0,0,4,...,13,8860,1030,2001,0,98039,47.6305,-122.24,4540,42730
4411,2470100110,20140804T000000,5570000.0,5,5.75,9200,35069,2.0,0,0,...,13,6200,3000,2001,0,98039,47.6289,-122.233,3560,24345
1448,8907500070,20150413T000000,5350000.0,5,5.0,8000,23985,2.0,0,4,...,12,6720,1280,2009,0,98004,47.6232,-122.22,4600,21750
1315,7558700030,20150413T000000,5300000.0,6,6.0,7390,24829,2.0,1,4,...,12,5000,2390,1991,0,98040,47.5631,-122.21,4320,24619
1164,1247600105,20141020T000000,5110800.0,5,5.25,8010,45517,2.0,1,4,...,12,5990,2020,1999,0,98033,47.6767,-122.211,3430,26788
8092,1924059029,20140617T000000,4668000.0,5,6.75,9640,13068,1.0,1,4,...,12,4820,4820,1983,2009,98040,47.557,-122.21,3270,10454
2626,7738500731,20140815T000000,4500000.0,5,5.5,6640,40014,2.0,1,4,...,12,6350,290,2004,0,98155,47.7493,-122.28,3030,23408
8638,3835500195,20140618T000000,4489000.0,4,3.0,6430,27517,2.0,0,0,...,12,6430,0,2001,0,98004,47.6208,-122.219,3720,14592


In [41]:
# The ten rows with the most bedrooms. Rows tied on "bedrooms" are ordered
# by "bathrooms", also from largest to smallest.
houses.nlargest(n=10, columns=["bedrooms", "bathrooms"])

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15870,2402100895,20140625T000000,640000.0,33,1.75,1620,6000,1.0,0,0,...,7,1040,580,1947,0,98103,47.6878,-122.331,1330,4700
8757,1773100755,20140821T000000,520000.0,11,3.0,3000,4960,2.0,0,0,...,7,2400,600,1918,1999,98106,47.556,-122.363,1420,4960
13314,627300145,20140814T000000,1148000.0,10,5.25,4590,10920,1.0,0,2,...,9,2500,2090,2008,0,98004,47.5861,-122.113,2730,10400
19254,8812401450,20141229T000000,660000.0,10,3.0,2920,3745,2.0,0,0,...,7,1860,1060,1913,0,98105,47.6635,-122.32,1810,3745
15161,5566100170,20141029T000000,650000.0,10,2.0,3610,11914,2.0,0,0,...,7,3010,600,1958,0,98006,47.5705,-122.175,2040,11914
8546,424049043,20140811T000000,450000.0,9,7.5,4050,6504,2.0,0,0,...,7,4050,0,1996,0,98144,47.5923,-122.301,1448,3866
4096,1997200215,20140507T000000,599999.0,9,4.5,3830,6988,2.5,0,0,...,7,2450,1380,1938,0,98103,47.6927,-122.338,1460,6291
6079,9822700190,20140808T000000,1280000.0,9,4.5,3650,5000,2.0,0,0,...,8,2530,1120,1915,2010,98105,47.6604,-122.289,2510,5000
16844,8823900290,20150317T000000,1400000.0,9,4.0,4620,5508,2.5,0,0,...,11,3870,750,1915,0,98105,47.6684,-122.309,2710,4320
4235,2902200015,20150106T000000,700000.0,9,3.0,3680,4400,2.0,0,0,...,7,2830,850,1908,0,98102,47.6374,-122.324,1960,2450


### keep=all

In [42]:
# keep="all" retains every row tied with the n-th largest value instead of
# cutting the result off at n, so the result can contain more than n rows.
# That is why asking for 5 here returns far more than 5 entries.
pclass_column = titanic["pclass"]
pclass_column.nlargest(n=5, keep="all")

600     3
601     3
602     3
603     3
604     3
       ..
1304    3
1305    3
1306    3
1307    3
1308    3
Name: pclass, Length: 709, dtype: int64

## count()

In [44]:
# Number of non-null entries in each column. A column whose count is lower
# than the total number of rows contains missing values.
netflix.count(axis="index")

Unnamed: 0      8807
show_id         8807
type            8807
title           8807
director        6173
cast            7982
country         7976
date_added      8797
release_year    8807
rating          8803
duration        8804
listed_in       8807
description     8807
dtype: int64