In [1]:
import pandas as pd
import numpy as np

# get_dummies() function

Convert categorical variable into dummy/indicator variables.

In [11]:
# Build a four-element Series of category labels; "a" occurs twice so a
# repeated level is visible in the dummy encoding shown next.
s = pd.Series(["a", "b", "c", "a"])
s

0    a
1    b
2    c
3    a
dtype: object

In [12]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [13]:
s1 = ['a', 'b', np.nan]
s1

['a', 'b', nan]

In [15]:
pd.get_dummies(s1)

Unnamed: 0,a,b
0,1,0
1,0,1
2,0,0


In [19]:
pd.get_dummies(s1, dummy_na=True)

Unnamed: 0,a,b,NaN
0,1,0,0
1,0,1,0
2,0,0,1


In [16]:
# Mixed frame: two string columns (A, B) and one integer column (C).
df = pd.DataFrame(
    {
        "A": ["a", "b", "a"],
        "B": ["b", "a", "c"],
        "C": [1, 2, 3],
    }
)
df

Unnamed: 0,A,B,C
0,a,b,1
1,b,a,2
2,a,c,3


In [22]:
pd.get_dummies(df)

Unnamed: 0,C,A_a,A_b,B_a,B_b,B_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


In [23]:
# Give the dummy columns custom prefixes. The dict form spells out which
# source column receives which prefix (A -> col1, B -> col2).
pd.get_dummies(df, prefix={"A": "col1", "B": "col2"})

Unnamed: 0,C,col1_a,col1_b,col2_a,col2_b,col2_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


In [34]:
pd.Series(list("abcaa"))

0    a
1    b
2    c
3    a
4    a
dtype: object

In [35]:
pd.get_dummies(pd.Series(list('abcaa')))

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,1,0,0


`drop_first` argument: bool, default `False`. Whether to get k-1 dummies out of k categorical levels by removing the first level.

In [36]:
pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)

Unnamed: 0,b,c
0,0,0
1,1,0
2,0,1
3,0,0
4,0,0


In [37]:
pd.get_dummies(pd.Series(list('abc')), dtype=float)

Unnamed: 0,a,b,c
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0


In [40]:
# Four string columns A-D, each holding its own letter followed by the row
# number (A0..A3, B0..B3, ...), with an explicit 0-3 integer index.
df = pd.DataFrame(
    {col: [col + str(i) for i in range(4)] for col in "ABCD"},
    index=[0, 1, 2, 3],
)

In [41]:
df

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [42]:
pd.get_dummies(df)

Unnamed: 0,A_A0,A_A1,A_A2,A_A3,B_B0,B_B1,B_B2,B_B3,C_C0,C_C1,C_C2,C_C3,D_D0,D_D1,D_D2,D_D3
0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0
3,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1


In [43]:
pd.get_dummies(df["A"])

Unnamed: 0,A0,A1,A2,A3
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1


In [44]:
# Replace column "A" with the 0/1 indicator for level "A0".
# pd.get_dummies(df["A"]) returns one column per level (A0..A3); assigning
# that whole frame to a single column is ambiguous and raises
# "ValueError: Columns must be same length as key" on current pandas, so the
# wanted indicator column is selected explicitly. dtype=int keeps the values
# as 0/1 instead of booleans.
df["A"] = pd.get_dummies(df["A"], dtype=int)["A0"]

In [45]:
df

Unnamed: 0,A,B,C,D
0,1,B0,C0,D0
1,0,B1,C1,D1
2,0,B2,C2,D2
3,0,B3,C3,D3


In [17]:
# Toy people table: a categorical "gender" column plus a numeric "age" column.
df = pd.DataFrame(
    {
        "gender": ["male", "female", "female", "male"],
        "age": [17, 34, 43, 23],
    }
)
df

Unnamed: 0,gender,age
0,male,17
1,female,34
2,female,43
3,male,23


In [51]:
pd.get_dummies(df)

Unnamed: 0,age,gender_female,gender_male
0,17,0,1
1,34,1,0
2,43,1,0
3,23,0,1


In [52]:
pd.get_dummies(df, drop_first=True)

Unnamed: 0,age,gender_male
0,17,1
1,34,0
2,43,0
3,23,1


In [56]:
df = pd.DataFrame({"gender": ["male", "female", "female", "male"],
                   "age": [17,34,43,23]})

In [58]:
df["gender"]=="male"

0     True
1    False
2    False
3     True
Name: gender, dtype: bool

In [62]:
# Flag each row as male (True) or not (False).
# A bare for/if with `return` cannot work here: `return` is only legal inside
# a function, iterating a DataFrame directly yields its column labels rather
# than its rows, and `if df["gender"] == "male"` tests a whole boolean Series,
# whose truth value is ambiguous. Walking the column's values and comparing
# one value at a time avoids all three problems.
is_male = [gender == "male" for gender in df["gender"]]
is_male

[True, False, False, True]

In [64]:
df = pd.DataFrame({"gender": ["male", "female", "female", "male"],
                   "age": [17,34,43,23]})

In [65]:
# Lookup table used to encode gender as an integer: male -> 1, female -> 0.
map_dict = dict(male=1, female=0)

In [66]:
df["original"] = df.gender.map(map_dict)

In [67]:
df

Unnamed: 0,gender,age,original
0,male,17,1
1,female,34,0
2,female,43,0
3,male,23,1


In [69]:
# Drop the text column now that "original" holds its numeric encoding.
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps the cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns="gender", errors="ignore")

In [70]:
df

Unnamed: 0,age,original
0,17,1
1,34,0
2,43,0
3,23,1


In [5]:
# Load the passenger table from the shortened URL and preview its first rows.
# na_values="?" tells read_csv to treat "?" entries as missing values (NaN).
df = pd.read_csv("https://bit.ly/3kGDuKx", na_values="?")
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [8]:
# Sum of "survived" per passenger class: "pclass" is the column we group by,
# "survived" is the column the result is reported for.
df.groupby("pclass")["survived"].sum()

pclass
1    200
2    119
3    181
Name: survived, dtype: int64

In [9]:
df.groupby("pclass")[["survived", "fare"]].sum()

Unnamed: 0_level_0,survived,fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,200,28265.4043
2,119,5866.6374
3,181,9418.4452


In [11]:
df.groupby(["pclass", "sex"])["survived"].sum()

pclass  sex   
1       female    139
        male       61
2       female     94
        male       25
3       female    106
        male       75
Name: survived, dtype: int64

In [12]:
df.groupby(["pclass", "sex"], as_index=False)["survived"].sum()

Unnamed: 0,pclass,sex,survived
0,1,female,139
1,1,male,61
2,2,female,94
3,2,male,25
4,3,female,106
5,3,male,75


In [14]:
# Several aggregations at once: per-class total ("sum") and average ("mean")
# of the "survived" column, returned as one table.
df.groupby("pclass")["survived"].agg(["sum", "mean"])

Unnamed: 0_level_0,sum,mean
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,200,0.619195
2,119,0.429603
3,181,0.255289


In [15]:
# Bucket ages into three equal-width intervals spanning the observed age range
# and label them young / middle_age / old. Any row with a missing age stays NaN.
df["age_bins"] = pd.cut(df["age"], bins=3, labels=("young", "middle_age", "old"))

In [17]:
df.groupby(["age_bins"])["survived"].sum()

age_bins
young         199
middle_age    200
old            28
Name: survived, dtype: int64

How do I make my pandas DataFrame smaller and faster?

dataschool

In [19]:
# Load the drinks-by-country data set and preview its first rows.
# Fetched over HTTPS (like the other read_csv call in this notebook) so the
# download is not requested in clear text.
drinks = pd.read_csv("https://bit.ly/drinksbycountry")
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


### info() df method

In [20]:
# Reports "memory usage: 9.2+ KB". The "+" means the real figure may be larger:
# for object columns, info() does not inspect every cell by default.
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [21]:
# memory_usage="deep" makes info() inspect the object columns' contents too,
# so the reported figure (30.5 KB) is the actual usage.
drinks.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [24]:
drinks.memory_usage()

Index                            128
country                         1544
beer_servings                   1544
spirit_servings                 1544
wine_servings                   1544
total_litres_of_pure_alcohol    1544
continent                       1544
dtype: int64

In [25]:
drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [26]:
drinks.memory_usage(deep=True).sum()

31224

Object columns take up a lot of space.

How can we reduce the space? ---> Store the strings as integers.

In [27]:
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

## category data type

introduced in: pandas 0.15 

converting object column to **category type**

In [30]:
# Re-store the continent labels as a categorical column: each distinct label
# is kept once and every row holds a small integer code instead of a string.
drinks["continent"] = drinks["continent"].astype("category")

In [31]:
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [32]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [33]:
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [34]:
drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [35]:
# Small frame whose "quality" column has a natural order
# (good < very good < excellent) that plain strings do not capture.
df = pd.DataFrame(
    {
        "ID": [100, 101, 102, 103],
        "quality": ["good", "very good", "good", "excellent"],
    }
)

In [36]:
df

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [37]:
# "quality" is still a plain string column here, so the sort is alphabetical:
# excellent, good, very good.
df.sort_values("quality")

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


logical ordering

we'll define categorical data, and we're going to define **logical ordering of categories**

In [47]:
# Declare "quality" as an ordered categorical: good < very good < excellent.
# pd.Categorical takes the category list and the ordered flag directly.
# Legacy spelling, kept for reference only:
#   df.quality.astype("category", categories=[...], ordered=True)
# The original note dates the switch away from that form to pandas 0.25.
df["quality"] = pd.Categorical(df["quality"], categories=["good", "very good", "excellent"], ordered=True)

In [42]:
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [44]:
df.sort_values("quality")# now it sorts in logical order

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [46]:
# Keep only the rows rated strictly better than "good". The comparison follows
# the declared category order, not alphabetical order.
df[df["quality"] > "good"]

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent
