In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")
#To get a list of the data types stored in each column of our dataframe, we call the dtypes attribute (Section 1.2).

print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object


# Converting to String Objects

In [2]:
# convert the category sex column into a string dtype
tips['sex_str'] = tips['sex'].astype(str)
# Python has built-in str, float, int, complex, and bool types. However, you can also specify any dtype from the numpy library. If we look at the dtypes now, you will see the sex_str now has a dtype of object.

print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


# Converting to Numeric Values

In [3]:
# convert total_bill into a string
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


In [4]:
# convert it back to a float
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


# The .to_numeric() Method

In [7]:
import numpy as np
# subset the tips data
tips_sub_miss = tips.head(10).copy()
# assign some 'missing' values
tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = np.nan
print(tips_sub_miss)

   total_bill   tip     sex smoker  day    time  size sex_str
0       16.99  1.01  Female     No  Sun  Dinner     2  Female
1         NaN  1.66    Male     No  Sun  Dinner     3    Male
2       21.01  3.50    Male     No  Sun  Dinner     3    Male
3         NaN  3.31    Male     No  Sun  Dinner     2    Male
4       24.59  3.61  Female     No  Sun  Dinner     4  Female
5         NaN  4.71    Male     No  Sun  Dinner     4    Male
6        8.77  2.00    Male     No  Sun  Dinner     2    Male
7         NaN  3.12    Male     No  Sun  Dinner     4    Male
8       15.04  1.96    Male     No  Sun  Dinner     2    Male
9       14.78  3.23    Male     No  Sun  Dinner     2    Male


In [8]:
print(tips_sub_miss.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


In [10]:
tips_sub_miss["total_bill"] = pd.to_numeric(tips_sub_miss["total_bill"], errors="coerce"
)

print(tips_sub_miss)

   total_bill   tip     sex smoker  day    time  size sex_str
0       16.99  1.01  Female     No  Sun  Dinner     2  Female
1         NaN  1.66    Male     No  Sun  Dinner     3    Male
2       21.01  3.50    Male     No  Sun  Dinner     3    Male
3         NaN  3.31    Male     No  Sun  Dinner     2    Male
4       24.59  3.61  Female     No  Sun  Dinner     4  Female
5         NaN  4.71    Male     No  Sun  Dinner     4    Male
6        8.77  2.00    Male     No  Sun  Dinner     2    Male
7         NaN  3.12    Male     No  Sun  Dinner     4    Male
8       15.04  1.96    Male     No  Sun  Dinner     2    Male
9       14.78  3.23    Male     No  Sun  Dinner     2    Male


In [11]:
print(tips_sub_miss.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


In [12]:
tips_sub_miss["total_bill"]=pd.to_numeric(
    tips_sub_miss["total_bill"], errors="coerce"
)
print(tips_sub_miss)

   total_bill   tip     sex smoker  day    time  size sex_str
0       16.99  1.01  Female     No  Sun  Dinner     2  Female
1         NaN  1.66    Male     No  Sun  Dinner     3    Male
2       21.01  3.50    Male     No  Sun  Dinner     3    Male
3         NaN  3.31    Male     No  Sun  Dinner     2    Male
4       24.59  3.61  Female     No  Sun  Dinner     4  Female
5         NaN  4.71    Male     No  Sun  Dinner     4    Male
6        8.77  2.00    Male     No  Sun  Dinner     2    Male
7         NaN  3.12    Male     No  Sun  Dinner     4    Male
8       15.04  1.96    Male     No  Sun  Dinner     2    Male
9       14.78  3.23    Male     No  Sun  Dinner     2    Male


In [13]:
print(tips_sub_miss.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


# Categorical Data

# Convert to Category
# 
# To convert a column into a categorical type, we pass category into the .astype() method.

In [14]:
# convert the sex column into a string object first
tips['sex'] = tips['sex'].astype('str')
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    object  
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(3), float64(2), int64(1), object(2)
memory usage: 10.8+ KB
None


In [15]:
# convert the sex column back into categorical data
tips['sex'] = tips['sex'].astype('category')
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   sex_str     244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB
None


# Manipulating Categorical Data

In [16]:

import pandas as pd

# Create employee performance data
data = pd.DataFrame({
    "Employee": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
    "Performance": ["Good", "Excellent", "Average", "Poor", "Good", "Average"]
})

# Define an ordered categorical type
performance_order = ["Poor", "Average", "Good", "Excellent"]

# Convert 'Performance' column to an ordered categorical type
data["Performance"] = pd.Categorical(data["Performance"], categories=performance_order, ordered=True)

# Display the dataset
print("\n🔹 Employee Performance Data:\n", data)

# 🔹 Get category codes (numerical representation)
print("\n🔹 Category Codes:\n", data["Performance"].cat.codes)

# 🔹 Sort employees by performance
sorted_data = data.sort_values("Performance")
print("\n🔹 Employees Sorted by Performance:\n", sorted_data)

# 🔹 Compare performance levels
print("\n🔹 Is Alice better than David?", data.loc[0, "Performance"] > data.loc[3, "Performance"])  # Good > Poor
print("🔹 Is Charlie worse than Bob?", data.loc[2, "Performance"] < data.loc[1, "Performance"])  # Average < Excellent



🔹 Employee Performance Data:
   Employee Performance
0    Alice        Good
1      Bob   Excellent
2  Charlie     Average
3    David        Poor
4      Eve        Good
5    Frank     Average

🔹 Category Codes:
 0    2
1    3
2    1
3    0
4    2
5    1
dtype: int8

🔹 Employees Sorted by Performance:
   Employee Performance
3    David        Poor
2  Charlie     Average
5    Frank     Average
0    Alice        Good
4      Eve        Good
1      Bob   Excellent

🔹 Is Alice better than David? False
🔹 Is Charlie worse than Bob? True


![text](img_2.png)