# Introduction To Pandas

In [None]:
import pandas as pd

#### 1.**Main Data Types**

There are 2 main data types in pandas: `Series` and `DataFrame`.

In [None]:
# A Series is the 1-dimensional pandas type; here it is built from a plain list.
animal_kinds = ["cat", "bird", "tiger", "lion"]
series = pd.Series(animal_kinds)

In [None]:
# A second Series with one name per animal, in the same order as `series`.
animal_names = ["catto", "paggy", "tigrera", "simba"]
series2 = pd.Series(animal_names)

In [None]:
# A DataFrame is the 2-dimensional pandas type; here it is built from a dictionary
# that maps each column name to the Series holding that column's values.
frame_columns = {"Animal": series, "name": series2}
dataFrame = pd.DataFrame(frame_columns)

We can think of a DataFrame as a collection of Series: several Series placed side by side as columns make up a frame (table).

#### 2.**Importing csv files**

In [None]:
# Import data: read the CSV file into a DataFrame
car_sales = pd.read_csv("car-sales.csv")

read_csv converts a csv file to a data-frame

read_csv can also import directly from a URL.

**NOTE**: column axis = 1, row axis = 0 in data-frame 

#### 3.**Exporting data-frame**

In [None]:
# Export the data-frame to a CSV file.
# index=False leaves the row index out of the exported file.
car_sales.to_csv("exported-car-sales-data.csv", index=False)

#### 4.**Describing Data**

In [None]:
# Data type (dtype) of every column in the data-frame
car_sales.dtypes

In [None]:
# Column labels of the data-frame (a pandas Index object, not a plain list)
columns = car_sales.columns

In [None]:
# Row index of the data-frame (for the default index this shows start, stop and step)
index = car_sales.index

In [None]:
# Summary statistics for the numerical columns (mean, std, percentiles, etc.)
stats = car_sales.describe()

In [None]:
# General info about the data-frame (dtypes, index, memory usage, etc.).
# info() prints its summary and returns None, so there is no result worth assigning.
car_sales.info()

In [None]:
# Mean of one numeric column (Series) of the data-frame
car_sales.Doors.mean()

In [None]:
# Sum of every column in the data-frame, including the non-numeric ones
# NOTE(review): for text columns pandas joins the strings instead of adding numbers - confirm that is the intended demonstration
car_sales.sum()

In [None]:
# Sum of one numeric column (Series) of the data-frame
car_sales.Doors.sum()

We can perform various other statistical operations on our data in the same way.

In [None]:
# Length of the data-frame (number of rows)
len(car_sales)

#### 4.View and Select Data 

##### 4.1 View

In [None]:
# View the first few rows of the data-frame (default = 5); head also takes a number, e.g. head(8)
car_sales.head()

In [None]:
# View the last few rows of the data-frame (default = 5); tail also takes a number, e.g. tail(7)
car_sales.tail()

##### 4.2 Indexing data

In [None]:
# A Series whose index labels are given explicitly instead of the default 0..n-1.
animal_labels = [2, 4, 6, 8]
animals = pd.Series(["cat", "bird", "tiger", "lion"], index=animal_labels)
animals;

In [None]:
# Series index - manual: label the rows by hand.
car_sales = pd.read_csv("car-sales.csv")

# Work on a copy so relabelling the Series does not alter the column object held by `car_sales`.
doors = car_sales.Doors.copy()
# One label per row: the list length must equal len(doors), otherwise pandas raises a ValueError.
doors.index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
doors;

In [None]:
# Series index - auto: generate the labels 1..n with range and len.
car_sales = pd.read_csv("car-sales.csv")

# Work on a copy so relabelling the Series does not alter the column object held by `car_sales`.
doors = car_sales.Doors.copy()
# Size the range from the Series itself and assign it directly, so the
# module-level name `index` is not rebound to a range object.
doors.index = range(1, len(doors) + 1)
doors;

By default, a data-frame gets an index that starts from 0.

In [None]:
# Setting custom index to data-frame

In [None]:
# DataFrame custom index (method 1): pass an explicit pd.Index to set_index
car_sales = pd.read_csv("car-sales.csv")

# Labels 1..n, one per row of the frame
one_based_labels = pd.Index(range(1, len(car_sales) + 1))
car_sales = car_sales.set_index(one_based_labels)
car_sales;

In [None]:
# DataFrame custom index (method 2): assign the new labels straight to .index
car_sales = pd.read_csv("car-sales.csv")

# Labels 1..n, one per row of the frame
car_sales.index = range(1, len(car_sales)+1)
car_sales;

##### 4.3 LOC and ILOC

a. `loc` selects rows by index label, i.e. the exact index value attached to a row (this is the index we can change, as shown above).

In [None]:
# loc - series: selection by index label.
# The label 2 appears twice, so loc[2] returns every row that carries it.
repeated_labels = [2, 4, 2, 8]
animals = pd.Series(["cat", "bird", "tiger", "lion"], index=repeated_labels)
animals;
animals.loc[2];
animals.loc[8];

In [None]:
# loc - dataframe
# loc looks rows up by index label, so the result depends on how the index is set.
# The frame is freshly loaded here, so it has the default index starting from 0.
car_sales = pd.read_csv("car-sales.csv")

car_sales.loc[3];

b. `iloc` selects rows by integer position (0, 1, 2, ...), which matches the default index and stays the same even if we set a custom index.

In [None]:
# iloc - series: selection by position, whatever the index labels are.
animals = pd.Series(data=["cat", "bird", "tiger", "lion"], index=[2, 4, 2, 8])

# Position 0 is the first row, even though its label is 2.
animals.iloc[0];

In [None]:
# iloc - dataframe
# iloc selects by position (starting from 0), so the custom labels set below do not affect it.
car_sales = pd.read_csv("car-sales.csv")
car_sales.index = range(2, len(car_sales)+2)

car_sales.iloc[0];

##### 4.4 Slicing using iloc

In [None]:
# Slicing - iloc - series
animals = pd.Series(data=["cat", "bird", "tiger", "lion"], index=[2, 4, 2, 8])

# Positions 0 up to 3 (not including 3) - ( 0 <= position < 3 )
first_three = slice(None, 3)
animals.iloc[first_three];

In [None]:
# Slicing - iloc - dataframe
car_sales = pd.read_csv("car-sales.csv")

# Rows at positions 1 up to 3 (not including 3) - ( 1 <= position < 3 )
car_sales.iloc[1:3];

##### 4.5 Selecting columns 

In [None]:
# Method 1: bracket notation - works for any column name
car_sales["Make"];

In [None]:
# Method 2: dot (attribute) notation
car_sales.Make

# Dot notation won't work if the column name contains a space, e.g. "Odometer (KM)" - use method 1

##### 4.6 Querying

In [None]:
# Equality filter: build a boolean mask, then keep the rows where it is True.
is_toyota = car_sales.Make == "Toyota"
car_sales[is_toyota];

In [None]:
# Greater-than / less-than filters use a boolean mask in the same way.
has_four_or_more_doors = car_sales.Doors >= 4

car_sales[has_four_or_more_doors];

##### 4.7 Comparing

In [None]:
# Comparing 2 columns - crosstab tabulates the Make values against the Colour values
pd.crosstab( car_sales.Make,  car_sales.Colour);

In [None]:
# Comparing more than 2 columns - groupby

# Reload the dataset so this cell starts from the raw CSV.
car_sales = pd.read_csv("car-sales.csv")

# Split the rows by 'Make', then average only the numeric columns of each group.
grouped_by_make = car_sales.groupby("Make")
mean_sales_by_make = grouped_by_make.mean(numeric_only=True)

# The trailing ';' suppresses the cell output.
mean_sales_by_make;

##### 4.8 Plotting

In [None]:
# Plot the "Odometer (KM)" column with the default pandas plot; ';' hides the returned object's repr
car_sales["Odometer (KM)"].plot();

In [None]:
# Histogram of the "Odometer (KM)" column; ';' hides the returned object's repr
car_sales["Odometer (KM)"].hist();

#### 5.Cleaning and converting dtypes

In [None]:
# Reload the raw data so the cleaning steps below start from the original 'Price' values.
car_sales = pd.read_csv("car-sales.csv")

# dtype of the Price column/Series before cleaning
car_sales.Price.dtypes;

In [None]:
# Step 1 (cleaning): strip the non-numeric characters (dollar sign, commas) from 'Price'.
currency_chars = r'[\$\,]'
price_text = car_sales['Price'].str
car_sales['Price'] = price_text.replace(currency_chars, '', regex=True)
car_sales.Price;

The `r` before a string indicates a raw string literal, which tells Python to treat backslashes as literal characters, making regular expressions easier to read.


In [None]:
# Step 2 (converting): parse the cleaned 'Price' strings into floating-point numbers.
# errors='coerce' turns values that cannot be parsed into NaN; downcast='float' asks for the smallest float dtype.
car_sales['Price'] = pd.to_numeric(car_sales['Price'], errors='coerce', downcast='float')
car_sales.Price;

- **`errors='coerce'`**: Any value that cannot be converted to a number (such as a string that doesn't represent a valid number) will be replaced with `NaN` (Not a Number).
- **`errors='raise'`**: This is the default behavior. It raises an error if any values can't be converted.
- **`errors='ignore'`**: It leaves the problematic values as they are (i.e., if the input can't be converted, it is returned in its original form). This option is deprecated since pandas 2.2; catch the exception explicitly instead.


The `downcast` argument in `pd.to_numeric()` is used to reduce the memory usage by converting the data to a smaller numeric type. You can specify:
- **`'integer'`**: Downcast to the smallest integer type.
- **`'float'`**: Downcast to the smallest float type.
- **`None`** (default): No downcasting.

Example:
```python
data_converted = pd.to_numeric(data, downcast='integer')
```


In [None]:
# Optional (converting): turn the float prices into integers
# NOTE(review): astype(int) raises if the column holds NaN (possible after errors='coerce') - confirm the data has none
car_sales['Price'] = car_sales['Price'].astype(int)
car_sales.Price;

#### 6.Data Manipulation

In [None]:
# Import the data set that contains missing values
# NOTE(review): this path is under "data/" while the other cells read "car-sales.csv" from the working directory - confirm the location
car_sales_missing = pd.read_csv("data/car-sales-missing-data.csv")

In [None]:
# Index review (optional): relabel the rows 1..n instead of the default 0..n-1.
# The range is sized from `car_sales_missing` itself; sizing it from a different
# frame raises a length-mismatch ValueError whenever the row counts differ.
car_sales_missing.index = range(1, len(car_sales_missing) + 1)

#### 6.0 Pandas and `NaN` for Missing Values

In Pandas, missing values are represented by `NaN` (Not a Number). It is used to indicate undefined or missing data in DataFrames or Series.

- `NaN` is not equal to any value, including itself (`NaN != NaN`).
- Functions like `isna()`, `fillna()`, and `dropna()` help handle missing values.

#### Example:
```python
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': [1, 2, np.nan, 4]})
df.isna()  # Check for NaN values
```


#### 6.1 Reassigning Modified Values in Pandas

There are two ways to reassign modified values to a DataFrame:

1. **Using the `=` operator**: Directly assigns the modified DataFrame or Series.
   ```python
   df['A'] = df['A'] * 2
   ```

2. **Using `inplace=True` in pandas (if supported)**

    The `inplace=True` parameter modifies a DataFrame directly without returning a new one.

    Example: Dropping Missing Values

    ```python
    import pandas as pd
    
    # Sample DataFrame
    df = pd.DataFrame({'A': [1, 2, None, 4], 'B': [None, 2, 3, 4]})
    
    # Drop rows with missing values in place
    df.dropna(inplace=True)
    
    print(df)
    ```


#### 6.2 Dealing with NaN values

a. Replacing NaN values

In [None]:
# Method 1: fill the NaN with the mean value (not a good approach)
# First look at the "Odometer" column that is filled in the next cell.
car_sales_missing["Odometer"];

In [None]:
# Calculate the column mean and use it to fill the missing odometer readings.
# The filled column is assigned back to the frame. Calling
# fillna(..., inplace=True) on the selected column is chained assignment: it
# may act on a temporary copy and leave `car_sales_missing` unchanged.
mean = car_sales_missing["Odometer"].mean()
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(mean)

b. Dropping NaN values

In [None]:
# Method 2: dropping the rows that contain NaN values.
# dropna() is not in-place by default: it returns a new frame and leaves
# `car_sales_missing` untouched. The result is only previewed here, not stored.
car_sales_missing.dropna();

#### 6.3 String manipulation

In [None]:
# Lower-case every value of the Make column (returns a new Series; the frame is not changed)
car_sales_missing.Make.str.lower();

In [None]:
# Upper-case every value of the Make column (returns a new Series; the frame is not changed)
car_sales_missing.Make.str.upper();

#### 6.4 creating columns

#### Length Mismatch in Pandas When using Series

When assigning a new column in a DataFrame, the behavior differs based on whether you use a `Series` or a `list`:

- **Using a `Series`**:  
  The `Series` is aligned to the DataFrame by index label, so its length does not have to match: rows whose label is missing from the `Series` are filled with `NaN`.
  ```python
  import pandas as pd
  df = pd.DataFrame({"A": [1, 2, 3]})
  series = pd.Series([10, 20])  # Length is shorter
  df["B"] = series  # Remaining rows are filled with NaN
  print(df)
  ```


#### Length Mismatch When Using a List in Pandas

When assigning a new column in a Pandas DataFrame using a `list`, the length of the list **must match** the length of the DataFrame's index. If it doesn't, Pandas raises a `ValueError`.

#### Example:
```python
import pandas as pd
df = pd.DataFrame({"A": [1, 2, 3]})

# Incorrect: List length does not match DataFrame index
df["B"] = [10, 20]  # Raises ValueError
```


In [None]:
# 1. Column from a Series (five values on the default index 0-4)
seat_counts = [5, 4, 3, 4, 2]
seats_column = pd.Series(seat_counts)

In [None]:
# Reload the raw data so the new columns are added to a fresh frame.
car_sales = pd.read_csv("car-sales.csv")

# Add the Series to the data-frame as a new column.
# Rows without a matching index label in `seats_column` get NaN.
car_sales['Seats'] = seats_column

#### Attribute Access Warning in Pandas

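Pandas cannot create a new column with attribute-style access: `df.column_name = value` only sets a plain Python attribute on the object, and pandas emits a `UserWarning`. Attribute access works only for reading or overwriting an existing column whose name is a valid Python identifier. To create a column, use bracket notation: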

##### Correct Syntax:
```python
df['column_name'] = value
```


In [None]:
# View the data-frame with the new column (the trailing ';' suppresses the cell output)
car_sales;

In [None]:
# Fill the NaN values of the "Seats" column with 4.
# Passing a {column: value} dict to DataFrame.fillna fills only the named column.
car_sales.fillna({"Seats": 4}, inplace=True)

# The column-level form `car_sales["Seats"].fillna(4, inplace=True)` is avoided here:
# it is chained assignment and no longer applies in pandas 3.0.

car_sales;

#### Chained Assignment Warning in Pandas

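Calling a method with `inplace=True` on a selected column (e.g. `df["column"].fillna(value, inplace=True)`) is chained assignment: the selected column may be an intermediate copy, so the original DataFrame is not guaranteed to be updated. Pandas 2.x emits a warning for this pattern, and with Copy-on-Write in pandas 3.0 it never updates the original DataFrame.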

#### Correct Approaches:
1. **Reassign the result back to the column**:
   ```python
   df["column"] = df["column"].fillna(value)
   ```

   or

   ```python
   car_sales.fillna({"Seats": 4}, inplace=True)
   ```



In [None]:
# 2. Column from a list
# A list is not aligned by index: it needs exactly one value per row of the frame it is assigned to.
fuel_econmy = [6.4, 3, 5.6, 5.5,3,43,56,7,8,9]

In [None]:
# Add the list as a new column; pandas raises ValueError if its length differs from the number of rows
car_sales["Fuel per 100km"] = fuel_econmy

In [None]:
# Inspect the frame after adding "Fuel per 100km" (the trailing ';' suppresses the cell output)
car_sales;

In [None]:
# 3. Creating a column from other columns:
# distance in units of 100 km multiplied by the fuel used per 100 km.
odometer_hundreds_km = car_sales["Odometer (KM)"] / 100
car_sales["total fuel used"] = odometer_hundreds_km * car_sales["Fuel per 100km"]

In [None]:
# Inspect the frame after adding "total fuel used" (the trailing ';' suppresses the cell output)
car_sales;

In [None]:
# 4. Creating a column from a single value: the value is repeated for every row
car_sales["Total wheels"] = 4

In [None]:
# Inspect the frame after adding "Total wheels" (the trailing ';' suppresses the cell output)
car_sales;

In [None]:
# A single boolean value works the same way: every row gets True
car_sales["passed safety"] = True

In [284]:
# Inspect the frame after adding "passed safety" (the trailing ';' suppresses the cell output)
car_sales;

In [None]:
# Check the dtypes of all columns, including the ones just created (output suppressed by ';')
car_sales.dtypes;

#### 6.5 Removing a column

In [285]:
# Remove the "passed safety" column; drop returns a new frame, so the result is assigned back.
car_sales = car_sales.drop(columns="passed safety")

In [288]:
# Inspect the frame after dropping the column (the trailing ';' suppresses the cell output)
car_sales;

#### 6.6 Randomizer

In [306]:
# Reload the raw data so the shuffle below starts from the original row order
car_sales= pd.read_csv("car-sales.csv")

In [307]:
# Shuffle all rows: frac=1 samples 100% of the frame in random order.
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook produces the same row order.
car_sales_shuffled = car_sales.sample(frac=1, random_state=42)
car_sales_shuffled;

#### `frac` Parameter in `sample`

- **Purpose**: Specifies the fraction of the DataFrame to sample.
- **Value**: A float, normally between `0` and `1` (e.g., `frac=0.5` samples 50% of rows); values above `1` require `replace=True`.
- **Example**:
  ```python
  car_sales.sample(frac=1)  # Randomly shuffles all rows (100% of the DataFrame)
  ```


#### Sampling a Subset of Data with `.sample`

When working with large datasets, processing all rows can be inefficient. Use `.sample()` with the `frac` parameter to work on a smaller subset.

- **Purpose**: Select a fraction of the DataFrame for efficient testing or analysis.
- **Example**:
  ```python
  # Work on 1% of a 2 million-row dataset
  sample_data = df.sample(frac=0.01)
  ```


#### 6.7 Reset 

In [310]:
# Reset the index of the shuffled frame back to the default 0..n-1 labels.
# drop=True discards the old (shuffled) labels instead of keeping them as a new column.
car_sales_shuffled.reset_index(drop=True, inplace=True)

By default `drop` is `False`, so the old (shuffled) index is not dropped but kept as a new column; set `drop=True` to discard it.

In [312]:
# Inspect the shuffled frame after the index reset (the trailing ';' suppresses the cell output)
car_sales_shuffled;

#### 6.8 Lambda

In [313]:
# Convert the odometer readings from kilometres to miles by applying a lambda to every value.
# Each value is a distance in km, divided by 1.6 (the km-per-mile factor used here).
car_sales["miles"] = car_sales["Odometer (KM)"].apply(lambda km: km/1.6)

In [315]:
# Inspect the frame after adding "miles" (the trailing ';' suppresses the cell output)
car_sales;

# The End