## Pokemon Dataset - Filters & Searches - Task

Work through the following program   
Try to understand what is happening in each cell and make extra comments for yourself if necessary

In [None]:
#Import Pandas - any other libraries that you need are usually placed at the top
import pandas as pd
import numpy as np

#Load pokemon_data.csv into a DataFrame and bind it to the name df
#(df is the conventional name because Pandas works with DataFrames - any variable name is allowed)
df = pd.read_csv(filepath_or_buffer='pokemon_data.csv')

#A bare name on the last line of a cell makes the notebook display it
df

In [None]:
#display the first 3 lines of the DataFrame
df.head(3)

In [None]:
#display the last 5 lines of the DataFrame
df.tail(5)

In [None]:
#display the Dataframe headers (column headings)
df.columns

In [None]:
#examine the datatypes
df.dtypes

In [None]:
#check for null values in the DataFrame
df.isnull().sum()

In [None]:
#print the number of unique values in the Type 1 column
print(df['Type 1'].nunique())

#list the unique values
df['Type 1'].unique()

In [None]:
#print the number of unique values in the Type 2 column  
print(df['Type 2'].nunique())

#list the unique values
df['Type 2'].unique()

## Displaying the Content of Columns and Rows

In [None]:
#Display the contents of the HP column in the DataFrame - one way of doing this has been used. Provide another.
df['HP']

#Or
#df.loc[:, 'HP']

In [None]:
#Display the content of 3 columns - Name, Type 1, HP
df[['Name', 'Type 1', 'HP']]

In [None]:
#Display the content of specific rows - iloc means Index Location - this code displays rows 0 to 3 
df.iloc[0:4]

In [None]:
#Read a specific cell location (Row,Column) - observe how the code here is different from that in the cell above
df.iloc[2,1]

In [None]:
#What will this code display?
df.iloc[0,4]

In [None]:
#Based on previous examples in other tasks (see Social Network), 
  #for each row, print the Index, and the value in the Name column 
for index, row in df.iterrows():
    print(index, row['Name'])

In [None]:
#This code will also print the Index and the value in the Name column

#Iterate through the rows, printing out the contents of a specified column
print(df[['Name']])

## Sorting Data

In [None]:
#Sort the Name column in ascending order - do not save the changes
df.sort_values('Name', ascending=True)

In [None]:
#Sort the Type 1 column and HP column in ascending and descending order respectively - do not save the changes
df.sort_values(['Type 1', 'HP'], ascending=[True, False])

## Adding a New Total Column & Performing a Simple Addition Calculation

In [None]:
#Here you are creating a Total column, adding the values from several specified columns & storing the result in Total
#This line of code will directly modify the dataframe and does not need to be saved
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

df.head(5)

In [None]:
#This is just a sum to check that the Total is correct - it is easy to make mistakes with formula so it's important to check
#that you get the answer that you expect

45+49+49+65+65+45 

In [None]:
#Delete the Total the column you just created and save the change
df = df.drop(columns=['Total'])

#Display the first 5 records in the DataFrame
df.head()

In [None]:
#This is a different way of doing the calculation above 
df['Total'] = df.iloc[:, 4:10].sum(axis=1)

#Display the top of the DataFrame
df.head(5)

In [None]:
#Based on an example from a previous task
  #Rearrange the columns so that the Total column appears immediately after the Type 2 column

# Start from every column name except 'Total', so it cannot end up in the list twice
columns = [name for name in df.columns if name != 'Total']

# 'Total' belongs one position to the right of 'Type 2'
index = columns.index('Type 2') + 1

# Splice 'Total' into the list at that position
columns[index:index] = ['Total']

# Re-select the columns of df in the new order
df = df[columns]

# Display the DataFrame
df.head(5)


In [None]:
#The problem with the technique above is that when you have a lot of columns you have a lot of typing to do.
    ##Some datasets can easily contain 30 or more columns

#List slicing does the same job in two lines.

#Total is taken out of the list first and then put back after the first four columns.
#This means the cell gives the same result whether Total is currently the last column
#or has already been moved by the cell above, and it can safely be run more than once.
cols = [col for col in df.columns if col != 'Total']
df = df[cols[0:4] + ['Total'] + cols[4:]]

#cols[0:4]: Selects the first four columns (this assumes Type 2 is the fourth column).
#['Total']: Places the Total column immediately after them.
#cols[4:]: Adds every remaining column, in its existing order.

#Display the top of the re-ordered DataFrame
df.head(5)

In [None]:
df.head()

In [None]:
#Using the list and index method demonstrated above and order the columns as follows:
    
#Comment out the code in the cell above and run the program from the top

#The lines of code below rearrange the order of your columns 
cols = list(df.columns)

# Reorder columns based on the specified indices
dfBad = df[cols[:5] + [cols[10]] + [cols[11]] + cols[8:10] + cols[6:8] + [cols[5]] + [cols[12]]]

#cols[:5] selects '#', 'Name', 'Type 1', 'Type 2', and 'Total'.
#[cols[10]] selects 'Speed'.
#[cols[11]] selects 'Generation'.
#cols[8:10] selects 'Sp. Atk' and 'Sp. Def'.
#cols[6:8] selects 'Attack' and 'Defense'.
#[cols[5]] selects 'HP'.
#[cols[12]] selects 'Legendary'

#Use code to put your columns in the required order 


#Display the top of the re-ordered DataFrame
dfBad.head()


## Filtering Data

**Using str.contains for string values**

In [None]:
#Display all rows where the Type 1 column contains the word Grass
df.loc[df['Type 1'].str.contains('Grass')]

In [None]:
#Search for a specific piece of text in a column - 

#This code will search for the text Mega is the Name column
df.loc[df['Name'].str.contains('Mega')]

In [None]:
#This code will display all values in the Name column that DO NOT contain the word Mega the ~ symbol is used rather than ! symbol
df.loc[~df['Name'].str.contains('Mega')]

In [None]:
#To pass regular-expression flags (such as re.I, which ignores case) you will have to import the Regular Expressions module, re
 #Note: ordinarily - all new libraries should be kept at the top of the notebook
import re

In [None]:
#Check if the Type 1 column contains Fire or Grass
df.loc[df['Type 1'].str.contains('Fire|Grass', regex=True)]

In [None]:
#This code will NOT work as Python is case sensitive
df.loc[df['Type 1'].str.contains('fire|grass', regex=True)]

In [None]:
#Python is case sensitive, so here is a useful flag that means - ignore case
df.loc[df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)]

In [None]:
#using str.contains - Write a statement to:
  #check if the Type 1 column contains fire or grass OR the Type 2 column contains poison or ground
#flags=re.I ignores case, so 'poison' also matches 'Poison'
#na=False treats a missing Type 2 value as 'no match' instead of leaving a null in the filter
df.loc[
    df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)
    | df['Type 2'].str.contains('poison|ground', flags=re.I, regex=True, na=False)
]

In [None]:
#using str.contains - Write a statement to:
  #check if the Type 1 column contains fire or grass AND the Type 2 column contains poison or ground
#flags=re.I ignores case, so 'poison' also matches 'Poison'
#na=False treats a missing Type 2 value as 'no match' instead of leaving a null in the filter
df.loc[
    df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)
    & df['Type 2'].str.contains('poison|ground', flags=re.I, regex=True, na=False)
]


**Using Comparison Operators for text or numeric values**

In [None]:
#This code will filter all rows where the Type 1 column contains the word Grass
df.loc[df['Type 1'] == "Grass"]

In [None]:
#Filtering Data using Several Criteria
#We are not saving the results here but you can overwrite your exisiting dataframe or save to a new df
#It would just depend on your requirements

#This code will filter all records where Grass is in the Type 1 column AND Poison is in the Type 2 column 
  #AND the values in HP are greater than 70

df.loc[(df['Type 1'] == 'Grass') & (df['Type 2'] == 'Poison') & (df['HP'] > 70)]

In [None]:
#using comparison operators  - Write a statement to:
  #check if the Type 1 column contains Fire or Grass AND the Type 2 column contains poison or ground AND HP > 70
#The two text checks ignore case (flags=re.I); na=False treats a missing Type 2 value as 'no match'
#The HP check uses the > comparison operator
df.loc[
    df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)
    & df['Type 2'].str.contains('poison|ground', flags=re.I, regex=True, na=False)
    & (df['HP'] > 70)
]


## Conditional Changes

In [None]:
#Wherever the Type 1 column holds Fire, overwrite it with the text Flamer
#.loc[rows, column] picks the cells to change; .eq is the method form of ==
df.loc[df['Type 1'].eq('Fire'), 'Type 1'] = 'Flamer'

#Display the DataFrame
df

In [None]:
#If the Total is greater than 500, then modify the contents of 2 other columns
#Generation is set to 9 and Legendary is set to the Boolean True.
#True is written without quotes: the string 'True' would put text into the Legendary column,
#and those rows would not be recognised when Legendary is later mapped from True/False to 1/0.
df.loc[df['Total'] > 500, ['Generation', 'Legendary']] = [9, True]

#Display the DataFrame
df

In [None]:
#If Poison appears in the Type 2 column - replace it with the text Poisonous
#Both the row filter and the column being written refer to Type 2
df.loc[df['Type 2'] == 'Poison', 'Type 2'] = 'Poisonous'

#Display the DataFrame
df


In [None]:
#Wherever Type 1 is Psychic - set HP to 90 and Defense to 70
#The list of values lines up with the list of column names, in order
df.loc[df['Type 1'].eq('Psychic'), ['HP', 'Defense']] = [90, 70]

#Display only the Psychic rows to check the change
df.loc[df['Type 1'].eq('Psychic')]

## Aggregate Statistics (Groupby)


In [None]:
#Group together the rows that share the same value in the Type 1 column
#and find the mean of each numeric column within every group
#numeric_only=True leaves text columns out of the calculation
#The means are rounded to whole numbers and stored as integers
#(df.describe() is another way to get summary statistics for the whole DataFrame)
df_grouped = df.groupby('Type 1').mean(numeric_only=True).round().astype(int)
df_grouped


In [None]:
#This code will sort your results in descending order by a specified criteria
#numeric_only=True is required: the DataFrame still contains text columns,
#and recent versions of Pandas raise an error when asked for the mean of text
df.groupby(['Type 1']).mean(numeric_only=True).sort_values('Defense', ascending=False)

#In the code here - mean can be replaced by sum or count - experiment if you wish
#(count does not accept numeric_only, so remove that argument if you switch to count)

In [None]:
#Create a new column called 'count' with a value of 1 in each cell
df['count'] = 1
df

In [None]:
# Adjust pandas display options
pd.set_option('display.max_rows', 20)  # This line will allow displaying all rows

# Count the occurrences and rename a column for
df.groupby(['Type 1', 'Type 2']).count()['count']

## Handle Null Values and Object Data

**Take appropriate steps to handle the null values**

In [None]:
#Check for null values again
df.isnull().sum()

In [None]:
#Deal with Null values in the Type 2 columns - there are so many. What should we do?
#We can remove the second type field however it is critical to pokemon as secondary typings are important to the game.
df['Type 2'].fillna('None', inplace=True)

In [None]:
df

**How should we handle the Name column? What things do we need to consider?**

In [None]:
#Name column - describe the DataFrame to View the Name column
#df.describe(include="O")

#Take appropriate action
# Summary statistics for the 'Name' column
name_description = df.loc[:, 'Name'].describe(include="O")
print(name_description)

# How many entries in the 'Name' column are missing?
missing_names = df['Name'].isna().sum()
print("Number of missing values in 'Name' column:", missing_names)

# Number of characters in each name, then the longest and the shortest
name_lengths = df['Name'].str.len()
print("Maximum length of name:", name_lengths.max())
print("Minimum length of name:", name_lengths.min())

# How many names repeat a name that appeared in an earlier row?
duplicate_names = df['Name'].duplicated().sum()
print("Number of duplicate names:", duplicate_names)



**Take appropriate steps to make the Type 1 column Numeric**

In [None]:
#Check the datatypes again
df.dtypes

In [None]:
#Make Type 1 column numeric - use get dummies
# Make 'Type 1' column numeric using one-hot encoding
df = pd.get_dummies(df, columns=['Type 1'], prefix='Type1')

# Check the datatypes after one-hot encoding
print(df.dtypes)


In [None]:
df.dtypes

In [None]:
#Delete the Type 1 column
# Delete the original 'Type 1' column if needed
del df['Type 1']

In [None]:
#Check the datatypes in the DataFrame
# Display the column names in the DataFrame
df

**Take appropriate steps to change the Bool columns to Numeric**

In [None]:
#Use Mapping to change the values in the Legendary column to 1 (True) and 0 (False)
#The text values 'True' / 'False' are mapped as well, so rows where Legendary was stored
#as a string rather than a Boolean do not turn into NaN
df['Legendary'] = df['Legendary'].map({True: 1, False: 0, 'True': 1, 'False': 0})

In [None]:
#Check that all columns are now numeric
pd.set_option('display.max_rows', 40)
df.dtypes

In [None]:
# Get the boolean columns
bool_columns = df.select_dtypes(include=bool).columns

# Convert boolean columns to numeric (0 for False, 1 for True)
df[bool_columns] = df[bool_columns].astype(int)

# Check that all columns are now numeric
print(df.dtypes)


In [None]:
df

In [None]:
# Initialize LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Encode 'Name' and 'Type 2' columns
df['Name'] = label_encoder.fit_transform(df['Name'])
df['Type 2'] = label_encoder.fit_transform(df['Type 2'])

In [None]:
df

In [None]:
df.dtypes